In [1]:
import pandas as pd
import os
import re

# Define the directory containing the dataset
data_dir = "/Users/hannojacobs/Documents/Code/Transformers-Explored/Datasets/eng_afr/"

# Define file paths - Adjust these if your filenames differ
eng_file_path = os.path.join(data_dir, "eng.txt")
afr_file_path = os.path.join(data_dir, "afr.txt")

# --- Read the data ---
try:
    # Read English lines, stripping leading/trailing whitespace
    with open(eng_file_path, "r", encoding="utf-8") as f_eng:
        eng_lines = [line.strip() for line in f_eng]
    print(f"Read {len(eng_lines)} lines from {eng_file_path}")

    # Read Afrikaans lines, stripping leading/trailing whitespace
    with open(afr_file_path, "r", encoding="utf-8") as f_afr:
        afr_lines = [line.strip() for line in f_afr]
    print(f"Read {len(afr_lines)} lines from {afr_file_path}")

    # --- Data Validation ---
    if len(eng_lines) != len(afr_lines):
        raise ValueError(
            f"Line count mismatch: English ({len(eng_lines)}) vs Afrikaans ({len(afr_lines)})"
        )
    if len(eng_lines) == 0:
        raise ValueError("Files are empty or could not be read properly.")

    # --- Create DataFrame ---
    df = pd.DataFrame(
        {
            "src": eng_lines,  # Source language (English)
            "target": afr_lines,  # Target language (Afrikaans)
        }
    )

    # --- Display Info ---
    print("\nDataFrame created successfully:")
    print(df.head())
    print(f"\nTotal rows: {len(df)}")
    # print(df.info()) # Uncomment for more detailed info

except FileNotFoundError as e:
    print(f"Error: File not found. {e}")
    print(f"Please ensure the files exist at the specified paths:")
    print(f"- English: {os.path.abspath(eng_file_path)}")
    print(f"- Afrikaans: {os.path.abspath(afr_file_path)}")
except ValueError as e:
    print(f"Error: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Read 1367869 lines from /Users/hannojacobs/Documents/Code/Transformers-Explored/Datasets/eng_afr/eng.txt
Read 1367869 lines from /Users/hannojacobs/Documents/Code/Transformers-Explored/Datasets/eng_afr/afr.txt

DataFrame created successfully:
                                                 src  \
0  For the report you will be working in groups o...   
1  Sallies shareholders are referred to the annou...   
2  Discuss extended incontinence under the follow...   
3  They do , however , want to know if the primar...   
4  Glyptic art reached a peak in the Middle Assyr...   

                                              target  
0  Vir u referaat werk u in groepe saam oor 'n on...  
1  Sallies-aandeelhouers word verwys na die aanko...  
2  Bespreek langdurige inkontinensie onder die ho...  
3  Hulle stel ook belang of die primêre doelwit v...  
4  Gliptiese kuns het 'n hoogtepunt bereik tydens...  

Total rows: 1367869


In [2]:
def clean_text(text):
    # Lowercase and remove all non-a-z and non-space characters
    first = re.sub(r"[^a-z ]", "", text.lower())
    # remove double (or more) spaces
    second = re.sub(r"\s+", " ", first).strip()
    return second


df["src"] = df["src"].apply(clean_text)
df["target"] = df["target"].apply(clean_text)

print("\nAfter cleaning:")
print(f"\nTotal rows: {len(df)}")
print(df.head())


After cleaning:

Total rows: 1367869
                                                 src  \
0  for the report you will be working in groups o...   
1  sallies shareholders are referred to the annou...   
2  discuss extended incontinence under the follow...   
3  they do however want to know if the primary ob...   
4  glyptic art reached a peak in the middle assyr...   

                                              target  
0  vir u referaat werk u in groepe saam oor n ond...  
1  salliesaandeelhouers word verwys na die aankon...  
2  bespreek langdurige inkontinensie onder die ho...  
3  hulle stel ook belang of die primre doelwit va...  
4  gliptiese kuns het n hoogtepunt bereik tydens ...  


In [3]:
def count_words(text):
    return len(text.split())


NUM_WORDS_LIMIT = 10
df = df[
    (df["src"].apply(count_words) <= NUM_WORDS_LIMIT)
    & (df["target"].apply(count_words) <= NUM_WORDS_LIMIT)
].reset_index(drop=True)

print("\nAfter word limit drop cleaning:")
print(f"\nTotal rows: {len(df)}")
print(df.head())


After word limit drop cleaning:

Total rows: 526648
                                                 src  \
0  discuss extended incontinence under the follow...   
1  glyptic art reached a peak in the middle assyr...   
2                         will be discussed in class   
3                               proceedings national   
4                             occurrence of diseases   

                                              target  
0  bespreek langdurige inkontinensie onder die ho...  
1  gliptiese kuns het n hoogtepunt bereik tydens ...  
2                      sal in die klas bespreek word  
3                              verrigtinge nasionaal  
4                               voorkoms van siektes  


In [4]:
# --- Save to CSV ---
output_csv_path = os.path.join(data_dir, "eng_afr_full_rows.csv")
df.to_csv(output_csv_path, index=False, encoding="utf-8")
print(f"\nDataFrame saved to {output_csv_path}")


DataFrame saved to /Users/hannojacobs/Documents/Code/Transformers-Explored/Datasets/eng_afr/eng_afr_full.csv


In [5]:
# save short versions as well
for HOW_MANY_TO_KEEP in [100, 1000, 10000, 100000]:
    short_df = df.head(HOW_MANY_TO_KEEP)
    print(f"Number of rows: {len(short_df)}")
    print(f"Columns in the dataframe: {short_df.columns}")

    output_csv_path = os.path.join(data_dir, f"eng_afr_{HOW_MANY_TO_KEEP}_rows.csv")
    short_df.to_csv(output_csv_path, index=False, encoding="utf-8")
    print(f"Saved {HOW_MANY_TO_KEEP} rows to {output_csv_path}")

Number of rows: 100
Columns in the dataframe: Index(['src', 'target'], dtype='object')
Saved 100 rows to /Users/hannojacobs/Documents/Code/Transformers-Explored/Datasets/eng_afr/eng_afr_100_rows.csv
Number of rows: 1000
Columns in the dataframe: Index(['src', 'target'], dtype='object')
Saved 1000 rows to /Users/hannojacobs/Documents/Code/Transformers-Explored/Datasets/eng_afr/eng_afr_1000_rows.csv
Number of rows: 10000
Columns in the dataframe: Index(['src', 'target'], dtype='object')
Saved 10000 rows to /Users/hannojacobs/Documents/Code/Transformers-Explored/Datasets/eng_afr/eng_afr_10000_rows.csv
Number of rows: 100000
Columns in the dataframe: Index(['src', 'target'], dtype='object')
Saved 100000 rows to /Users/hannojacobs/Documents/Code/Transformers-Explored/Datasets/eng_afr/eng_afr_100000_rows.csv
