In [1]:
def clean_text_file(input_path, output_path):
    """
    Processes a text file to remove everything after the second tab on each line.
    
    Args:
    input_path (str): The path to the input file.
    output_path (str): The path to the output file where the cleaned lines will be saved.
    """
    with open(input_path, 'r') as infile, open(output_path, 'w') as outfile:
        for line in infile:
            parts = line.split('\t')
            if len(parts) > 1:
                outfile.write(parts[0] + '\t' + parts[1] + '\n')
            else:
                outfile.write(line)

In [None]:
clean_text_file('./data/fin.txt', './data/eng-fin.txt')

In [None]:
import os

def clean_directory(input_dir, output_dir):
    """
    Processes all .txt files in the input directory, removes everything after the second tab on each line,
    and saves the results in the output directory. It skips files that already have an output.
    
    Args:
    input_dir (str): The directory containing the input .txt files.
    output_dir (str): The directory to save the output .txt files.
    """
    # Make sure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # List all files in the input directory
    for filename in os.listdir(input_dir):
        if filename.endswith('.txt'):
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, f"eng-{filename}")
            
            # Skip processing if output file already exists
            if os.path.exists(output_path):
                continue
            
            # Process the file if the output does not exist
            with open(input_path, 'r') as infile, open(output_path, 'w') as outfile:
                for line in infile:
                    parts = line.split('\t')
                    if len(parts) > 1:
                        outfile.write(parts[0] + '\t' + parts[1] + '\n')
                    else:
                        outfile.write(line)

In [None]:
clean_directory('./raw/', './data/')

In [1]:
import pandas as pd
def csv_to_sentence_pairs_pandas_kin(input_csv_path, output_txt_path):

    df = pd.read_csv(input_csv_path, on_bad_lines='skip')
    
    # Ensure the DataFrame has at least two columns
    if df.shape[1] < 3:
        raise ValueError("CSV does not have enough columns.")

    # Select only the second (Kinyarwanda) and third (English) columns
    df = df.iloc[:, [2, 1]]
    df = df.dropna()

    # Combine the columns into a single column separated by a tab
    df['sentence_pairs'] = df.iloc[:, 0].astype(str).str.strip() + '\t' + df.iloc[:, 1].astype(str).str.strip()

    # Remove any quotations from the combined sentence pairs
    df['sentence_pairs'] = df['sentence_pairs'].replace('"', '', regex=False)

    # Write the combined and cleaned sentences to a text file
    df['sentence_pairs'].to_csv(output_txt_path, index=False, header=False, encoding='utf-8')


In [2]:
csv_to_sentence_pairs_pandas_kin('./raw/kinyarwanda.csv', './data/eng-kin.txt')

In [None]:
def remove_quotations(input_file_path, output_file_path):
    # Read the original file
    with open(input_file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    # Replace all quotation marks
    modified_content = content.replace('"', '')
    
    # Write the modified content back to a new file
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(modified_content)

# Usage example
remove_quotations('input.txt', 'output.txt')


In [7]:
def remove_invalid_tab_delimited_entries(input_file_path, output_file_path):
    valid_rows = []
    total_rows = 0
    invalid_rows = 0

    with open(input_file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            # Split line by tab character
            entries = line.strip().split('\t')
            
            # Check if there are at least two entries
            if len(entries) >= 2:
                valid_rows.append(line)
            else:
                invalid_rows += 1
                print(f"Invalid row {line_number} removed: {line.strip()}")
            
            total_rows += 1

    # Write valid rows to a new or the same file
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.writelines(valid_rows)

    # Summary of the process
    print(f"Total rows checked: {total_rows}")
    print(f"Valid rows written to the new file: {len(valid_rows)}")
    print(f"Invalid rows removed: {invalid_rows}")

# Usage example
remove_invalid_tab_delimited_entries('input.txt', 'output.txt')


FileNotFoundError: [Errno 2] No such file or directory: 'input.txt'

In [5]:
import os
import csv

def aggregate_translations(directory_path, output_file):
    # Initialize an empty list to store the translations
    translations = []

    # Iterate through every file in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith('.csv'):
            # Construct the full file path
            file_path = os.path.join(directory_path, filename)
            
            # Open the CSV file
            with open(file_path, newline='', encoding='utf-8') as csvfile:
                reader = csv.reader(csvfile)
                
                # Skip the header
                next(reader, None)
                
                # Extract the second and third columns
                for row in reader:
                    if len(row) >= 3:  # Check if there are at least three columns
                        english = row[1]  # Second column (English)
                        kirundi = row[2]  # Third column (Kirundi)
                        translations.append(f"{english}\t{kirundi}")

    # Write the aggregated translations to a text file
    with open(output_file, 'w', encoding='utf-8') as f:
        for translation in translations:
            f.write(f"{translation}\n")

# Usage
directory_path = './raw/bible/'
output_file = 'aggregated_translations.txt'
aggregate_translations(directory_path, output_file)


In [10]:
remove_invalid_tab_delimited_entries('./aggregated_translations.txt', 'aggregate_translations_clean.txt')

Invalid row 943 removed: 
Invalid row 944 removed: 
Invalid row 945 removed: 
Invalid row 946 removed: 
Invalid row 947 removed: 
Invalid row 948 removed: 
Invalid row 949 removed: 
Invalid row 950 removed: 
Invalid row 951 removed: 
Invalid row 952 removed: 
Invalid row 953 removed: 
Invalid row 954 removed: 
Invalid row 955 removed: 
Invalid row 956 removed: 
Invalid row 957 removed: 
Invalid row 958 removed: 
Invalid row 959 removed: 
Invalid row 960 removed: 
Invalid row 961 removed: 
Invalid row 2676 removed: 
Invalid row 3260 removed: 
Invalid row 3261 removed: 
Invalid row 4078 removed: 
Invalid row 4079 removed: 
Invalid row 4080 removed: 
Invalid row 4081 removed: 
Invalid row 4082 removed: 
Invalid row 4083 removed: 
Invalid row 4084 removed: 
Invalid row 4085 removed: 
Invalid row 4086 removed: 
Invalid row 4087 removed: 
Invalid row 4088 removed: 
Invalid row 4089 removed: 
Invalid row 4090 removed: 
Invalid row 4091 removed: 
Invalid row 4092 removed: 
Invalid row 4093 rem

In [11]:
import random

def split_data(input_file, train_file, test_file, test_ratio=0.25):
    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Shuffle the lines to ensure random sampling
    random.shuffle(lines)
    
    # Calculate the number of lines for the test set
    num_test = int(len(lines) * test_ratio)
    
    # Split the data
    test_lines = lines[:num_test]
    train_lines = lines[num_test:]
    
    # Write the test set to a new file
    with open(test_file, 'w', encoding='utf-8') as file:
        file.writelines(test_lines)
    
    # Write the remaining data back to the original file as the training set
    with open(train_file, 'w', encoding='utf-8') as file:
        file.writelines(train_lines)

# Usage
input_file = './aggregate_translations_clean.txt'
train_file = './data/eng-kir.txt'
test_file = './data/eng-kir_test.txt'
split_data(input_file, train_file, test_file)