In [2]:
def clean_text_file(input_path, output_path, encoding='utf-8'):
    """
    Copy a tab-separated text file, keeping only the first two fields of each line.

    The second tab and everything after it are dropped. Lines that contain no
    tab at all are copied through unchanged.

    Args:
        input_path (str): The path to the input file.
        output_path (str): The path to the output file where the cleaned lines
            will be saved. An existing file is overwritten.
        encoding (str): Text encoding used to read and write both files.
            Defaults to UTF-8 so the result does not depend on the platform's
            locale.
    """
    with open(input_path, 'r', encoding=encoding) as infile, \
            open(output_path, 'w', encoding=encoding) as outfile:
        for line in infile:
            # Drop the line terminator *before* splitting: on a line with
            # exactly one tab the second field would otherwise still carry its
            # '\n', and re-adding one below would emit a spurious blank line.
            # Only the first two fields are needed, so stop after two splits.
            fields = line.rstrip('\n').split('\t', 2)
            if len(fields) > 1:
                outfile.write(f"{fields[0]}\t{fields[1]}\n")
            else:
                # No tab on this line: pass it through exactly as read.
                outfile.write(line)

In [3]:
# Write a copy of ./data/fin.txt that keeps only the leading tab-separated columns.
clean_text_file(
    input_path='./data/fin.txt',
    output_path='./data/eng-fin.txt',
)

In [5]:
import os

def clean_directory(input_dir, output_dir, encoding='utf-8'):
    """
    Clean every .txt file in a directory, keeping only the first two tab-separated fields per line.

    For each regular file ``<name>.txt`` in ``input_dir`` a file named
    ``eng-<name>.txt`` is written to ``output_dir``. On every line the second
    tab and everything after it are dropped; lines without a tab are copied
    unchanged. Inputs whose output file already exists are skipped, so the
    function can be re-run without redoing finished files.

    Args:
        input_dir (str): The directory containing the input .txt files.
        output_dir (str): The directory to save the output .txt files. It is
            created (including parents) if it does not exist.
        encoding (str): Text encoding used to read and write the files.
            Defaults to UTF-8 so the result does not depend on the platform's
            locale.
    """
    # Make sure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if not filename.endswith('.txt'):
            continue

        input_path = os.path.join(input_dir, filename)
        # A sub-directory whose name happens to end in '.txt' cannot be opened
        # as a file; skip it instead of aborting the whole run.
        if not os.path.isfile(input_path):
            continue

        output_path = os.path.join(output_dir, f"eng-{filename}")
        # Skip processing if the output file already exists
        if os.path.exists(output_path):
            continue

        with open(input_path, 'r', encoding=encoding) as infile, \
                open(output_path, 'w', encoding=encoding) as outfile:
            for line in infile:
                # Drop the line terminator *before* splitting: on a line with
                # exactly one tab the second field would otherwise still carry
                # its '\n', and re-adding one below would emit a blank line.
                fields = line.rstrip('\n').split('\t', 2)
                if len(fields) > 1:
                    outfile.write(f"{fields[0]}\t{fields[1]}\n")
                else:
                    # No tab on this line: pass it through exactly as read.
                    outfile.write(line)

In [6]:
# Clean every .txt file found in ./raw/ and store the results in ./data/.
clean_directory(
    input_dir='./raw/',
    output_dir='./data/',
)