In [None]:
"""This script cleans the data collected by the web scraping script.
It removes the timestamp at the beginning of each line,
then pairs the English text with the French text.
It writes the cleaned data to a new CSV file."""

In [28]:
import xml.etree.ElementTree as ET
import csv
import glob
import re

In [29]:
def parse_xml(file_path):
    """Parse a subtitle XML file and return its cleaned section texts.

    Every direct ``section`` child of the root element contributes one entry,
    taken from its ``text`` attribute with a leading ``NN:NN`` time stamp
    removed and surrounding whitespace stripped.

    Args:
        file_path: Anything ``ET.parse`` accepts (a path or a file object).

    Returns:
        list[str]: One cleaned string per ``section`` element, in document
        order. A section that has no ``text`` attribute yields ``""``.
    """
    root = ET.parse(file_path).getroot()
    texts = []
    for section in root.findall("section"):
        # Default to "" when the attribute is absent: re.sub would raise
        # TypeError on None, and keeping an (empty) entry instead of skipping
        # the section preserves the index of every following line.
        text = section.get("text", "")
        # Drop the time stamp at the very start of the line, then trim.
        cleaned_text = re.sub(r"^\d{2}:\d{2}", "", text).strip()
        texts.append(cleaned_text)
    return texts

In [30]:
def write_to_csv(all_texts, output_file):
    """Write paired English/French lines to ``output_file`` as UTF-8 CSV.

    Args:
        all_texts: Iterable of ``(en_texts, fr_texts)`` pairs, each element
            being a sequence of strings.
        output_file: Destination path; an existing file is overwritten.

    The file starts with the header row ``en_text,fr_text``. Within each pair
    the lines are matched positionally with ``zip``, so surplus lines in the
    longer sequence are not written.
    """
    header = ["en_text", "fr_text"]
    with open(output_file, mode="w", newline="", encoding="utf-8") as handle:
        csv_writer = csv.writer(handle)
        csv_writer.writerow(header)
        for english_lines, french_lines in all_texts:
            # One CSV row per aligned (English, French) line.
            csv_writer.writerows(zip(english_lines, french_lines))

In [31]:
# Gather the per-language XML files from the current working directory.
# Both lists are sorted so that index i in one list lines up with index i
# in the other.
en_files = sorted(glob.glob("english-*.xml"))
fr_files = sorted(glob.glob("french-*.xml"))

# Pairing below is purely positional, so at minimum the counts must agree.
assert len(en_files) == len(fr_files), "The number of files are not the same."

# Parse each (English, French) file pair into a tuple of text lists;
# the English file of a pair is parsed before its French counterpart.
all_texts = [
    (parse_xml(en_path), parse_xml(fr_path))
    for en_path, fr_path in zip(en_files, fr_files)
]

In [33]:
# Destination of the combined English/French CSV, relative to the
# notebook's working directory.
output_file = "../data/raw/subtitles_combined.csv"

# Persist every parsed (English, French) pair collected in all_texts.
write_to_csv(all_texts, output_file=output_file)