## Process Web Data

In [23]:
import pandas as pd

# Example input path for a single web-corpus dump.
# NOTE(review): separate_data() takes its own `file_path` parameter, which
# shadows this module-level name inside the function; no visible cell reads
# this variable directly — confirm it is still needed.
file_path = 'Web Data/europe_west.United_Kingdom.eng.1.IN.gz'
def separate_data(file_path, out_path):
    """Extract the free-text field from a gzipped web dump and save it alone.

    The file is read with a tab separator, so every comma-separated record
    ends up as one unsplit string in a single column named after the raw
    header line. Each record is then cut at its first four commas (dropping
    the leading index/date/domain/word-count fields) and at its last three
    commas (dropping the trailing language/score/prediction fields), which
    leaves any commas inside the text itself untouched.

    Parameters
    ----------
    file_path : str
        Gzip-compressed input whose header line is exactly
        ``,Date,Domain,N_Words,Text,Language,Score,Prediction``.
    out_path : str
        Destination of the gzip-compressed CSV containing only ``Text``.
    """
    raw_header = ',Date,Domain,N_Words,Text,Language,Score,Prediction'
    leading_fields = ['index', 'Date', 'Domain', 'N_Words', 'Text']
    trailing_fields = ['Text', 'Language', 'Score', 'Prediction']

    records = pd.read_csv(file_path, sep='\t', header=0, quotechar='"', compression='gzip')

    # First pass: peel off the four leading fields; 'Text' temporarily holds
    # the text plus the three trailing fields.
    records[leading_fields] = records[raw_header].str.split(',', n=4, expand=True)
    # Second pass: peel the three trailing fields off from the right.
    records[trailing_fields] = records['Text'].str.rsplit(',', n=3, expand=True)

    records[['Text']].to_csv(out_path, index=False, compression='gzip')

In [25]:
# Raw web dumps and the matching per-language output files; the two lists are
# paired by position (PL, FI, EL, PT, EN).
file_paths = [
    'Web Data/europe_east.Poland.pol.6.IN.gz',
    'Web Data/europe_west.Finland.fin.3.IN.gz',
    'Web Data/europe_west.Greece.ell.4.IN.gz',
    'Web Data/europe_west.Portugal.por.2.IN.gz',
    'Web Data/europe_west.United_Kingdom.eng.1.IN.gz',
]
out_paths = [
    'Experiment Data/Web/PL/PL_Web.gz',
    'Experiment Data/Web/FI/FI_Web.gz',
    'Experiment Data/Web/EL/EL_Web.gz',
    'Experiment Data/Web/PT/PT_Web.gz',
    'Experiment Data/Web/EN/EN_Web.gz',
]

# Reduce every dump to its text-only CSV.
for file, out in zip(file_paths, out_paths):
    separate_data(file, out)

## Process TV Data

In [51]:
import srt
import re
import pandas as pd

def clean_subtitle_text(text):
    """Normalise the text of one subtitle cue.

    Steps, in order:
      1. drop markup wrapped in ``<>`` or ``{}`` and lyrics wrapped in ♪...♪;
      2. drop dialogue/aside dashes that touch a space, double quotes, and
         stray ♪ marks;
      3. put a space on both sides of ``. , ! ? ( )``;
      4. collapse every run of whitespace (line breaks included) to a single
         space and trim both ends.

    Dashes and quotes are replaced in a way that keeps the neighbouring words
    apart: 'He said "hello" to me' becomes 'He said hello to me' rather than
    gluing the words together. Hyphens inside a word ('well-known') are kept.

    Parameters
    ----------
    text : str
        Raw cue content, possibly spanning several lines.

    Returns
    -------
    str
        The cleaned text; the empty string when nothing but whitespace is left.
    """
    # Remove substrings wrapped with <> or {} and lyrics between two ♪ marks
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\{.*?\}', '', text)
    text = re.sub(r'♪.*?♪', '', text)
    # Replace a dash that touches a space with a space so that the words on
    # either side stay separated ("Yes - no" -> "Yes no", "- Hi" -> "Hi").
    text = text.replace('- ', ' ').replace(' -', ' ')
    # Drop the quote characters only; the surrounding spaces must survive.
    text = text.replace('"', '')
    # Remove leftover music notes together with their adjacent space
    text = text.replace('♪ ', '').replace(' ♪', '')
    # Add space before and after punctuation marks
    text = re.sub(r'([.,!?()])', r' \1 ', text)
    # Collapse all whitespace runs, including single line breaks inside a cue
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def srt_to_txt(srt_files, csv_file, chunk_size=10):
    """Turn a list of .srt files into a gzipped one-column CSV of text chunks.

    Every cue of every file is cleaned with ``clean_subtitle_text``; cues that
    are empty or whitespace-only after cleaning are skipped. The remaining
    cues are concatenated, in file order, into rows of ``chunk_size``
    consecutive cues joined by a single space (the last row may be shorter,
    and a row may span two files).

    Parameters
    ----------
    srt_files : iterable of str
        Paths of UTF-8 encoded .srt files, processed in the given order.
    csv_file : str
        Destination of the gzip-compressed CSV with a single ``Text`` column.
    chunk_size : int, default 10
        Number of cleaned cues merged into one output row.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be at least 1')

    subtitles = []
    for srt_file in srt_files:
        with open(srt_file, 'r', encoding='utf-8') as f_in:
            raw_srt = f_in.read()
        for sub in srt.parse(raw_srt):
            # Clean once and reuse the result for both the check and the append.
            cleaned = clean_subtitle_text(sub.content)
            # Skip cues that carry no visible text after cleaning.
            if cleaned.strip():
                subtitles.append(cleaned)

    # Group consecutive cues into rows of `chunk_size`
    concat_rows = [
        ' '.join(subtitles[start:start + chunk_size])
        for start in range(0, len(subtitles), chunk_size)
    ]
    new_df = pd.DataFrame({'Text': concat_rows})
    new_df.to_csv(csv_file, index=False, compression='gzip')

In [52]:
import glob

# Subtitle files carry a three-letter language tag in their name; the output
# folders use the two-letter code paired with it by position.
# The loop variable is deliberately not called `input`, which would shadow the
# builtin for the rest of the kernel session.
for lang_tag, output in zip(['ENG', 'FIN', 'GRE', 'POR', 'POL'], ['EN', 'FI', 'EL', 'PT', 'PL']):
    # glob returns matches in arbitrary order; sort so that the episode order,
    # and therefore the grouping of cues into rows, is reproducible.
    srt_files = sorted(glob.glob(f'Reality TV/*{lang_tag}.srt'))
    output_file = f'Experiment Data/TV/{output}/{output}_TV.gz'
    srt_to_txt(srt_files, output_file)
    print(srt_files)
    print(output_file)

['Reality TV\\S1E1-ENG.srt', 'Reality TV\\S1E2-ENG.srt', 'Reality TV\\S1E3-ENG.srt', 'Reality TV\\S1E4-ENG.srt', 'Reality TV\\S1E5-ENG.srt', 'Reality TV\\S1E6-ENG.srt', 'Reality TV\\S1E7-ENG.srt', 'Reality TV\\S1E8-ENG.srt']
Experiment Data/TV/EN/EN_TV.gz
['Reality TV\\S1E1-FIN.srt', 'Reality TV\\S1E2-FIN.srt', 'Reality TV\\S1E3-FIN.srt', 'Reality TV\\S1E4-FIN.srt', 'Reality TV\\S1E5-FIN.srt', 'Reality TV\\S1E6-FIN.srt', 'Reality TV\\S1E7-FIN.srt', 'Reality TV\\S1E8-FIN.srt']
Experiment Data/TV/FI/FI_TV.gz
['Reality TV\\S1E1-GRE.srt', 'Reality TV\\S1E2-GRE.srt', 'Reality TV\\S1E3-GRE.srt', 'Reality TV\\S1E4-GRE.srt', 'Reality TV\\S1E5-GRE.srt', 'Reality TV\\S1E6-GRE.srt', 'Reality TV\\S1E7-GRE.srt']
Experiment Data/TV/EL/EL_TV.gz
['Reality TV\\S1E1-POR.srt', 'Reality TV\\S1E2-POR.srt', 'Reality TV\\S1E3-POR.srt', 'Reality TV\\S1E4-POR.srt', 'Reality TV\\S1E5-POR.srt', 'Reality TV\\S1E6-POR.srt', 'Reality TV\\S1E7-POR.srt', 'Reality TV\\S1E8-POR.srt']
Experiment Data/TV/PT/PT_TV.gz
['Re

## Process Legal Data

In [44]:
from datasets import load_dataset
import re

def clean_text(text):
    """Flatten a document to one line and space out its punctuation.

    Line breaks become spaces, each of ``. , ! ? ( )`` gets a space on both
    sides, and runs of two or more whitespace characters are reduced to a
    single space. The result is not trimmed, so text ending in one of those
    punctuation marks keeps one trailing space.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The normalised text.
    """
    # Line breaks -> spaces
    flattened = ' '.join(text.split('\n'))
    # Surround punctuation marks with spaces (before and after)
    spaced = re.sub(r'([.,!?()])', r' \1 ', flattened)
    # Squeeze runs of whitespace down to one space
    return re.sub(r'\s{2,}', ' ', spaced)

def process_law_data(lang, output_file):
    """Download one language of the 'multi_eurlex' dataset and save its text.

    The train, test and validation splits are stacked in that order, the
    ``text`` field of every document is passed through ``clean_text``, and the
    result is written as a gzip-compressed CSV with a single ``Text`` column.
    The first rows are printed as a quick sanity check.

    Parameters
    ----------
    lang : str
        Language code passed to ``load_dataset`` as ``language``.
    output_file : str
        Destination path of the gzip-compressed CSV.
    """
    # Load the dataset
    # NOTE(review): trust_remote_code=True permits running the loading script
    # shipped with the dataset — keep only while that source is trusted.
    dataset = load_dataset('multi_eurlex', language=lang, trust_remote_code=True)

    # Convert each split to a DataFrame and stack them
    splits = [pd.DataFrame(dataset[split_name]) for split_name in ('train', 'test', 'validation')]

    # Name the column when the frame is built rather than writing into
    # `df.columns.values`, which mutates the Index's backing array in place.
    df = pd.concat(splits)['text'].apply(clean_text).to_frame(name='Text')
    print(df.head())

    df.to_csv(output_file, index=False, compression='gzip')

In [45]:
# Build the legal corpus for each language; the dataset is queried with the
# lower-case code while folders and file names use the upper-case one.
for lang_code in ('EN', 'FI', 'EL', 'PT', 'PL'):
    law_path = f'Experiment Data/Legal/{lang_code}/{lang_code}_Law.gz'
    process_law_data(lang_code.lower(), law_path)

                                                Text
0  COMMISSION DECISION of 6 March 2006 establishi...
1  Commission Regulation ( EC ) No 1330/2003 of 2...
2  Council Regulation ( EC ) No 1786/2003 of 29 S...
3  ***** COMMISSION REGULATION ( EEC ) No 2590/85...
4  COMMISSION REGULATION ( EEC ) No 1103/93 of 30...
                                                Text
0  KOMISSION PÄÄTÖS , tehty 6 päivänä maaliskuuta...
1  Komission asetus ( EY ) N:o 1330/2003 , annett...
2  Neuvoston asetus ( EY ) N:o 1786/2003 , annett...
3  KOMISSION ASETUS ( ETY ) N:o 2590/85 , annettu...
4  KOMISSION ASETUS ( EY ) N:o 1038/2004 , annett...
                                                Text
0  ΑΠΌΦΑΣΗ ΤΗΣ ΕΠΙΤΡΟΠΉΣ της 6ης Μαρτίου 2006 για...
1  Κανονισμός ( ΕΚ ) αριθ . 1330/2003 της Επιτροπ...
2  Κανονισμός ( ΕΚ ) αριθ . 1786/2003 του Συμβουλ...
3  ***** ΚΑΝΟΝΙΣΜΟΣ ( ΕΟΚ ) αριθ . 2590/85 ΤΗΣ ΕΠ...
4  ΚΑΝΟΝΙΣΜΟΣ ( ΕΟΚ ) αριθ . 1103/93 ΤΗΣ ΕΠΙΤΡΟΠΗ...
                                              