In [18]:
import os
import pandas as pd
import numpy as np

# Language inventory: a semicolon-separated file that provides a "Code" column.
language_data = pd.read_csv("../data/languages.csv", sep=";")

# Source key -> path template of the per-language CSV for that source.
# The insertion order here is the key order inside each sources_data[code].
source_path_templates = {
    "cv": "../data/cv/{code}-word.csv",
    "pud": "../data/pud/{code}_pud.csv",
    "pud_pinyin": "../data/pud/{code}_pud_pinyin.csv",
    "pud_strokes": "../data/pud/{code}_pud_strokes.csv",
}

# sources_data[code][source] is the raw table read from disk; a source whose
# file does not exist is simply absent from that language's dict.
sources_data = {}
# Only the "Code" column is needed, so iterate it directly: iterrows() would
# build a Series per row and does not preserve the column's dtype.
for code in language_data["Code"]:
    sources_data[code] = {}
    for source, path_template in source_path_templates.items():
        file_path = path_template.format(code=code)
        if os.path.exists(file_path):
            sources_data[code][source] = pd.read_csv(file_path)

def preprocess_cv(code: str, data: pd.DataFrame) -> None:
    """Write the preprocessed "cv" word table for one language.

    Reads the ``word``, ``n_characters``, ``medianDuration`` and ``frequency``
    columns of *data* and writes ``../data/preprocessed/{code}_cv.csv`` with
    the columns ``Word``, ``Length of characters``, ``Median duration``,
    ``Frequency rank`` and ``Relative frequency``, ordered by ascending rank.

    Args:
        code: Identifier used to build the output file name.
        data: Table holding the four input columns listed above.

    Raises:
        KeyError: If *data* lacks one of the required columns.
    """
    output_path = f"../data/preprocessed/{code}_cv.csv"
    processed_data = pd.DataFrame()

    total_frequency = data['frequency'].sum()

    processed_data['Word'] = data['word']
    processed_data['Length of characters'] = data['n_characters']
    processed_data['Median duration'] = data['medianDuration']
    # Rank 1 is the most frequent word; with method='max' every word in a tie
    # receives the largest rank number of its group.
    processed_data['Frequency rank'] = data['frequency'].rank(method='max', ascending=False).astype(int)
    # Share of the summed frequency contributed by each word.
    processed_data['Relative frequency'] = data['frequency'] / total_frequency

    # Ties are common under method='max'; a stable sort keeps tied words in
    # their input order so the written file is reproducible.
    processed_data = processed_data.sort_values(by='Frequency rank', ascending=True, kind='mergesort')

    # to_csv refuses to write into a missing directory, so create it first.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    processed_data.to_csv(output_path, index=False)

def preprocess_pud(code: str, data: pd.DataFrame) -> None:
    """Write the preprocessed "pud" word table for one language or variant.

    Reads the ``word``, ``n_characters`` and ``frequency`` columns of *data*
    and writes ``../data/preprocessed/{code}_pud.csv`` with the columns
    ``Word``, ``Length of characters``, ``Frequency rank`` and
    ``Relative frequency``, ordered by ascending rank.

    Args:
        code: Identifier used to build the output file name.
        data: Table holding the three input columns listed above.

    Raises:
        KeyError: If *data* lacks one of the required columns.
    """
    output_path = f"../data/preprocessed/{code}_pud.csv"
    processed_data = pd.DataFrame()

    total_frequency = data['frequency'].sum()

    processed_data['Word'] = data['word']
    processed_data['Length of characters'] = data['n_characters']
    # Rank 1 is the most frequent word; with method='max' every word in a tie
    # receives the largest rank number of its group.
    processed_data['Frequency rank'] = data['frequency'].rank(method='max', ascending=False).astype(int)
    # Share of the summed frequency contributed by each word.
    processed_data['Relative frequency'] = data['frequency'] / total_frequency

    # Ties are common under method='max'; a stable sort keeps tied words in
    # their input order so the written file is reproducible.
    processed_data = processed_data.sort_values(by='Frequency rank', ascending=True, kind='mergesort')

    # to_csv refuses to write into a missing directory, so create it first.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    processed_data.to_csv(output_path, index=False)

# Write one preprocessed CSV for every (language, source) table in sources_data.
for code, sources in sources_data.items():
    print(f"Processing: {code}")
    for source, data in sources.items():
        if source == "cv":
            preprocess_cv(code, data)
            continue
        if not source.startswith("pud"):
            continue
        # The plain "pud" table keeps the bare language code; every other pud
        # source appends its name with 'pud_' removed (e.g. "<code>_pinyin").
        suffix = source.replace('pud_', '')
        pud_code = code if source == "pud" else f"{code}_{suffix}"
        preprocess_pud(pud_code, data)


Processing: eng
Processing: spa
Processing: cat
Processing: ara
Processing: ind
Processing: tur
Processing: zho
Processing: tam
Processing: eus
