# Prepare the training data to fine-tune the foreign language prediction

In [1]:
import pandas as pd
from os.path import join
import re
from sklearn.model_selection import StratifiedShuffleSplit

In [7]:
# Root of the data folder, given as a relative path (presumably resolved
# against the notebook's working directory -- confirm before running elsewhere).
data_dir = "../../../data"

In [8]:
# Binary target for the "[STYLE]" annotation category: "foreign" is the only
# label mapped to 1, every other style label is mapped to 0.
_non_foreign_styles = (
    "info",
    "opin",
    "quest",
    "conseq",
    "correct",
    "inconsist",
    "sarc",
    "insult-pers",
    "insult-ism",
    "insult-polit",
    "insult-inst",
    "other",
    "unint",
)
label_map = {
    "[STYLE]": {**dict.fromkeys(_non_foreign_styles, 0), "foreign": 1},
}

In [10]:
# Where the labelled samples live and which batch / rater files to combine.
src = "labelled_samples_with_ids"
fname = "batch_{}_{}.csv"
category = "[STYLE]"
raters_initial_batch = ["AS", "LT"]
initial_batch = 1
raters_additional_batches = ["AH", "LT"]
additional_batches = [3, 4, 5]

# Build one dataset per initial rater: that rater's version of the initial
# batch, followed by every additional batch from every additional rater.
data = {}
for initial_rater in raters_initial_batch:
    print(f"initial rater: {initial_rater}")

    # initial batch as labelled by this rater; keep only text + label columns
    initial_path = join(data_dir, src, fname.format(initial_batch, initial_rater))
    raw = pd.read_csv(initial_path, delimiter=";")
    kept = raw[["text", category]].dropna()
    print(f"initial batch dropped {len(raw) - len(kept)} NA entries")
    frames = [kept]

    # additional batches: same treatment, collected and concatenated once below
    for additional_batch in additional_batches:
        for rater in raters_additional_batches:
            batch_path = join(data_dir, src, fname.format(additional_batch, rater))
            raw = pd.read_csv(batch_path, delimiter=";")
            kept = raw[["text", category]].dropna()
            print(f"additional batch {additional_batch} rater {rater}: dropped {len(raw) - len(kept)} NA entries")
            frames.append(kept)

    df = pd.concat(frames).reset_index(drop=True)

    # hardcoded label conversion
    label_to_id = label_map[category]
    # NOTE: several labels share id 0, so this inverse map keeps only the
    # last label seen for each id.
    id_to_label = {val: key for key, val in label_to_id.items()}
    df["label"] = [label_to_id[raw_label] for raw_label in df[category]]
    df = df.drop(columns=[category])
    data[initial_rater] = df


initial rater: AS
initial batch dropped 0 NA entries
additional batch 3 rater AH: dropped 0 NA entries
additional batch 3 rater LT: dropped 0 NA entries
additional batch 4 rater AH: dropped 0 NA entries
additional batch 4 rater LT: dropped 0 NA entries
additional batch 5 rater AH: dropped 0 NA entries
additional batch 5 rater LT: dropped 0 NA entries
initial rater: LT
initial batch dropped 0 NA entries
additional batch 3 rater AH: dropped 0 NA entries
additional batch 3 rater LT: dropped 0 NA entries
additional batch 4 rater AH: dropped 0 NA entries
additional batch 4 rater LT: dropped 0 NA entries
additional batch 5 rater AH: dropped 0 NA entries
additional batch 5 rater LT: dropped 0 NA entries


In [11]:
# clean text
# Compiled once and reused for every row. The MULTILINE flag is carried over
# from the original call; it has no effect on this pattern (no ^ or $ anchors).
url_pattern = re.compile(r"https?:\/\/\S*", flags=re.MULTILINE)


def clean_text(text):
    """Return *text* with URLs removed (nothing else is stripped) and lowercased."""
    return url_pattern.sub("", text).lower()


# Iterate over the same rater list that was used to build `data`, instead of a
# second hard-coded copy, so the cleaning step cannot drift out of sync with
# the loading step.
for rater in raters_initial_batch:
    data[rater]["text"] = data[rater]["text"].apply(clean_text)


In [16]:
# Fractions for the two successive stratified hold-outs. Note that test_frac
# is applied to what remains after the eval split, not to the full dataset.
test_frac = 0.15
eval_frac = 0.15

# Output naming does not depend on rater or seed, so build it once.
# The batch tag lists every batch that went into the dataset, e.g. "1+3+4+5".
tmp_fname = fname.split(".")[0]
batch = "+".join(str(b) for b in [initial_batch, *additional_batches])
dst = join(data_dir, "traindata", "foreign")

for rater in raters_initial_batch:
    samples = data[rater][["text", "label"]]

    # One train / test / eval triple per seed; files are numbered from 1.
    for s, seed in enumerate([42, 43, 44, 45, 46]):
        # get the eval data
        sss = StratifiedShuffleSplit(
            n_splits=1,
            test_size=eval_frac,
            random_state=seed,
        )
        # split() yields positional indices, hence .iloc; n_splits=1 means
        # there is exactly one (remaining, eval) pair to take.
        tmp_index, eval_index = next(sss.split(samples["text"], samples["label"]))
        tmp = samples.iloc[tmp_index].reset_index(drop=True)
        evaldata = samples.iloc[eval_index]

        # get the test data from what is left after removing the eval data
        sss = StratifiedShuffleSplit(
            n_splits=1,
            test_size=test_frac,
            random_state=s + 10,
        )
        train_index, test_index = next(sss.split(tmp["text"], tmp["label"]))
        traindata = tmp.iloc[train_index]
        # Fix: the test set must come from the test indices. It was previously
        # built from the eval split, which made the test and eval files
        # identical and silently discarded the real test samples.
        testdata = tmp.iloc[test_index]

        prefix = tmp_fname.format(batch, rater)
        traindata.to_csv(join(dst, prefix + f"_train_{s+1}.csv"), index=False)
        testdata.to_csv(join(dst, prefix + f"_test_{s+1}.csv"), index=False)
        evaldata.to_csv(join(dst, prefix + f"_eval_{s+1}.csv"), index=False)