In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import StratifiedShuffleSplit

import data_preparation_resources as dpr

# Label condensation stages

In [None]:
# Output directory (relative path) for every dataset and split file written by this notebook.
dst = "../../../data/traindata"

In [None]:
# Label condensation level. Only "none" and "full" are handled in this notebook:
# "medium" has no entry in condensation_dataset_names and no label mapping.
label_condensation = "full"
# File-name suffix per condensation level; only here to be consistent with
# other scripts that do have label condensation.
condensation_dataset_names = {"none":"", "full":"_condensed"}

In [None]:
# Label dictionary after removing "foreign".
label_to_id = {
    "yes":0,
    "no":1,
    "unint":2,
    }

# Condensed label dictionaries:
#   "none" keeps the three classes as they are,
#   "full" folds "unint" into "no" so that a binary yes/no label remains.
if label_condensation == "none":
    label_to_condensed_id = label_to_id
    condensed_id_to_label = {val:key for key, val in label_to_id.items()}
elif label_condensation == "full":
    label_to_condensed_id = {
        "yes":0,
        "no":1,
        "unint":1
    }
    condensed_id_to_label = {
        0:"yes",
        1:"no"
    }
else:
    # Fail here instead of printing: otherwise the mappings stay undefined (or
    # stale from an earlier run) and a later cell breaks in a confusing way.
    raise ValueError(f"unknown condensation level: {label_condensation!r}")

# V1: dataset with confident labels

## Load confident human labels

In [None]:
src = "../../../../data/labelled_samples_with_ids"
dimension = "[SPEECH][hate]"
fname = "confident_examples_hate{}"\
    .format(condensation_dataset_names[label_condensation])
cols = ["tweet_id", "text", dimension]


def _load_annotations(annotator, label_col):
    """Read one annotator file, drop "foreign" rows and condense the labels.

    The label column `dimension` is renamed to `label_col`. The returned frame
    is an independent copy, so assigning to it does not touch a slice.
    """
    annotations = pd.read_csv(
        Path(src, annotator + ".csv"),
        dtype={"tweet_id":str},
        delimiter=";",
        usecols=cols
    ).rename(columns={dimension:label_col})
    annotations = annotations[annotations[label_col] != "foreign"].copy()
    annotations[label_col] = annotations[label_col].replace(label_to_condensed_id)
    return annotations


# An example is "confident" when both annotators of a pair gave the same
# (condensed) label to the same tweet.
agreed_per_pair = []
for pair in dpr.label_pairs:
    df1 = _load_annotations(pair[0], "label_1")
    df2 = _load_annotations(pair[1], "label_2")

    # Join on tweet_id instead of aligning the two frames by position: the
    # positional variant silently pairs the wrong rows as soon as one file
    # contains a duplicated tweet_id. The inner join keeps only tweets rated by
    # both annotators and preserves the (sorted) order of the left frame.
    df = df1.sort_values(by="tweet_id").merge(
        df2[["tweet_id", "label_2"]],
        on="tweet_id",
        how="inner"
    )
    df = df[df["label_1"] == df["label_2"]]
    agreed_per_pair.append(
        df[["tweet_id", "text", "label_1"]].rename(columns={"label_1":"label"})
    )

# Concatenate once instead of growing a frame inside the loop.
confident_examples = pd.concat(agreed_per_pair, ignore_index=True)
confident_examples["label"] = confident_examples["label"].astype(int)
confident_examples = dpr.clean_text(confident_examples)
confident_examples.to_csv(Path(dst,fname + ".csv"), index=False, sep=";")

In [None]:
# Class balance of the confident examples. Label ids without condensation:
#    "yes":0,
#    "no":1,
#    "unint":2,
# With label_condensation == "full", "unint" shares id 1 with "no".
confident_examples["label"].value_counts()

In [None]:
# Number of confident examples (rows) in the V1 dataset.
len(confident_examples)

## Create training splits

In [None]:
fname = "confident_examples_hate{}.csv"\
    .format(condensation_dataset_names[label_condensation])
data = pd.read_csv(
    Path(dst, fname),
    dtype={"tweet_id":str, "label":int, "text":str},
    delimiter=";"
)
test_frac = 0.15
eval_frac = 0.15
# Base name shared by every file written below.
tmp_fname = "confident_examples_hate{}"\
    .format(condensation_dataset_names[label_condensation])

# Five stratified train/test/eval splits, numbered 1..5 in the file names.
for s, seed in enumerate([42, 43, 44, 45, 46]):
    # get the eval data
    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=eval_frac,
        random_state=seed
    )
    # split() yields positional indices, so select rows with .iloc.
    tmp_index, eval_index = next(sss.split(data["text"], data["label"]))
    tmp = data[["text", "label"]].iloc[tmp_index].reset_index(drop=True)
    evaldata = data[["text", "label"]].iloc[eval_index]

    # get the test data; the fraction is rescaled so that the test set is
    # test_frac of the full data rather than of the remainder.
    # The second stage is seeded with s + 10, not `seed`; left unchanged so
    # that previously generated splits stay reproducible.
    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_frac / (1 - eval_frac),
        random_state=s + 10
    )
    train_index, test_index = next(sss.split(tmp["text"], tmp["label"]))
    traindata = tmp.iloc[train_index]
    testdata = tmp.iloc[test_index]

    traindata.to_csv(Path(dst, tmp_fname + f"_train_{s+1}.csv"), index=False, sep=";")
    testdata.to_csv(Path(dst, tmp_fname + f"_test_{s+1}.csv"), index=False, sep=";")
    evaldata.to_csv(Path(dst, tmp_fname + f"_eval_{s+1}.csv"), index=False, sep=";")
data.to_csv(Path(dst, tmp_fname + "_full.csv"), index=False, sep=";")

In [None]:
# Upload the V1 dataset and its splits to the cluster. The glob also matches any
# "confident_examples_hate_aug-trans*" files that already exist in the folder.
! rsync -avze ssh ../../../data/traindata/confident_examples_hate* jlasse@nvcluster:/home/jlasse/GermanHass/speech-hate_analysis/data_preparation/

# V2: augmented minority class examples

## Add minority example translations

In [None]:
# Fetch translations for the confident examples with label id 0 ("yes").
# NOTE(review): assumes the second argument lists the label ids to select — confirm in dpr.
translations = dpr.select_translations(confident_examples, [0])

In [None]:
# Number of translated examples selected above.
len(translations)

In [None]:
cols = ["tweet_id", "text", "label"]
# Add translated copies of the minority classes to the confident examples:
# label ids 0 and 2 without condensation, only label id 0 with full condensation.
if label_condensation == "none":
    translations = dpr.select_translations(confident_examples, [0, 2])
    translations_yes = translations[translations["label"] == 0]
    translations_unint = translations[translations["label"] == 2]
    new_confident_examples = pd.concat([
        confident_examples,
        translations_yes[cols],
        translations_unint[cols],
    ])
elif label_condensation == "full":
    translations = dpr.select_translations(confident_examples, [0])
    new_confident_examples = pd.concat([
        confident_examples,
        translations[cols],
    ])
else:
    # Fail here instead of printing, otherwise the lines below would run on a
    # missing or stale new_confident_examples.
    raise ValueError(f"unknown condensation level: {label_condensation!r}")

# Shuffle the rows with a fixed seed. The row order is written to disk and
# feeds the seeded StratifiedShuffleSplit later on, so an unseeded shuffle
# made the resulting splits differ from run to run.
new_confident_examples = (
    new_confident_examples
    .reset_index(drop=True)
    .sample(frac=1, replace=False, random_state=42)
)
new_confident_examples["label"].value_counts()

In [None]:
# Persist the translation-augmented dataset (V2) as a ";"-separated csv.
condensation_suffix = condensation_dataset_names[label_condensation]
fname = "confident_examples_hate_aug-trans{}.csv".format(condensation_suffix)
new_confident_examples.to_csv(Path(dst) / fname, sep=";", index=False)

## Create training splits

In [None]:
fname = "confident_examples_hate_aug-trans{}.csv"\
    .format(condensation_dataset_names[label_condensation])
data = pd.read_csv(
    Path(dst, fname),
    dtype={"tweet_id":str, "label":int, "text":str},
    delimiter=";"
)
test_frac = 0.15
eval_frac = 0.15
# Base name shared by every file written below.
tmp_fname = "confident_examples_hate_aug-trans{}"\
    .format(condensation_dataset_names[label_condensation])

# NOTE(review): the dataset already contains translated copies of minority-class
# tweets, so a tweet and its translation can end up in different splits —
# confirm that this train/test overlap is acceptable.

# Five stratified train/test/eval splits, numbered 1..5 in the file names.
for s, seed in enumerate([42, 43, 44, 45, 46]):
    # get the eval data
    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=eval_frac,
        random_state=seed
    )
    # split() yields positional indices, so select rows with .iloc.
    tmp_index, eval_index = next(sss.split(data["text"], data["label"]))
    tmp = data[["text", "label"]].iloc[tmp_index].reset_index(drop=True)
    evaldata = data[["text", "label"]].iloc[eval_index]

    # get the test data; the fraction is rescaled so that the test set is
    # test_frac of the full data rather than of the remainder.
    # The second stage is seeded with s + 10, not `seed`; left unchanged so
    # that previously generated splits stay reproducible.
    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_frac / (1 - eval_frac),
        random_state=s + 10
    )
    train_index, test_index = next(sss.split(tmp["text"], tmp["label"]))
    traindata = tmp.iloc[train_index]
    testdata = tmp.iloc[test_index]

    traindata.to_csv(Path(dst, tmp_fname + f"_train_{s+1}.csv"), index=False, sep=";")
    testdata.to_csv(Path(dst, tmp_fname + f"_test_{s+1}.csv"), index=False, sep=";")
    evaldata.to_csv(Path(dst, tmp_fname + f"_eval_{s+1}.csv"), index=False, sep=";")
data.to_csv(Path(dst, tmp_fname + "_full.csv"), index=False, sep=";")

In [None]:
# Upload the translation-augmented dataset and its splits to the cluster. The glob
# also matches "confident_examples_hate_aug-trans-inferred*" files if they exist.
! rsync -avze ssh ../../../data/traindata/confident_examples_hate_aug-trans* jlasse@nvcluster:/home/jlasse/GermanHass/speech-hate_analysis/data_preparation/

In [None]:
# Class balance of `traindata`; when the notebook is run in order this is the
# training part of the last split written by the loop above.
traindata["label"].value_counts()

# V3: human + inferred labels (round 1)

## Load existing confident examples

In [None]:
# Reload the translation-augmented dataset (V2) from disk as the current confident set.
condensation_suffix = condensation_dataset_names[label_condensation]
fname = "confident_examples_hate_aug-trans{}.csv".format(condensation_suffix)
confident_examples = pd.read_csv(
    Path(dst) / fname,
    sep=";",
    dtype={"tweet_id":str, "label":int, "text":str},
)

## Load remaining human annotated examples

In [None]:
src = "../../../../data/labelled_samples_with_ids"
dimension = "[SPEECH][hate]"
cols = ["tweet_id", "text", dimension]

# Collect the labels of the first annotator of every pair (pair[0]).
# NOTE(review): the second annotator's file is not read here — confirm intended.
labelled_parts = []
for pair in dpr.label_pairs:
    df = pd.read_csv(
        Path(src, pair[0] + ".csv"),
        dtype={"tweet_id":str},
        delimiter=";",
        usecols=cols
    ).rename(columns={dimension:"label_1"})

    df = df.dropna(subset=["label_1"])
    # Strip surrounding spaces BEFORE filtering "foreign": otherwise a padded
    # " foreign" survives the filter and breaks the integer cast below.
    df = df.assign(label_1=df["label_1"].apply(lambda x: x.strip(" ")))
    df = df[df["label_1"] != "foreign"]
    # assign() returns a new frame, so no value is written into a slice.
    df = df.assign(label_1=df["label_1"].replace(label_to_condensed_id))
    labelled_parts.append(df)

# Concatenate once instead of growing a frame inside the loop.
labelled = pd.concat(labelled_parts, ignore_index=True)
labelled["label_1"] = labelled["label_1"].astype(int)
labelled = dpr.clean_text(labelled)

In [None]:
# Distribution of the (condensed) first-annotator labels.
labelled["label_1"].value_counts()

In [None]:
# Keep only annotated tweets whose tweet_id is not yet part of the confident set;
# .copy() detaches the result from `labelled`.
remaining_examples = labelled[~labelled["tweet_id"].isin(confident_examples["tweet_id"])].copy()

In [None]:
# Number of annotated tweets that are not in the confident set yet.
len(remaining_examples)

In [None]:
# Export the remaining examples (round 1) with the human label in a "label" column.
condensation_suffix = condensation_dataset_names[label_condensation]
fname = "remaining_examples_hate{}_1.csv".format(condensation_suffix)
remaining_export = remaining_examples.rename(columns={"label_1":"label"})
remaining_export.to_csv(Path(dst) / fname, sep=";", index=False)

In [None]:
# Upload all "remaining_examples_hate*" files to the cluster's data_preparation folder.
! rsync -avze ssh ../../../data/traindata/remaining_examples_hate* jlasse@nvcluster:/home/jlasse/GermanHass/speech-hate_analysis/data_preparation/

## Load inferred labels

In [None]:
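# Inference command to run manually on the cluster before downloading the results
# (presumably from the inference folder, given the relative paths):
# python3 infer_hate.py ../best_models/model-twitter-xlm-roberta-base_germanhass_epochs-100_batchsize-64_data-confident_examples_hate_aug-trans_condensed_split-1 ../data_preparation/remaining_examples_hate_condensed_1.csv 2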

In [None]:
# Download inferred data: every file matching "inferred_speech-hate*" in the
# cluster's inference folder is copied to ../../../data/inference/.
! rsync -avze ssh jlasse@nvcluster:/home/jlasse/GermanHass/speech-hate_analysis/inference/inferred_speech-hate* ../../../data/inference/

In [None]:
# Read the model predictions for the remaining examples (round 1); the
# "speech-hate" column becomes "label_2" next to the human "label_1".
# NOTE(review): the file name starts with "inferred_hate", whereas the download
# cell fetches "inferred_speech-hate*" — confirm the file actually arrives.
condensation_suffix = condensation_dataset_names[label_condensation]
fname = "inferred_hate{}_remaining_examples{}_1.csv".format(
    condensation_suffix,
    condensation_suffix,
)
inferred_labels = pd.read_csv(
    Path("../../../data/inference") / fname,
    sep=";",
    usecols=["tweet_id", "speech-hate"],
    dtype={"tweet_id":str, "speech-hate":int},
)
inferred_labels = inferred_labels.rename(columns={"speech-hate":"label_2"})

## Determine label agreement

In [None]:
# Attach the inferred label ("label_2") to every remaining example.
# A "label_2" column left over from an earlier run of this cell is dropped
# first; without that, re-running the cell produced label_2_x / label_2_y and
# the following cells failed. On a first run the drop is a no-op.
remaining_examples = pd.merge(
    remaining_examples.drop(columns=["label_2"], errors="ignore"),
    inferred_labels,
    how="left",
    on="tweet_id"
)

In [None]:
# Discard examples without an inferred label and store the inferred label as int.
remaining_examples = (
    remaining_examples
    .dropna(subset=["label_2"])
    .astype({"label_2": int})
)

In [None]:
# New confident examples: rows where the human label equals the inferred label.
labels_agree = remaining_examples["label_1"].eq(remaining_examples["label_2"])
new_confident_examples = (
    remaining_examples
    .loc[labels_agree, ["tweet_id", "text", "label_1"]]
    .rename(columns={"label_1":"label"})
)
new_confident_examples["label"].value_counts()

## Add augmented minority class examples

In [None]:
cols = ["tweet_id", "text", "label"]
# Extend the existing confident set with the newly agreed examples and with
# translations of the new label-0 examples.
if label_condensation == "none":
    translations = dpr.select_translations(new_confident_examples, [0])
    translations_yes = translations[translations["label"] == 0]
    new_confident_examples = pd.concat([
        confident_examples,
        new_confident_examples,
        translations_yes[cols],
    ])
elif label_condensation == "full":
    translations = dpr.select_translations(new_confident_examples, [0])
    new_confident_examples = pd.concat([
        confident_examples,
        new_confident_examples,
        translations[cols],
    ])
else:
    # Fail here instead of printing, otherwise the un-augmented frame would be
    # shuffled and written out as if it were the full dataset.
    raise ValueError(f"unknown label condensation: {label_condensation!r}")

# Shuffle the rows with a fixed seed. The row order is written to disk and
# feeds the seeded StratifiedShuffleSplit later on, so an unseeded shuffle
# made the resulting splits differ from run to run.
new_confident_examples = (
    new_confident_examples
    .reset_index(drop=True)
    .sample(frac=1, replace=False, random_state=42)
)
new_confident_examples["label"].value_counts()

In [None]:
# Persist the dataset extended with round-1 inferred labels (V3) as a ";"-separated csv.
condensation_suffix = condensation_dataset_names[label_condensation]
fname = "confident_examples_hate_aug-trans-inferred{}.csv".format(condensation_suffix)
new_confident_examples.to_csv(Path(dst) / fname, sep=";", index=False)

## Create training splits

In [None]:
fname = "confident_examples_hate_aug-trans-inferred{}.csv"\
    .format(condensation_dataset_names[label_condensation])
data = pd.read_csv(
    Path(dst, fname),
    dtype={"tweet_id":str, "label":int, "text":str},
    delimiter=";"
)
test_frac = 0.15
eval_frac = 0.15
# Base name shared by every file written below.
tmp_fname = "confident_examples_hate_aug-trans-inferred{}"\
    .format(condensation_dataset_names[label_condensation])

# NOTE(review): the dataset already contains translated copies of minority-class
# tweets, so a tweet and its translation can end up in different splits —
# confirm that this train/test overlap is acceptable.

# Five stratified train/test/eval splits, numbered 1..5 in the file names.
for s, seed in enumerate([42, 43, 44, 45, 46]):
    # get the eval data
    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=eval_frac,
        random_state=seed
    )
    # split() yields positional indices, so select rows with .iloc.
    tmp_index, eval_index = next(sss.split(data["text"], data["label"]))
    tmp = data[["text", "label"]].iloc[tmp_index].reset_index(drop=True)
    evaldata = data[["text", "label"]].iloc[eval_index]

    # get the test data; the fraction is rescaled so that the test set is
    # test_frac of the full data rather than of the remainder.
    # The second stage is seeded with s + 10, not `seed`; left unchanged so
    # that previously generated splits stay reproducible.
    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_frac / (1 - eval_frac),
        random_state=s + 10
    )
    train_index, test_index = next(sss.split(tmp["text"], tmp["label"]))
    traindata = tmp.iloc[train_index]
    testdata = tmp.iloc[test_index]

    traindata.to_csv(Path(dst, tmp_fname + f"_train_{s+1}.csv"), index=False, sep=";")
    testdata.to_csv(Path(dst, tmp_fname + f"_test_{s+1}.csv"), index=False, sep=";")
    evaldata.to_csv(Path(dst, tmp_fname + f"_eval_{s+1}.csv"), index=False, sep=";")
data.to_csv(Path(dst, tmp_fname + "_full.csv"), index=False, sep=";")

In [None]:
# Upload the V3 dataset and its splits to the cluster. The glob also matches
# "confident_examples_hate_aug-trans-inferred2*" files if they exist.
! rsync -avze ssh ../../../data/traindata/confident_examples_hate_aug-trans-inferred* jlasse@nvcluster:/home/jlasse/GermanHass/speech-hate_analysis/data_preparation/

# V4: human + inferred labels (round 2)

## Load existing confident examples

In [None]:
# Reload the V3 dataset (human + round-1 inferred labels) as the current confident set.
condensation_suffix = condensation_dataset_names[label_condensation]
fname = "confident_examples_hate_aug-trans-inferred{}.csv".format(condensation_suffix)
confident_examples = pd.read_csv(
    Path(dst) / fname,
    sep=";",
    dtype={"tweet_id":str, "label":int, "text":str},
)

## Load remaining human annotated examples

In [None]:
src = "../../../../data/labelled_samples_with_ids"
dimension = "[SPEECH][hate]"
cols = ["tweet_id", "text", dimension]

# Collect the labels of the first annotator of every pair (pair[0]).
# NOTE(review): the second annotator's file is not read here — confirm intended.
labelled_parts = []
for pair in dpr.label_pairs:
    df = pd.read_csv(
        Path(src, pair[0] + ".csv"),
        dtype={"tweet_id":str},
        delimiter=";",
        usecols=cols
    ).rename(columns={dimension:"label_1"})

    df = df.dropna(subset=["label_1"])
    # Strip surrounding spaces BEFORE filtering "foreign": otherwise a padded
    # " foreign" survives the filter and breaks the integer cast below.
    df = df.assign(label_1=df["label_1"].apply(lambda x: x.strip(" ")))
    df = df[df["label_1"] != "foreign"]
    # assign() returns a new frame, so no value is written into a slice.
    df = df.assign(label_1=df["label_1"].replace(label_to_condensed_id))
    labelled_parts.append(df)

# Concatenate once instead of growing a frame inside the loop.
labelled = pd.concat(labelled_parts, ignore_index=True)
labelled["label_1"] = labelled["label_1"].astype(int)
labelled = dpr.clean_text(labelled)

In [None]:
# Distribution of the (condensed) first-annotator labels.
labelled["label_1"].value_counts()

In [None]:
# Keep only annotated tweets whose tweet_id is not yet part of the confident set;
# .copy() detaches the result from `labelled`.
remaining_examples = labelled[~labelled["tweet_id"].isin(confident_examples["tweet_id"])].copy()

In [None]:
# Number of annotated tweets still outside the confident set after round 1.
len(remaining_examples)

In [None]:
# Export the remaining examples (round 2) with the human label in a "label" column.
condensation_suffix = condensation_dataset_names[label_condensation]
fname = "remaining_examples_hate{}_2.csv".format(condensation_suffix)
remaining_export = remaining_examples.rename(columns={"label_1":"label"})
remaining_export.to_csv(Path(dst) / fname, sep=";", index=False)

In [None]:
! rsync -avze ssh ../../../data/traindata/remaining_examples_hate* jlasse@nvcluster:/home/jlasse/GermanHass/hate_analysis/data_preparation/

## Load inferred labels

In [None]:
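# Inference command to run manually on the cluster before downloading the results
# (presumably from the inference folder, given the relative paths):
# python3 infer_hate.py ../best_models/model-twitter-xlm-roberta-base_germanhass_epochs-100_batchsize-64_data-confident_examples_hate_aug-trans-inferred_condensed_split-5 ../data_preparation/remaining_examples_hate_condensed_2.csv 2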

In [None]:
# Download inferred data: every file matching "inferred_speech-hate*" in the
# cluster's inference folder is copied to ../../../data/inference/.
! rsync -avze ssh jlasse@nvcluster:/home/jlasse/GermanHass/speech-hate_analysis/inference/inferred_speech-hate* ../../../data/inference/

In [None]:
# Read the model predictions for the remaining examples (round 2); the
# "speech-hate" column becomes "label_2" next to the human "label_1".
# NOTE(review): the file name starts with "inferred_hate", whereas the download
# cell fetches "inferred_speech-hate*" — confirm the file actually arrives.
condensation_suffix = condensation_dataset_names[label_condensation]
fname = "inferred_hate{}_remaining_examples{}_2.csv".format(
    condensation_suffix,
    condensation_suffix,
)
inferred_labels = pd.read_csv(
    Path("../../../data/inference") / fname,
    sep=";",
    usecols=["tweet_id", "speech-hate"],
    dtype={"tweet_id":str, "speech-hate":int},
)
inferred_labels = inferred_labels.rename(columns={"speech-hate":"label_2"})

## Determine label agreement

In [None]:
# Attach the inferred label ("label_2") to every remaining example.
# A "label_2" column left over from an earlier run of this cell is dropped
# first; without that, re-running the cell produced label_2_x / label_2_y and
# the following cells failed. On a first run the drop is a no-op.
remaining_examples = pd.merge(
    remaining_examples.drop(columns=["label_2"], errors="ignore"),
    inferred_labels,
    how="left",
    on="tweet_id"
)

In [None]:
# Discard examples without an inferred label and store the inferred label as int.
remaining_examples = (
    remaining_examples
    .dropna(subset=["label_2"])
    .astype({"label_2": int})
)

In [None]:
# New confident examples: rows where the human label equals the inferred label.
labels_agree = remaining_examples["label_1"].eq(remaining_examples["label_2"])
new_confident_examples = (
    remaining_examples
    .loc[labels_agree, ["tweet_id", "text", "label_1"]]
    .rename(columns={"label_1":"label"})
)
new_confident_examples["label"].value_counts()

## Add augmented minority class examples

In [None]:
cols = ["tweet_id", "text", "label"]
# Extend the existing confident set with the newly agreed examples and with
# translations of the new label-0 examples.
if label_condensation == "none":
    translations = dpr.select_translations(new_confident_examples, [0])
    translations_yes = translations[translations["label"] == 0]
    new_confident_examples = pd.concat([
        confident_examples,
        new_confident_examples,
        translations_yes[cols],
    ])
elif label_condensation == "full":
    translations = dpr.select_translations(new_confident_examples, [0])
    new_confident_examples = pd.concat([
        confident_examples,
        new_confident_examples,
        translations[cols],
    ])
else:
    # Fail here instead of printing, otherwise the un-augmented frame would be
    # shuffled and written out as if it were the full dataset.
    raise ValueError(f"unknown label condensation: {label_condensation!r}")

# Shuffle the rows with a fixed seed. The row order is written to disk and
# feeds the seeded StratifiedShuffleSplit later on, so an unseeded shuffle
# made the resulting splits differ from run to run.
new_confident_examples = (
    new_confident_examples
    .reset_index(drop=True)
    .sample(frac=1, replace=False, random_state=42)
)
new_confident_examples["label"].value_counts()

In [None]:
# Persist the dataset extended with round-2 inferred labels (V4) as a ";"-separated csv.
condensation_suffix = condensation_dataset_names[label_condensation]
fname = "confident_examples_hate_aug-trans-inferred2{}.csv".format(condensation_suffix)
new_confident_examples.to_csv(Path(dst) / fname, sep=";", index=False)

## Create training splits

In [None]:
fname = "confident_examples_hate_aug-trans-inferred2{}.csv"\
    .format(condensation_dataset_names[label_condensation])
data = pd.read_csv(
    Path(dst, fname),
    dtype={"tweet_id":str, "label":int, "text":str},
    delimiter=";"
)
test_frac = 0.15
eval_frac = 0.15
# Base name shared by every file written below.
tmp_fname = "confident_examples_hate_aug-trans-inferred2{}"\
    .format(condensation_dataset_names[label_condensation])

# NOTE(review): the dataset already contains translated copies of minority-class
# tweets, so a tweet and its translation can end up in different splits —
# confirm that this train/test overlap is acceptable.

# Five stratified train/test/eval splits, numbered 1..5 in the file names.
for s, seed in enumerate([42, 43, 44, 45, 46]):
    # get the eval data
    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=eval_frac,
        random_state=seed
    )
    # split() yields positional indices, so select rows with .iloc.
    tmp_index, eval_index = next(sss.split(data["text"], data["label"]))
    tmp = data[["text", "label"]].iloc[tmp_index].reset_index(drop=True)
    evaldata = data[["text", "label"]].iloc[eval_index]

    # get the test data; the fraction is rescaled so that the test set is
    # test_frac of the full data rather than of the remainder.
    # The second stage is seeded with s + 10, not `seed`; left unchanged so
    # that previously generated splits stay reproducible.
    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_frac / (1 - eval_frac),
        random_state=s + 10
    )
    train_index, test_index = next(sss.split(tmp["text"], tmp["label"]))
    traindata = tmp.iloc[train_index]
    testdata = tmp.iloc[test_index]

    traindata.to_csv(Path(dst, tmp_fname + f"_train_{s+1}.csv"), index=False, sep=";")
    testdata.to_csv(Path(dst, tmp_fname + f"_test_{s+1}.csv"), index=False, sep=";")
    evaldata.to_csv(Path(dst, tmp_fname + f"_eval_{s+1}.csv"), index=False, sep=";")
data.to_csv(Path(dst, tmp_fname + "_full.csv"), index=False, sep=";")

In [None]:
# Upload the V4 dataset (round-2 inferred labels) and its splits to the cluster.
! rsync -avze ssh ../../../data/traindata/confident_examples_hate_aug-trans-inferred2* jlasse@nvcluster:/home/jlasse/GermanHass/speech-hate_analysis/data_preparation/