In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import StratifiedShuffleSplit

import data_preparation_resources as dpr

# Label condensation stages

In [None]:
# Directory into which the cells below write the generated data sets.
dst = "../../../data/traindata"

In [None]:
# Condensation level used throughout the notebook. Must be one of the keys of
# condensation_dataset_names: "none", "medium", "full" or "full2".
label_condensation = "full2"
# File name suffix that marks the condensation level of a stored data set.
condensation_dataset_names = {
    "none":"",
    "medium":"_halfcondensed",
    "full":"_condensed",
    "full2":"_condensed2"
}

In [None]:
# label dictionary after removing "foreign"
label_to_id = {
    "strength":0,
    "just":1,
    "threat":2,
    "weak":3,
    "emph-ground":4,
    "emph-prob":5,
    "neutral":6,
    "unint":7
    }

# Condensation schemes. For every supported level:
#   label_to_condensed_id: raw label name -> condensed class id
#   condensed_id_to_label: condensed class id -> name of the condensed class
_condensation_schemes = {
    "none": {
        "label_to_condensed_id": label_to_id,
        "condensed_id_to_label": {val:key for key, val in label_to_id.items()},
    },
    "medium": {
        "label_to_condensed_id": {
            "strength": 0, # pose
            "just": 0,
            "threat": 1,
            "weak": 2,
            "emph-ground": 3, # emph
            "emph-prob": 3,
            "neutral": 4,
            "unint": 5
        },
        "condensed_id_to_label": {
            0:"pose",
            1:"threat",
            2:"weak",
            3:"emph",
            4:"neutral",
            5:"unint"
        },
    },
    "full": {
        "label_to_condensed_id": {
            "strength": 2, # other
            "just": 2,
            "threat": 2,
            "weak": 0,
            "emph-ground": 2,
            "emph-prob": 2,
            "neutral": 1,
            "unint": 0
        },
        "condensed_id_to_label": {
            0:"weak",
            1:"neutral",
            2:"other",
        },
    },
    "full2": {
        "label_to_condensed_id": {
            "strength": 0, # in_both_positive
            "just": 0,
            "threat": 1, # out_negative
            "weak": 1,
            "emph-ground": 0,
            "emph-prob": 0,
            "neutral": 2, # neutral_unint
            "unint": 2
        },
        "condensed_id_to_label": {
            0:"in_both_positive",
            1:"out_negative",
            2:"neutral_unint",
        },
    },
}

# Fail right here on a misspelled level instead of printing a message and
# leaving the dictionaries below undefined for all following cells.
if label_condensation not in _condensation_schemes:
    raise ValueError(
        f"unknown condensation level {label_condensation!r}; "
        f"expected one of {sorted(_condensation_schemes)}"
    )

label_to_condensed_id = \
    _condensation_schemes[label_condensation]["label_to_condensed_id"]
condensed_id_to_label = \
    _condensation_schemes[label_condensation]["condensed_id_to_label"]
# raw label id -> condensed class id, derived from the two name-based
# dictionaries so that it cannot drift out of sync with them
id_to_condensed_id = {
    label_to_id[label]: condensed_id
    for label, condensed_id in label_to_condensed_id.items()
}

# V1: dataset with confident labels

Note: the dimension "group" used to be called "topic". This is why some of the training data still uses "topic" as the column name.

## Load confident human labels

In [None]:
def add_topic_values(df):
    """Fill missing [GOAL] labels from the [TOPIC] column.

    A row without a "[GOAL]" entry takes over its "[TOPIC]" entry if (and
    only if) that entry is "neutral" or "unint". Rows that still have no
    "[GOAL]" entry afterwards are dropped, and so is the "[TOPIC]" column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns "[GOAL]" and "[TOPIC]".

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unmodified.
    """
    # only "neutral" and "unint" may be carried over from [TOPIC]
    topic_fallback = df["[TOPIC]"].where(
        df["[TOPIC]"].isin(["neutral", "unint"])
    )
    # keep existing [GOAL] entries, fill the gaps from the fallback
    goal = df["[GOAL]"].where(df["[GOAL]"].notna(), topic_fallback)
    filled = df.assign(**{"[GOAL]": goal})
    return filled.dropna(subset=["[GOAL]"]).drop(columns=["[TOPIC]"])

In [None]:
# Collect the "confident" examples: tweets that were labelled by both
# annotators of a pair and for which both condensed labels agree.
src = "../../../../data/labelled_samples_with_ids"
dimension = "[GOAL]"
fname = "confident_examples_goal{}"\
    .format(condensation_dataset_names[label_condensation])
cols = ["tweet_id", "text", dimension, "[TOPIC]"]

agreeing_examples = []
for pair in dpr.label_pairs:
    annotator_labels = []
    for annotator, label_col in zip(pair, ["label_1", "label_2"]):
        labels = pd.read_csv(
            Path(src, annotator + ".csv"),
            dtype={"tweet_id":str},
            delimiter=";",
            usecols=cols
        )
        labels = add_topic_values(labels).rename(columns={dimension:label_col})
        # .copy() so that the assignment below works on an independent frame
        labels = labels[labels[label_col] != "foreign"].copy()
        labels[label_col] = labels[label_col].replace(label_to_condensed_id)
        annotator_labels.append(labels)
    df1, df2 = annotator_labels

    # Join the two annotators on tweet_id. Aligning them by row position
    # would shift every following row as soon as one file holds a duplicate id.
    df = df1\
        .merge(df2[["tweet_id", "label_2"]], on="tweet_id", how="inner")\
        .sort_values(by="tweet_id")
    df = df[df["label_1"] == df["label_2"]]
    agreeing_examples.append(
        df[["tweet_id", "text", "label_1"]].rename(columns={"label_1":"label"})
    )

# one concat at the end instead of growing a frame inside the loop
confident_examples = pd.concat(agreeing_examples, ignore_index=True)
confident_examples["label"] = confident_examples["label"].astype(int)
confident_examples = dpr.clean_text(confident_examples)

In [None]:
# add additional minority class labels drawn from data sets with only a single
# label to be labelled with a second label
src = "../../../../data/labelled_samples_with_ids"
fname = "goal_minority_examples_AH.csv"
second_labels = pd.read_csv(
    Path(src, fname),
    dtype={"tweet_id":str},
    delimiter=";"
).dropna()
second_labels = second_labels.drop(columns="[TOPIC]")
second_labels.columns = ["tweet_id", "text", "label_2"]
second_labels["label_2"] = \
    second_labels["label_2"].replace(label_to_condensed_id)

# load all data with only a single label
dimension = "[GOAL]"
fname = "confident_examples_goal{}"\
    .format(condensation_dataset_names[label_condensation])
cols = ["tweet_id", "text", dimension, "[TOPIC]"]
first_label_parts = []
for pair in dpr.label_pairs:
    tmp = pd.read_csv(
        Path(src, pair[0] + ".csv"),
        dtype={"tweet_id":str},
        delimiter=";",
        usecols=cols
    )
    tmp = add_topic_values(tmp).rename(columns={dimension:"label_1"})
    # .copy() so that the assignment below works on an independent frame
    tmp = tmp[tmp["label_1"] != "foreign"].copy()
    tmp["label_1"] = tmp["label_1"].replace(label_to_condensed_id)
    first_label_parts.append(tmp)
first_labels = pd.concat(first_label_parts)

# create a subset of examples that now has two labels and look for confident
# examples where both labels agree. The frames are joined on tweet_id;
# aligning them by row position would break on any duplicated id.
df = first_labels\
    .merge(second_labels[["tweet_id", "label_2"]], on="tweet_id", how="inner")\
    .sort_values(by="tweet_id")
df = df[df["label_1"] == df["label_2"]]
df = df[["tweet_id", "text", "label_1"]].rename(columns={"label_1":"label"})

# add the new confident examples to the existing ones
# NOTE: re-running this cell appends the same examples a second time
confident_examples = pd.concat([confident_examples, df])
confident_examples = confident_examples.reset_index(drop=True)
confident_examples["label"] = confident_examples["label"].astype(int)
confident_examples = dpr.clean_text(confident_examples)
confident_examples.to_csv(Path(dst, fname + ".csv"), index=False, sep=";")

In [None]:
# number of confident examples collected so far
len(confident_examples)

In [None]:
# Number of confident examples per condensed class. Meaning of the class ids
# for the different condensation levels:

# full
#        0:"weak"
#        1:"neutral"
#        2:"other"

# full 2
#        0:"in_both_positive",
#        1:"out_negative",
#        2:"neutral_unint",

# medium
#        0:"pose",
#        1:"threat",
#        2:"weak",
#        3:"emph",
#        4:"neutral",
#        5:"unint"
confident_examples["label"].value_counts()

## Create training splits

In [None]:
# Build five stratified train/test/eval splits (70% / 15% / 15%) of the
# confident examples and store every part, plus the full data set, as CSV.
fname = "confident_examples_goal{}.csv"\
    .format(condensation_dataset_names[label_condensation])
data = pd.read_csv(
    Path(dst, fname),
    delimiter=";",
    dtype={"tweet_id":str, "label":int, "text":str}
)
test_frac = 0.15
eval_frac = 0.15
tmp_fname = "confident_examples_goal{}"\
    .format(condensation_dataset_names[label_condensation])

for s, seed in enumerate([42, 43, 44, 45, 46]):
    # split off the eval data first
    eval_splitter = StratifiedShuffleSplit(
        n_splits=1,
        test_size=eval_frac,
        random_state=seed
    )
    tmp_index, eval_index = next(
        eval_splitter.split(data["text"], data["label"])
    )
    tmp = data.loc[tmp_index, ["text", "label"]].reset_index(drop=True)
    evaldata = data.loc[eval_index, ["text", "label"]]

    # split the rest into train and test data; the test fraction is rescaled
    # because it now refers to the data that is left after removing eval
    test_splitter = StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_frac / (1 - eval_frac),
        random_state=s + 10
    )
    train_index, test_index = next(
        test_splitter.split(tmp["text"], tmp["label"])
    )
    traindata = tmp.loc[train_index, ["text", "label"]]
    testdata = tmp.loc[test_index, ["text", "label"]]

    split_parts = [("train", traindata), ("test", testdata), ("eval", evaldata)]
    for part_name, part in split_parts:
        part.to_csv(
            Path(dst, tmp_fname + f"_{part_name}_{s+1}.csv"),
            index=False,
            sep=";"
        )
data.to_csv(Path(dst, tmp_fname + "_full.csv"), index=False, sep=";")

In [None]:
! rsync -avze ssh ../../../data/traindata/confident_examples_goal* jlasse@nvcluster:/home/jlasse/GermanHass/goal_analysis/data_preparation/

# V2: augmented minority class examples

In [None]:
# Number of confident examples per condensed class, to see which classes are
# in the minority. Meaning of the class ids per condensation level:

# full
#        0:"weak"
#        1:"neutral"
#        2:"other"

# full 2
#        0:"in_both_positive",
#        1:"out_negative",
#        2:"neutral_unint",

# medium
#        0:"pose",
#        1:"threat",
#        2:"weak",
#        3:"emph",
#        4:"neutral",
#        5:"unint"
confident_examples["label"].value_counts()

In [None]:
# NOTE(review): the class ids [0, 2] are hard-coded and match the ids that are
# augmented for label_condensation == "full2" further down - adapt them when
# running with another condensation level.
translations = dpr.select_translations(confident_examples, [0, 2])

In [None]:
# number of selected translations per condensed class
translations["label"].value_counts()

## Add minority example translations

In [None]:
# Augment the confident examples with translations of the minority classes.
cols = ["tweet_id", "text", "label"]
# condensed class ids that receive translated examples, per condensation level
augmented_labels = {
    "medium": [0, 1, 3, 4, 5],  # pose, threat, emph, neutral, unint
    "full": [1, 2],             # neutral, other
    "full2": [0, 2],            # in_both_positive, neutral_unint
}
# Stop with a clear message instead of continuing with an undefined (or stale)
# new_confident_examples when no augmentation exists for the chosen level.
if label_condensation not in augmented_labels:
    raise NotImplementedError(
        "no translation augmentation is defined for "
        f"label_condensation={label_condensation!r}"
    )

selected_labels = augmented_labels[label_condensation]
translations = dpr.select_translations(confident_examples, selected_labels)
new_confident_examples = pd.concat(
    [confident_examples]
    + [
        translations.loc[translations["label"] == label_id, cols]
        for label_id in selected_labels
    ]
)

new_confident_examples = new_confident_examples.reset_index(drop=True)
# shuffle with a fixed seed so that the stored data set is reproducible
new_confident_examples = new_confident_examples.sample(
    frac=1,
    replace=False,
    random_state=42
)
new_confident_examples["label"].value_counts()

In [None]:
# store the augmented data set
dataset_suffix = condensation_dataset_names[label_condensation]
fname = "confident_examples_goal_aug-trans{}.csv".format(dataset_suffix)
new_confident_examples.to_csv(Path(dst, fname), sep=";", index=False)

## Create training splits

In [None]:
# Build five stratified train/test/eval splits (70% / 15% / 15%) of the
# translation-augmented examples and store every part, plus the full data
# set, as CSV.
fname = "confident_examples_goal_aug-trans{}.csv"\
    .format(condensation_dataset_names[label_condensation])
data = pd.read_csv(
    Path(dst, fname),
    dtype={"tweet_id":str, "label":int, "text":str},
    delimiter=";"
).dropna().reset_index(drop=True)
test_frac = 0.15
eval_frac = 0.15
tmp_fname = "confident_examples_goal_aug-trans{}"\
    .format(condensation_dataset_names[label_condensation])

for s, seed in enumerate([42, 43, 44, 45, 46]):
    # get the eval data
    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=eval_frac,
        random_state=seed
    )
    tmp_index, eval_index = next(sss.split(data["text"], data["label"]))
    tmp = data.loc[tmp_index, ["text", "label"]].reset_index(drop=True)
    evaldata = data.loc[eval_index, ["text", "label"]]

    # get the test data; the test fraction is rescaled because it now refers
    # to the data that is left after removing the eval data
    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_frac / (1 - eval_frac),
        random_state=s + 10
    )
    train_index, test_index = next(sss.split(tmp["text"], tmp["label"]))
    traindata = tmp.loc[train_index, ["text", "label"]]
    testdata = tmp.loc[test_index, ["text", "label"]]

    traindata.to_csv(Path(dst, tmp_fname + f"_train_{s+1}.csv"), index=False, sep=";")
    testdata.to_csv(Path(dst, tmp_fname + f"_test_{s+1}.csv"), index=False, sep=";")
    # the eval split goes to dst like the other two parts
    evaldata.to_csv(Path(dst, tmp_fname + f"_eval_{s+1}.csv"), index=False, sep=";")
data.to_csv(Path(dst, tmp_fname + "_full.csv"), index=False, sep=";")

In [None]:
! rsync -avze ssh ../../../data/traindata/confident_examples_goal_aug-trans* jlasse@nvcluster:/home/jlasse/GermanHass/goal_analysis/data_preparation/

In [None]:
# class distribution of the train split created last
traindata["label"].value_counts()

# Create a new minority class data set for labelling

**Important**: this code needs to run with `label_condensation = "medium"` (the level whose data sets carry the `_halfcondensed` suffix).

## Load remaining human annotated examples

In [None]:
# Load the labels of the first annotator of every pair and map them to the
# condensed class ids.
src = "../../../../data/labelled_samples_with_ids"
dimension = "[GOAL]"
cols = ["tweet_id", "text", dimension, "[TOPIC]"]
labelled_parts = []
for pair in dpr.label_pairs:
    df = pd.read_csv(
        Path(src, pair[0] + ".csv"),
        dtype={"tweet_id":str},
        delimiter=";",
        usecols=cols
    )
    df = add_topic_values(df).rename(columns={dimension:"label_1"})
    # .copy() so that the assignment below works on an independent frame
    df = df[df["label_1"] != "foreign"].dropna(subset=["label_1"]).copy()
    df["label_1"] = df["label_1"]\
        .map(lambda label: label.strip(" "))\
        .replace(label_to_condensed_id)
    labelled_parts.append(df)
# one concat at the end instead of growing a frame inside the loop
labelled = pd.concat(labelled_parts, ignore_index=True)
labelled["label_1"] = labelled["label_1"].astype(int)
labelled = labelled.rename(columns={"label_1":"label"})

In [None]:
# labelled examples whose tweet_id is not among the confident examples yet
remaining_examples = labelled[~labelled["tweet_id"].isin(confident_examples["tweet_id"])].copy()

In [None]:
# number of labelled examples that are not confident examples yet
len(remaining_examples)

## Create dataset

In [None]:
#        0:"pose", -> minority
#        1:"threat", -> minority
#        2:"weak",
#        3:"emph", -> minority
#        4:"neutral",
#        5:"unint"

In [None]:
# draw 300 remaining examples (fixed seed) from each of the minority classes
# 0 = pose, 1 = threat and 3 = emph
pose, threat, emph = (
    remaining_examples[remaining_examples["label"] == label_id]
    .sample(n=300, random_state=42)
    for label_id in (0, 1, 3)
)

In [None]:
# combine the three minority class samples and shuffle them with a fixed seed
data_for_labelling = pd.concat([pose, threat, emph]).sample(frac=1, random_state=42)

In [None]:
# Store only id and text of the selected examples for manual labelling.
# A dedicated variable holds this output directory: re-assigning the global
# `dst` would make the following sections read and write their training data
# in the additional_samples folder instead of the traindata folder.
labelling_dst = "../../../../data/additional_samples"
fname = "goal_minority_examples.csv"
data_for_labelling[["tweet_id", "text"]].to_csv(
    Path(labelling_dst, fname),
    sep=";",
    index=False
)

# V3: human + inferred labels (round 1)

## Load existing confident examples

In [None]:
# reload the translation-augmented confident examples from disk
dataset_suffix = condensation_dataset_names[label_condensation]
fname = "confident_examples_goal_aug-trans{}.csv".format(dataset_suffix)
confident_examples = pd.read_csv(
    Path(dst, fname),
    delimiter=";",
    dtype={"tweet_id":str, "label":int, "text":str}
)

## Load remaining human annotated examples

In [None]:
# Load the labels of the first annotator of every pair, map them to the
# condensed class ids and clean the tweet texts.
src = "../../../../data/labelled_samples_with_ids"
dimension = "[GOAL]"
cols = ["tweet_id", "text", dimension, "[TOPIC]"]
labelled_parts = []
for pair in dpr.label_pairs:
    df = pd.read_csv(
        Path(src, pair[0] + ".csv"),
        dtype={"tweet_id":str},
        delimiter=";",
        usecols=cols
    )
    df = add_topic_values(df).rename(columns={dimension:"label_1"})
    # .copy() so that the assignment below works on an independent frame
    df = df[df["label_1"] != "foreign"].dropna(subset=["label_1"]).copy()
    df["label_1"] = df["label_1"]\
        .map(lambda label: label.strip(" "))\
        .replace(label_to_condensed_id)
    labelled_parts.append(df)
# one concat at the end instead of growing a frame inside the loop
labelled = pd.concat(labelled_parts, ignore_index=True)
labelled["label_1"] = labelled["label_1"].astype(int)
labelled = dpr.clean_text(labelled)

In [None]:
# labelled examples whose tweet_id is not among the confident examples yet
remaining_examples = labelled[~labelled["tweet_id"].isin(confident_examples["tweet_id"])].copy()

In [None]:
# number of labelled examples that are not confident examples yet
len(remaining_examples)

In [None]:
# Store the remaining examples as input for the label inference.
# The file name carries the "_goal" infix: both the rsync upload and the
# inference command in the following cells refer to "remaining_examples_goal*".
fname = "remaining_examples_goal{}_1.csv"\
    .format(condensation_dataset_names[label_condensation])
remaining_examples.rename(columns={"label_1":"label"}).to_csv(
    Path(dst, fname),
    index=False,
    sep=";"
)

In [None]:
! rsync -avze ssh remaining_examples_goal* jlasse@nvcluster:/home/jlasse/GermanHass/goal_analysis/data_preparation/

## Load inferred labels

In [None]:
# python3 infer_goal.py ../best_models/model-twitter-xlm-roberta-base_germanhass_epochs-100_batchsize-64_data-confident_examples_goal_aug-trans_condensed2_split-3 ../data_preparation/remaining_examples_goal_condensed2_1.csv full

In [None]:
# download inferred data
! rsync -avze ssh jlasse@nvcluster:/home/jlasse/GermanHass/goal_analysis/inference/inferred_goal* ../../../data/inference/

In [None]:
# NOTE: manually change condensed2 -> condensed!!
# load the inferred labels of round 1; the "goal" column becomes "label_2"
dataset_suffix = condensation_dataset_names[label_condensation]
fname = "inferred_goal_condensed_remaining_examples{}_1.csv"\
    .format(dataset_suffix)
inferred_labels = pd.read_csv(
    Path("../../../data/inference", fname),
    usecols=["tweet_id", "goal"],
    dtype={"tweet_id":str, "goal":int},
    delimiter=";"
)
inferred_labels = inferred_labels.rename(columns={"goal":"label_2"})

## Determine label agreement

In [None]:
# attach the inferred label (label_2) to every remaining example; examples
# without an inferred label keep a missing value
remaining_examples = remaining_examples.merge(
    inferred_labels,
    on="tweet_id",
    how="left"
)

In [None]:
# drop the examples without an inferred label and store label_2 as integer
remaining_examples = (
    remaining_examples
    .dropna(subset=["label_2"])
    .astype({"label_2": int})
)

In [None]:
# examples for which the human label (label_1) and the inferred label
# (label_2) agree become new confident examples
labels_agree = remaining_examples["label_1"] == remaining_examples["label_2"]
new_confident_examples = remaining_examples\
    .loc[labels_agree, ["tweet_id", "text", "label_1"]]\
    .rename(columns={"label_1":"label"})

## Add augmented minority class examples

In [None]:
# class distribution of the existing confident examples
confident_examples["label"].value_counts()

In [None]:
# class distribution of the new confident examples
new_confident_examples["label"].value_counts()

In [None]:
# Combine the existing confident examples with the new confident examples
# (subsampled where a class is over-represented) and with translations of the
# new examples from the minority classes.
cols = ["tweet_id", "text", "label"]
if label_condensation == "medium":
    # since we have an overabundance of examples from the "weak" category,
    # we subsample the new confident examples for "weak"
    new_confident_examples_weak = new_confident_examples[\
        new_confident_examples["label"] == 2].sample(n=492+444, random_state=42)
    new_confident_examples_rest = new_confident_examples[\
        new_confident_examples["label"] != 2]

    # translations for pose, threat, emph, neutral and unint
    translated_labels = [0, 1, 3, 4, 5]
    translations = dpr.select_translations(new_confident_examples, translated_labels)

    new_confident_examples = pd.concat(
        [
            confident_examples,
            new_confident_examples_weak,
            new_confident_examples_rest,
        ]
        + [
            translations.loc[translations["label"] == label_id, cols]
            for label_id in translated_labels
        ]
    )
elif label_condensation == "full":
    # class ids: 0 = weak, 1 = neutral, 2 = other
    new_confident_examples_weak = new_confident_examples[\
        new_confident_examples["label"] == 0].sample(n=288, random_state=42)
    new_confident_examples_neutral = new_confident_examples[\
        new_confident_examples["label"] == 1]
    new_confident_examples_other = new_confident_examples[\
        new_confident_examples["label"] == 2].sample(n=256, random_state=42)

    translations_neutral = dpr.select_translations(new_confident_examples, [1])
    new_confident_examples = pd.concat([
        confident_examples,
        new_confident_examples_other,
        new_confident_examples_neutral,
        new_confident_examples_weak,
        translations_neutral[cols],
    ])
elif label_condensation == "full2":
    # class ids: 0 = in_both_positive, 1 = out_negative, 2 = neutral_unint
    new_confident_examples_neutral_unint = new_confident_examples[\
        new_confident_examples["label"] == 2].sample(n=324, random_state=42)
    new_confident_examples_out_negative = new_confident_examples[\
        new_confident_examples["label"] == 1].sample(n=187, random_state=42)
    new_confident_examples_in_both_positive = new_confident_examples[\
        new_confident_examples["label"] == 0]
    translations_in_both_positive = dpr.select_translations(new_confident_examples, [0])
    new_confident_examples = pd.concat([
        confident_examples,
        new_confident_examples_in_both_positive,
        new_confident_examples_out_negative,
        new_confident_examples_neutral_unint,
        translations_in_both_positive[cols],
    ])
else:
    # Continuing here would store a data set that lacks the existing confident
    # examples, so stop with a clear message instead.
    raise NotImplementedError(
        "no augmentation is defined for "
        f"label_condensation={label_condensation!r}"
    )
new_confident_examples = new_confident_examples.reset_index(drop=True)
# shuffle with a fixed seed so that the stored data set is reproducible
new_confident_examples = new_confident_examples.sample(
    frac=1,
    replace=False,
    random_state=42
)
new_confident_examples["label"].value_counts()

In [None]:
# store the data set extended by the inferred labels of round 1
dataset_suffix = condensation_dataset_names[label_condensation]
fname = "confident_examples_goal_aug-trans-inferred{}.csv"\
    .format(dataset_suffix)
new_confident_examples.to_csv(Path(dst, fname), sep=";", index=False)

## Create training splits

In [None]:
# Build five stratified train/test/eval splits (70% / 15% / 15%) of the data
# set with inferred labels (round 1) and store every part, plus the full data
# set, as CSV.
fname = "confident_examples_goal_aug-trans-inferred{}.csv"\
    .format(condensation_dataset_names[label_condensation])
data = pd.read_csv(
    Path(dst, fname),
    delimiter=";",
    dtype={"tweet_id":str, "label":int, "text":str}
).dropna().reset_index(drop=True)
test_frac = 0.15
eval_frac = 0.15
tmp_fname = "confident_examples_goal_aug-trans-inferred{}"\
    .format(condensation_dataset_names[label_condensation])

for s, seed in enumerate([42, 43, 44, 45, 46]):
    # split off the eval data first
    eval_splitter = StratifiedShuffleSplit(
        n_splits=1,
        test_size=eval_frac,
        random_state=seed
    )
    tmp_index, eval_index = next(
        eval_splitter.split(data["text"], data["label"])
    )
    tmp = data.loc[tmp_index, ["text", "label"]].reset_index(drop=True)
    evaldata = data.loc[eval_index, ["text", "label"]]

    # split the rest into train and test data; the test fraction is rescaled
    # because it now refers to the data that is left after removing eval
    test_splitter = StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_frac / (1 - eval_frac),
        random_state=s + 10
    )
    train_index, test_index = next(
        test_splitter.split(tmp["text"], tmp["label"])
    )
    traindata = tmp.loc[train_index, ["text", "label"]]
    testdata = tmp.loc[test_index, ["text", "label"]]

    split_parts = [("train", traindata), ("test", testdata), ("eval", evaldata)]
    for part_name, part in split_parts:
        part.to_csv(
            Path(dst, tmp_fname + f"_{part_name}_{s+1}.csv"),
            index=False,
            sep=";"
        )
data.to_csv(Path(dst, tmp_fname + "_full.csv"), index=False, sep=";")

In [None]:
! rsync -avze ssh ../../../data/traindata/confident_examples_goal_aug-trans-inferred* jlasse@nvcluster:/home/jlasse/GermanHass/goal_analysis/data_preparation/

# V4: human + inferred labels (round 2)

## Load existing confident examples

In [None]:
# Reload the confident examples of round 1. The file is read from dst, the
# directory it was written to, not from the current working directory.
fname = "confident_examples_goal_aug-trans-inferred{}.csv"\
    .format(condensation_dataset_names[label_condensation])
confident_examples = pd.read_csv(
    Path(dst, fname),
    dtype={"tweet_id":str, "label":int, "text":str},
    delimiter=";"
)

## Load remaining human annotated examples

In [None]:
# Load the labels of the first annotator of every pair, map them to the
# condensed class ids and clean the tweet texts.
src = "../../../../data/labelled_samples_with_ids"
dimension = "[GOAL]"
cols = ["tweet_id", "text", dimension, "[TOPIC]"]
labelled_parts = []
for pair in dpr.label_pairs:
    df = pd.read_csv(
        Path(src, pair[0] + ".csv"),
        dtype={"tweet_id":str},
        delimiter=";",
        usecols=cols
    )
    df = add_topic_values(df).rename(columns={dimension:"label_1"})
    # .copy() so that the assignment below works on an independent frame
    df = df[df["label_1"] != "foreign"].dropna(subset=["label_1"]).copy()
    df["label_1"] = df["label_1"]\
        .map(lambda label: label.strip(" "))\
        .replace(label_to_condensed_id)
    labelled_parts.append(df)
# one concat at the end instead of growing a frame inside the loop
labelled = pd.concat(labelled_parts, ignore_index=True)
labelled["label_1"] = labelled["label_1"].astype(int)
labelled = dpr.clean_text(labelled)

In [None]:
# labelled examples whose tweet_id is not among the confident examples yet
remaining_examples = labelled[~labelled["tweet_id"].isin(confident_examples["tweet_id"])].copy()

In [None]:
# number of labelled examples that are not confident examples yet
len(remaining_examples)

In [None]:
# store the remaining examples as input for the second inference round
dataset_suffix = condensation_dataset_names[label_condensation]
fname = "remaining_examples_goal{}_2.csv".format(dataset_suffix)
remaining_for_inference = remaining_examples.rename(columns={"label_1":"label"})
remaining_for_inference.to_csv(Path(dst, fname), sep=";", index=False)

In [None]:
! rsync -avze ssh remaining_examples_goal* jlasse@nvcluster:/home/jlasse/GermanHass/goal_analysis/data_preparation/

## Load inferred labels

In [None]:
# python3 infer_goal.py ../best_models/model-twitter-xlm-roberta-base_germanhass_epochs-100_batchsize-64_data-confident_examples_goal_aug-trans-inferred_condensed_split-4 ../data_preparation/remaining_examples_goal_condensed_2.csv full

In [None]:
# download inferred data
! rsync -avze ssh jlasse@nvcluster:/home/jlasse/GermanHass/goal_analysis/inference/inferred_goal* ../../../data/inference/

In [None]:
# load the inferred labels of round 2; the "goal" column becomes "label_2"
dataset_suffix = condensation_dataset_names[label_condensation]
fname = "inferred_goal{}_remaining_examples{}_2.csv"\
    .format(dataset_suffix, dataset_suffix)
inferred_labels = pd.read_csv(
    Path("../../../data/inference", fname),
    usecols=["tweet_id", "goal"],
    dtype={"tweet_id":str, "goal":int},
    delimiter=";"
)
inferred_labels = inferred_labels.rename(columns={"goal":"label_2"})

## Determine label agreement

In [None]:
# attach the inferred label (label_2) to every remaining example; examples
# without an inferred label keep a missing value
remaining_examples = remaining_examples.merge(
    inferred_labels,
    on="tweet_id",
    how="left"
)

In [None]:
# drop the examples without an inferred label and store label_2 as integer
remaining_examples = (
    remaining_examples
    .dropna(subset=["label_2"])
    .astype({"label_2": int})
)

In [None]:
# examples for which the human label (label_1) and the inferred label
# (label_2) agree become new confident examples
labels_agree = remaining_examples["label_1"] == remaining_examples["label_2"]
new_confident_examples = remaining_examples\
    .loc[labels_agree, ["tweet_id", "text", "label_1"]]\
    .rename(columns={"label_1":"label"})
new_confident_examples["label"].value_counts()

## Add augmented minority class examples

In [None]:
# class distribution of the existing confident examples
confident_examples["label"].value_counts()

In [None]:
# NOTE(review): the class id [1] is hard-coded here; the augmentation cell
# below selects its own translations per condensation level.
translations = dpr.select_translations(new_confident_examples, [1])
translations["label"].value_counts()

In [None]:
# Combine the existing confident examples with the new confident examples
# (subsampled where a class is over-represented) and with translations of the
# new examples from the minority classes.
cols = ["tweet_id", "text", "label"]
if label_condensation == "medium":
    # since we have an overabundance of examples from the "weak" category,
    # we subsample the new confident examples for "weak"
    new_confident_examples_weak = new_confident_examples[\
        new_confident_examples["label"] == 2].sample(n=223+217, random_state=42)
    new_confident_examples_rest = new_confident_examples[\
        new_confident_examples["label"] != 2]

    # translations for pose, threat, emph, neutral and unint
    translated_labels = [0, 1, 3, 4, 5]
    translations = dpr.select_translations(new_confident_examples, translated_labels)

    new_confident_examples = pd.concat(
        [
            confident_examples,
            new_confident_examples_weak,
            new_confident_examples_rest,
        ]
        + [
            translations.loc[translations["label"] == label_id, cols]
            for label_id in translated_labels
        ]
    )
elif label_condensation == "full":
    # class ids: 0 = weak, 1 = neutral, 2 = other
    translations_neutral = dpr.select_translations(new_confident_examples, [1])
    new_confident_examples_weak = new_confident_examples[\
        new_confident_examples["label"] == 0].sample(n=287, random_state=42)
    new_confident_examples_neutral = new_confident_examples[\
        new_confident_examples["label"] == 1]
    new_confident_examples_other = new_confident_examples[\
        new_confident_examples["label"] == 2].sample(n=287, random_state=42)
    new_confident_examples = pd.concat([
        confident_examples,
        new_confident_examples_weak,
        new_confident_examples_neutral,
        new_confident_examples_other,
        translations_neutral[cols],
    ])
elif label_condensation == "full2":
    # class ids: 0 = in_both_positive, 1 = out_negative, 2 = neutral_unint
    translations_in_both_positive = dpr.select_translations(new_confident_examples, [0])
    new_confident_examples_in_both_positive = new_confident_examples[\
        new_confident_examples["label"] == 0]
    new_confident_examples_out_negative = new_confident_examples[\
        new_confident_examples["label"] == 1].sample(n=172+157, random_state=42)
    new_confident_examples_neutral_unint = new_confident_examples[\
        new_confident_examples["label"] == 2].sample(n=172+157, random_state=42)
    new_confident_examples = pd.concat([
        confident_examples,
        new_confident_examples_in_both_positive,
        new_confident_examples_out_negative,
        new_confident_examples_neutral_unint,
        translations_in_both_positive[cols],
    ])
else:
    # Continuing here would store a data set that lacks the existing confident
    # examples, so stop with a clear message instead.
    raise NotImplementedError(
        "no augmentation is defined for "
        f"label_condensation={label_condensation!r}"
    )
new_confident_examples = new_confident_examples.reset_index(drop=True)
# shuffle with a fixed seed so that the stored data set is reproducible
new_confident_examples = new_confident_examples.sample(
    frac=1,
    replace=False,
    random_state=42
)
new_confident_examples["label"].value_counts()

In [None]:
# store the data set extended by the inferred labels of round 2
dataset_suffix = condensation_dataset_names[label_condensation]
fname = "confident_examples_goal_aug-trans-inferred2{}.csv"\
    .format(dataset_suffix)
new_confident_examples.to_csv(Path(dst, fname), sep=";", index=False)

## Create training splits

In [None]:
# Build five stratified train/test/eval splits (70% / 15% / 15%) of the data
# set with inferred labels (round 2) and store every part, plus the full data
# set, as CSV.
fname = "confident_examples_goal_aug-trans-inferred2{}.csv"\
    .format(condensation_dataset_names[label_condensation])
data = pd.read_csv(
    Path(dst, fname),
    delimiter=";",
    dtype={"tweet_id":str, "label":int, "text":str}
)
test_frac = 0.15
eval_frac = 0.15
tmp_fname = "confident_examples_goal_aug-trans-inferred2{}"\
    .format(condensation_dataset_names[label_condensation])

for s, seed in enumerate([42, 43, 44, 45, 46]):
    # split off the eval data first
    eval_splitter = StratifiedShuffleSplit(
        n_splits=1,
        test_size=eval_frac,
        random_state=seed
    )
    tmp_index, eval_index = next(
        eval_splitter.split(data["text"], data["label"])
    )
    tmp = data.loc[tmp_index, ["text", "label"]].reset_index(drop=True)
    evaldata = data.loc[eval_index, ["text", "label"]]

    # split the rest into train and test data; the test fraction is rescaled
    # because it now refers to the data that is left after removing eval
    test_splitter = StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_frac / (1 - eval_frac),
        random_state=s + 10
    )
    train_index, test_index = next(
        test_splitter.split(tmp["text"], tmp["label"])
    )
    traindata = tmp.loc[train_index, ["text", "label"]]
    testdata = tmp.loc[test_index, ["text", "label"]]

    split_parts = [("train", traindata), ("test", testdata), ("eval", evaldata)]
    for part_name, part in split_parts:
        part.to_csv(
            Path(dst, tmp_fname + f"_{part_name}_{s+1}.csv"),
            index=False,
            sep=";"
        )
data.to_csv(Path(dst, tmp_fname + "_full.csv"), index=False, sep=";")

In [None]:
! rsync -avze ssh ../../../data/traindata/confident_examples_goal_aug-trans-inferred* jlasse@nvcluster:/home/jlasse/GermanHass/goal_analysis/data_preparation/