In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import StratifiedShuffleSplit

import data_preparation_resources as dpr

# Label condensation stages

In [None]:
# Output directory (relative to the notebook's working directory) into which
# the generated datasets and their train/test/eval splits are written.
dst = "../../../data/traindata"

In [None]:
# Label-condensation scheme used for every dataset built below
# ("medium" or "full").
label_condensation = "full"

# File-name suffix that identifies each condensation scheme on disk.
condensation_dataset_names = dict(
    medium="_halfcondensed",
    full="_condensed",
)

In [None]:
# Full strategy label vocabulary after removing "foreign"; a label's position
# in the tuple is its integer id.
label_to_id = {
    name: idx
    for idx, name in enumerate((
        "info",
        "opin",
        "quest",
        "conseq",
        "correct",
        "inconsist",
        "sarc",
        "insult-pers",
        "insult-ism",
        "insult-polit",
        "insult-inst",
        "other",
        "unint",
    ))
}
# Reverse lookup: integer id -> label name.
id_to_label = dict(zip(label_to_id.values(), label_to_id.keys()))

# Condensed label dictionaries.
#
# Per condensation level only two tables are written out by hand:
#   condensed_id_to_label     condensed id    -> condensed label name
#   label_to_condensed_label  original label  -> condensed label name
# The other three lookups are derived from these two (plus label_to_id), so
# the five dictionaries cannot drift apart through a typo in one of them.
if label_condensation == "medium":
    condensed_id_to_label = {
        0: "info",
        1: "opin",
        2: "construct",
        3: "inconsist",
        4: "sarc",
        5: "insult",
        6: "other_new",
    }

    label_to_condensed_label = {
        "info": "info",
        "opin": "opin",
        "quest": "construct",
        "conseq": "construct",
        "correct": "construct",
        "inconsist": "inconsist",
        "sarc": "sarc",
        "insult-pers": "insult",
        "insult-ism": "insult",
        "insult-polit": "insult",
        "insult-inst": "insult",
        "other": "other_new",
        "unint": "other_new",
    }

elif label_condensation == "full":
    condensed_id_to_label = {
        0: "construct",
        1: "opin",
        2: "sarc",
        3: "leave_fact",
        4: "other_new",
    }

    label_to_condensed_label = {
        "info": "construct",
        "opin": "opin",
        "quest": "construct",
        "conseq": "construct",
        "correct": "construct",
        "inconsist": "construct",
        "sarc": "sarc",
        "insult-pers": "leave_fact",
        "insult-ism": "leave_fact",
        "insult-polit": "leave_fact",
        "insult-inst": "leave_fact",
        "other": "other_new",
        "unint": "other_new",
    }

else:
    # Fail loudly: merely printing would leave the dictionaries undefined (or,
    # in a live kernel, silently keep the ones from a previous run).
    raise ValueError("unknown condensation level!")

# condensed label name -> condensed id (inverse of condensed_id_to_label)
condensed_label_to_condensed_id = {
    condensed_label: condensed_id
    for condensed_id, condensed_label in condensed_id_to_label.items()
}

# original label name -> condensed id
label_to_condensed_id = {
    label: condensed_label_to_condensed_id[condensed_label]
    for label, condensed_label in label_to_condensed_label.items()
}

# original label id -> condensed id
id_to_condensed_id = {
    label_to_id[label]: condensed_id
    for label, condensed_id in label_to_condensed_id.items()
}


# Backtranslations

# V1: dataset with confident labels

## Load confident human labels

In [None]:
# Build the V1 dataset: tweets for which both human annotators of a pair
# assigned the same (condensed) strategy label.
src = "../../../data/labelled_samples_with_ids"
dimension = "[STRATEGY]"
fname = "confident_examples_strategy{}"\
    .format(condensation_dataset_names[label_condensation])
cols = ["tweet_id", "text", dimension]


def _load_annotator_labels(annotator, label_column):
    """Read one annotator's CSV from ``src`` and condense its strategy labels.

    annotator: file stem of the annotator's CSV file.
    label_column: name given to the strategy column in the returned frame.

    Returns a DataFrame with the columns tweet_id, text and ``label_column``.
    Rows without a label and rows labelled "foreign" are removed; the
    remaining labels are replaced by their condensed ids.
    """
    annotations = pd.read_csv(
        Path(src, annotator + ".csv"),
        dtype={"tweet_id": str},
        delimiter=";",
        usecols=cols,
    ).rename(columns={dimension: label_column})
    # .copy() so that the column assignments below act on an independent frame
    annotations = annotations.dropna(subset=[label_column]).copy()
    # strip stray blanks (as the later loaders in this notebook do) so that
    # e.g. "opin " is still mapped to its id instead of counting as a mismatch
    annotations[label_column] = annotations[label_column].str.strip(" ")
    annotations = annotations[annotations[label_column] != "foreign"].copy()
    annotations[label_column] = annotations[label_column]\
        .replace(label_to_condensed_id)
    return annotations


agreed_frames = []
for pair in dpr.label_pairs:
    first = _load_annotator_labels(pair[0], "label_1")
    second = _load_annotator_labels(pair[1], "label_2")

    # Pair the two annotations by tweet_id. An inner merge keeps only tweets
    # present in both files and cannot mis-pair rows, whereas pairing by row
    # position requires both files to hold every id exactly once.
    paired = first\
        .merge(second[["tweet_id", "label_2"]], on="tweet_id", how="inner")\
        .sort_values(by="tweet_id")

    agreed = paired.loc[
        paired["label_1"] == paired["label_2"],
        ["tweet_id", "text", "label_1"]
    ].rename(columns={"label_1": "label"})
    agreed_frames.append(agreed)

# concatenate once instead of growing the frame inside the loop
confident_examples = pd.concat(agreed_frames).reset_index(drop=True)
confident_examples["label"] = confident_examples["label"].astype(int)

# remove URLs
confident_examples = dpr.clean_text(confident_examples)

confident_examples.to_csv(Path(dst, fname + ".csv"), index=False, sep=";")

In [None]:
# Class distribution of the confident examples.
# Condensed ids when label_condensation == "full":
#        0:"construct",
#        1:"opin",
#        2:"sarc",
#        3:"leave_fact",
#        4:"other_new",
confident_examples["label"].value_counts()

In [None]:
# total number of confident examples
len(confident_examples)

## Create training splits

In [None]:
# Create five stratified train / test / eval splits of the V1 dataset and
# store them, together with the full dataset, in dst.
fname = "confident_examples_strategy{}.csv"\
    .format(condensation_dataset_names[label_condensation])
data = pd.read_csv(
    Path(dst, fname),
    dtype={"tweet_id":str, "label":int, "text":str},
    delimiter=";"
)
test_frac = 0.15
eval_frac = 0.15

# common stem of all output files
tmp_fname = "confident_examples_strategy{}"\
    .format(condensation_dataset_names[label_condensation])

for s, seed in enumerate([42, 43, 44, 45, 46]):
    # get the eval data
    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=eval_frac,
        random_state=seed
    )
    # n_splits=1, so the generator yields exactly one (rest, eval) index pair
    tmp_index, eval_index = next(sss.split(data["text"], data["label"]))
    tmp = data.loc[tmp_index, ["text", "label"]].reset_index(drop=True)
    evaldata = data.loc[eval_index, ["text", "label"]]

    # get the test data; the fraction is rescaled because it is drawn from
    # the remainder rather than from the full dataset
    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_frac / (1 - eval_frac),
        random_state=s + 10
    )
    train_index, test_index = next(sss.split(tmp["text"], tmp["label"]))
    traindata = tmp.loc[train_index]
    testdata = tmp.loc[test_index]

    # Write the splits into dst, next to the "_full" file. Without dst they
    # end up in the working directory and are not picked up by the rsync
    # upload from ../../../data/traindata/.
    traindata.to_csv(Path(dst, tmp_fname + f"_train_{s+1}.csv"), index=False, sep=";")
    testdata.to_csv(Path(dst, tmp_fname + f"_test_{s+1}.csv"), index=False, sep=";")
    evaldata.to_csv(Path(dst, tmp_fname + f"_eval_{s+1}.csv"), index=False, sep=";")
data.to_csv(Path(dst, tmp_fname + "_full.csv"), index=False, sep=";")

In [None]:
# upload traindata to GPU cluster
# (rsync over ssh: every local file matching confident_examples_strategy* in
# ../../../data/traindata/ is copied to data/traindata/ on nvcluster)
! rsync -avze ssh ../../../data/traindata/confident_examples_strategy* jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/traindata/

# V2: augmented minority class examples

In [None]:
# Class distribution of the confident examples before augmentation.
# Condensed ids when label_condensation == "full":
#        0:"construct",
#        1:"opin",
#        2:"sarc",
#        3:"leave_fact",
#        4:"other_new",
confident_examples["label"].value_counts()

## Add minority example translations

In [None]:
# Select translations of the confident examples for the condensed classes
# 0, 2, 3 and 4, i.e. every class of the "full" scheme except 1 ("opin").
# NOTE(review): what dpr.select_translations returns is not visible here; the
# following cells read its "tweet_id", "text" and "label" columns.
translations = dpr.select_translations(confident_examples, [0, 2, 3, 4])

In [None]:
# number of selected translations per class
translations["label"].value_counts()

In [None]:
# Augment the confident examples with translations of the minority classes.
cols = ["tweet_id", "text", "label"]

if label_condensation == "medium":
    translations = dpr.select_translations(confident_examples, [0, 2, 3, 4, 5, 6])
    translations_info = translations[translations["label"] == 0]
    translations_construct = translations[translations["label"] == 2]
    translations_inconsist = translations[translations["label"] == 3]
    translations_sarc = translations[translations["label"] == 4]
    # classes 5 and 6 are subsampled to a fixed number of translations
    translations_insult = translations[translations["label"] == 5].sample(n=45, random_state=42)
    translations_other_new = translations[translations["label"] == 6].sample(n=140, random_state=42)
    new_confident_examples = pd.concat([
        confident_examples,
        translations_info[cols],
        translations_construct[cols],
        translations_inconsist[cols],
        translations_sarc[cols],
        translations_insult[cols],
        translations_other_new[cols],
    ])
else:
    # uses the `translations` frame selected in the cell further up
    translations_construct = translations[translations["label"] == 0]
    translations_sarc = translations[translations["label"] == 2]
    # classes 3 and 4 are subsampled to a fixed number of translations
    translations_leave_fact = translations[translations["label"] == 3].sample(n=45, random_state=42)
    translations_other_new = translations[translations["label"] == 4].sample(n=140, random_state=42)
    new_confident_examples = pd.concat([
        confident_examples,
        translations_construct[cols],
        translations_sarc[cols],
        translations_leave_fact[cols],
        translations_other_new[cols],
    ])

new_confident_examples = new_confident_examples.reset_index(drop=True)
# Seeded shuffle: the stratified splits created later index rows by position,
# so an unseeded shuffle would make the saved file and every split differ
# from run to run.
new_confident_examples = new_confident_examples.sample(frac=1, replace=False, random_state=42)
new_confident_examples["label"].value_counts()

In [None]:
# Persist the translation-augmented dataset as a semicolon-separated CSV in dst.
fname = "confident_examples_strategy_aug-trans{}.csv".format(
    condensation_dataset_names[label_condensation]
)
new_confident_examples.to_csv(Path(dst) / fname, sep=";", index=False)

## Create training splits

In [None]:
# Five stratified train / test / eval splits of the translation-augmented
# dataset, written to dst together with a copy of the full dataset.
fname = "confident_examples_strategy_aug-trans{}.csv".format(
    condensation_dataset_names[label_condensation]
)
data = pd.read_csv(
    Path(dst) / fname,
    sep=";",
    dtype={"tweet_id": str, "label": int, "text": str},
)
test_frac = 0.15
eval_frac = 0.15
tmp_fname = "confident_examples_strategy_aug-trans{}".format(
    condensation_dataset_names[label_condensation]
)

for s, seed in enumerate([42, 43, 44, 45, 46]):
    # hold out the eval share of the full dataset
    eval_splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=eval_frac, random_state=seed
    )
    rest_idx, eval_idx = next(eval_splitter.split(data["text"], data["label"]))
    tmp = data.loc[rest_idx, ["text", "label"]].reset_index(drop=True)
    evaldata = data.loc[eval_idx, ["text", "label"]]

    # take the test share out of the remainder (fraction rescaled accordingly)
    test_splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=test_frac / (1 - eval_frac), random_state=s + 10
    )
    train_idx, test_idx = next(test_splitter.split(tmp["text"], tmp["label"]))
    traindata = tmp.loc[train_idx, ["text", "label"]]
    testdata = tmp.loc[test_idx, ["text", "label"]]

    # one file per split, numbered from 1
    for part, frame in (("train", traindata), ("test", testdata), ("eval", evaldata)):
        frame.to_csv(Path(dst, tmp_fname + f"_{part}_{s+1}.csv"), index=False, sep=";")
data.to_csv(Path(dst, tmp_fname + "_full.csv"), index=False, sep=";")

In [None]:
! rsync -avze ssh -./../../data/traindata/confident_examples_strategy_aug-trans* jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/traindata/

In [None]:
# class distribution of the most recently created training split
traindata["label"].value_counts()

# V3: human + inferred labels (round 1)

## Load existing confident examples

In [None]:
# Reload the translation-augmented dataset from dst as the current set of
# confident examples.
fname = "confident_examples_strategy_aug-trans{}.csv".format(
    condensation_dataset_names[label_condensation]
)
confident_examples = pd.read_csv(
    Path(dst) / fname,
    sep=";",
    dtype={"tweet_id": str, "label": int, "text": str},
)

## Load remaining human annotated examples

In [None]:
# Load the strategy labels of the first annotator of every pair and map them
# to condensed ids.
# NOTE(review): this path climbs four directory levels, whereas the same
# folder is read from "../../../data/labelled_samples_with_ids" when the
# confident human labels are loaded - confirm which location is correct.
src = "../../../../data/labelled_samples_with_ids"
dimension = "[STRATEGY]"
cols = ["tweet_id", "text", dimension]

labelled_frames = []
for pair in dpr.label_pairs:
    df = pd.read_csv(
        Path(src, pair[0] + ".csv"),
        dtype={"tweet_id":str},
        delimiter=";",
        usecols=cols
    ).rename(columns={dimension:"label_1"})

    # .copy() so that the column assignments below act on an independent frame
    df = df.dropna(subset=["label_1"]).copy()
    # strip blanks first so that a padded "foreign" label is filtered as well
    df["label_1"] = df["label_1"].str.strip(" ")
    df = df[df["label_1"] != "foreign"].copy()
    df["label_1"] = df["label_1"].replace(label_to_condensed_id)
    labelled_frames.append(df)

# concatenate once instead of growing the frame inside the loop
labelled = pd.concat(labelled_frames).reset_index(drop=True)
labelled["label_1"] = labelled["label_1"].astype(int)
labelled = dpr.clean_text(labelled)

In [None]:
# class distribution (condensed ids) of the loaded human labels
labelled["label_1"].value_counts()

In [None]:
# Human-labelled tweets whose id is not yet among the confident examples.
already_confident = labelled["tweet_id"].isin(confident_examples["tweet_id"])
remaining_examples = labelled.loc[~already_confident].copy()
len(remaining_examples)

In [None]:
# Store the remaining examples (label column renamed to "label") in dst so
# that labels can be inferred for them.
fname = "remaining_examples_strategy{}_1.csv".format(
    condensation_dataset_names[label_condensation]
)
(
    remaining_examples
    .rename(columns={"label_1": "label"})
    .to_csv(Path(dst) / fname, sep=";", index=False)
)

In [None]:
# upload the remaining examples to the GPU cluster (rsync over ssh; copies
# every local file matching remaining_examples_strategy*)
! rsync -avze ssh ../../../data/traindata/remaining_examples_strategy* jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/traindata/

## Load inferred labels

In [None]:
# python3 infer_strategy.py ../best_models/model-twitter-xlm-roberta-base_germanhass_epochs-100_batchsize-64_data-confident_examples_strategy_aug-trans_halfcondensed_split-3 ../data/traindata/remaining_examples_strategy_condensed_1.csv 5

In [None]:
# download inferred data
# (rsync over ssh: every file matching inferred_strategy* in the cluster's
# data/inference/ directory is copied to ../../../data/inference/)
! rsync -avze ssh jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/inference/inferred_strategy* ../../../data/inference/

In [None]:
# Read the round-1 model predictions; the "strategy" column becomes "label_2".
fname = "inferred_strategy{}_remaining_examples{}_1.csv".format(
    condensation_dataset_names[label_condensation],
    condensation_dataset_names[label_condensation],
)
inferred_labels = (
    pd.read_csv(
        Path("../../../data/inference") / fname,
        sep=";",
        usecols=["tweet_id", "strategy"],
        dtype={"tweet_id": str, "strategy": int},
    )
    .rename(columns={"strategy": "label_2"})
)

## Determine label agreement

In [None]:
# Attach the inferred label (label_2) to every remaining example; examples
# without a prediction keep a missing label_2 (left join on tweet_id).
remaining_examples = remaining_examples.merge(
    inferred_labels, on="tweet_id", how="left"
)

In [None]:
# Keep only examples that received an inferred label and store it as integer.
remaining_examples = (
    remaining_examples
    .dropna(subset=["label_2"])
    .astype({"label_2": int})
)

In [None]:
# Examples on which the human label (label_1) and the inferred label (label_2)
# agree become new confident examples.
agrees = remaining_examples["label_1"] == remaining_examples["label_2"]
new_confident_examples = (
    remaining_examples
    .loc[agrees, ["tweet_id", "text", "label_1"]]
    .rename(columns={"label_1": "label"})
)
new_confident_examples["label"].value_counts()

## Add augmented minority class examples

In [None]:
# class distribution of the existing confident examples, for comparison
confident_examples["label"].value_counts()

In [None]:
# Combine the existing confident examples, the newly agreed examples and
# translations of the new examples into the round-1 "inferred" dataset.
cols = ["tweet_id", "text", "label"]

if label_condensation == "medium":
    translations = dpr.select_translations(new_confident_examples, [0, 1, 2, 3, 4, 6])
    translations_info = translations[translations["label"] == 0]
    # only class 1 is subsampled to a fixed number of translations
    translations_opin = translations[translations["label"] == 1].sample(n=413, random_state=42)
    translations_construct = translations[translations["label"] == 2]
    translations_inconsist = translations[translations["label"] == 3]
    translations_sarc = translations[translations["label"] == 4]
    translations_other_new = translations[translations["label"] == 6]

    new_confident_examples = pd.concat([
        confident_examples,
        new_confident_examples,
        translations_info[cols],
        translations_opin[cols],
        translations_construct[cols],
        translations_inconsist[cols],
        translations_sarc[cols],
        translations_other_new[cols],
    ])
else:
    # subsample the opin examples to have less class imbalance
    new_confident_examples_construct = new_confident_examples[new_confident_examples["label"] == 0]
    new_confident_examples_opin = new_confident_examples[new_confident_examples["label"] == 1].sample(n=1000, random_state=42)
    new_confident_examples_sarc = new_confident_examples[new_confident_examples["label"] == 2]
    new_confident_examples_leave_fact = new_confident_examples[new_confident_examples["label"] == 3]
    new_confident_examples_other_new = new_confident_examples[new_confident_examples["label"] == 4]

    # translations are added for every class except opin (1)
    translations = dpr.select_translations(new_confident_examples, [0, 2, 3, 4])
    translations_construct = translations[translations["label"] == 0]
    translations_sarc = translations[translations["label"] == 2]
    translations_leave_fact = translations[translations["label"] == 3]
    translations_other_new = translations[translations["label"] == 4]

    new_confident_examples = pd.concat([
        confident_examples,
        new_confident_examples_construct,
        new_confident_examples_opin,
        new_confident_examples_sarc,
        new_confident_examples_leave_fact,
        new_confident_examples_other_new,
        translations_construct[cols],
        translations_sarc[cols],
        translations_leave_fact[cols],
        translations_other_new[cols],
    ])

new_confident_examples = new_confident_examples.reset_index(drop=True)
# Seeded shuffle: the stratified splits created later index rows by position,
# so an unseeded shuffle would make the saved file and every split differ
# from run to run.
new_confident_examples = new_confident_examples.sample(frac=1, replace=False, random_state=42)
new_confident_examples["label"].value_counts()

In [None]:
# Persist the dataset extended with inferred labels as a semicolon-separated
# CSV in dst.
fname = "confident_examples_strategy_aug-trans-inferred{}.csv".format(
    condensation_dataset_names[label_condensation]
)
new_confident_examples.to_csv(Path(dst) / fname, sep=";", index=False)

## Create training splits

In [None]:
# Five stratified train / test / eval splits of the dataset extended with
# inferred labels, written to dst together with a copy of the full dataset.
fname = "confident_examples_strategy_aug-trans-inferred{}.csv".format(
    condensation_dataset_names[label_condensation]
)
data = pd.read_csv(
    Path(dst) / fname,
    sep=";",
    dtype={"tweet_id": str, "label": int, "text": str},
)
test_frac = 0.15
eval_frac = 0.15
tmp_fname = "confident_examples_strategy_aug-trans-inferred{}".format(
    condensation_dataset_names[label_condensation]
)

for s, seed in enumerate([42, 43, 44, 45, 46]):
    # hold out the eval share of the full dataset
    eval_splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=eval_frac, random_state=seed
    )
    rest_idx, eval_idx = next(eval_splitter.split(data["text"], data["label"]))
    tmp = data.loc[rest_idx, ["text", "label"]].reset_index(drop=True)
    evaldata = data.loc[eval_idx, ["text", "label"]]

    # take the test share out of the remainder (fraction rescaled accordingly)
    test_splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=test_frac / (1 - eval_frac), random_state=s + 10
    )
    train_idx, test_idx = next(test_splitter.split(tmp["text"], tmp["label"]))
    traindata = tmp.loc[train_idx, ["text", "label"]]
    testdata = tmp.loc[test_idx, ["text", "label"]]

    # one file per split, numbered from 1
    for part, frame in (("train", traindata), ("test", testdata), ("eval", evaldata)):
        frame.to_csv(Path(dst, tmp_fname + f"_{part}_{s+1}.csv"), index=False, sep=";")
data.to_csv(Path(dst, tmp_fname + "_full.csv"), index=False, sep=";")

In [None]:
# upload the datasets extended with inferred labels to the GPU cluster
# (rsync over ssh)
! rsync -avze ssh ../../../data/traindata/confident_examples_strategy_aug-trans-inferred* jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/traindata/

# V4: human + inferred labels (round 2)

## Load existing confident examples

In [None]:
# Reload the round-1 "inferred" dataset from dst as the current set of
# confident examples.
fname = "confident_examples_strategy_aug-trans-inferred{}.csv".format(
    condensation_dataset_names[label_condensation]
)
confident_examples = pd.read_csv(
    Path(dst) / fname,
    sep=";",
    dtype={"tweet_id": str, "label": int, "text": str},
)

## Load remaining human annotated examples

In [None]:
# Load the strategy labels of the first annotator of every pair and map them
# to condensed ids.
# NOTE(review): this path climbs four directory levels, whereas the same
# folder is read from "../../../data/labelled_samples_with_ids" when the
# confident human labels are loaded - confirm which location is correct.
src = "../../../../data/labelled_samples_with_ids"
dimension = "[STRATEGY]"
cols = ["tweet_id", "text", dimension]

labelled_frames = []
for pair in dpr.label_pairs:
    df = pd.read_csv(
        Path(src, pair[0] + ".csv"),
        dtype={"tweet_id":str},
        delimiter=";",
        usecols=cols
    ).rename(columns={dimension:"label_1"})

    # .copy() so that the column assignments below act on an independent frame
    df = df.dropna(subset=["label_1"]).copy()
    # strip blanks first so that a padded "foreign" label is filtered as well
    df["label_1"] = df["label_1"].str.strip(" ")
    df = df[df["label_1"] != "foreign"].copy()
    df["label_1"] = df["label_1"].replace(label_to_condensed_id)
    labelled_frames.append(df)

# concatenate once instead of growing the frame inside the loop
labelled = pd.concat(labelled_frames).reset_index(drop=True)
labelled["label_1"] = labelled["label_1"].astype(int)
# Apply the same text cleaning as the round-1 loader of the human labels, so
# that both rounds send identically pre-processed text to inference.
labelled = dpr.clean_text(labelled)

In [None]:
# class distribution (condensed ids) of the loaded human labels
labelled["label_1"].value_counts()

In [None]:
# Human-labelled tweets whose id is not yet among the confident examples.
already_confident = labelled["tweet_id"].isin(confident_examples["tweet_id"])
remaining_examples = labelled.loc[~already_confident].copy()
len(remaining_examples)

In [None]:
# Store the remaining examples (label column renamed to "label") in dst so
# that labels can be inferred for them.
fname = "remaining_examples_strategy{}_2.csv"\
    .format(condensation_dataset_names[label_condensation])
# label_1 already holds condensed ids (mapped with label_to_condensed_id when
# the annotations were loaded). It must not be passed through
# id_to_condensed_id again: that table expects original label ids and would
# e.g. collapse the condensed ids 2, 3 and 4 of the "full" scheme into 0.
remaining_examples.rename(columns={"label_1":"label"}).to_csv(
    Path(dst, fname),
    index=False,
    sep=";"
)

In [None]:
# upload the remaining examples to the GPU cluster (rsync over ssh; copies
# every local file matching remaining_examples_strategy*)
! rsync -avze ssh ../../../data/traindata/remaining_examples_strategy* jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/traindata/

## Load inferred labels

In [None]:
# python3 infer_strategy.py ../best_models/model-twitter-xlm-roberta-base_germanhass_epochs-100_batchsize-64_data-confident_examples_strategy_aug-trans-inferred_halfcondensed_split-1 ../data/traindata/remaining_examples_strategy_halfcondensed_2.csv 7

In [None]:
# download inferred data
# (rsync over ssh: every file matching inferred_strategy* in the cluster's
# data/inference/ directory is copied to ../../../data/inference/)
! rsync -avze ssh jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/inference/inferred_strategy* ../../../data/inference/

In [None]:
# Read the round-2 model predictions; the "strategy" column becomes "label_2".
fname = "inferred_strategy{}_remaining_examples{}_2.csv".format(
    condensation_dataset_names[label_condensation],
    condensation_dataset_names[label_condensation],
)
inferred_labels = (
    pd.read_csv(
        Path("../../../data/inference") / fname,
        sep=";",
        usecols=["tweet_id", "strategy"],
        dtype={"tweet_id": str, "strategy": int},
    )
    .rename(columns={"strategy": "label_2"})
)

## Determine label agreement

In [None]:
# Attach the inferred label (label_2) to every remaining example; examples
# without a prediction keep a missing label_2 (left join on tweet_id).
remaining_examples = remaining_examples.merge(
    inferred_labels, on="tweet_id", how="left"
)

In [None]:
# Keep only examples that received an inferred label and store it as integer.
remaining_examples = (
    remaining_examples
    .dropna(subset=["label_2"])
    .astype({"label_2": int})
)

In [None]:
# Examples on which the human label (label_1) and the inferred label (label_2)
# agree become new confident examples.
agrees = remaining_examples["label_1"] == remaining_examples["label_2"]
new_confident_examples = (
    remaining_examples
    .loc[agrees, ["tweet_id", "text", "label_1"]]
    .rename(columns={"label_1": "label"})
)
new_confident_examples["label"].value_counts()

In [None]:
# class distribution of the existing confident examples, for comparison
confident_examples["label"].value_counts()

In [None]:
# Select translations of the newly agreed examples for the classes 3, 5 and 6.
# NOTE(review): condensed ids 5 and 6 exist only in the "medium" scheme; the
# "full" scheme ends at id 4 - confirm the class list for the active scheme.
translations = dpr.select_translations(new_confident_examples, [3, 5, 6])

In [None]:
# number of selected translations with label 3
len(translations[translations["label"] == 3])

In [None]:
# Split the class-3 translations of the existing confident examples into a
# sampled subset ("old") and the rest ("new").
# select_translations lives in the dpr module; calling it without the module
# prefix raises a NameError.
translations = dpr.select_translations(confident_examples, [3])
# give every translation a unique id so the sampled rows can be excluded below
translations["translation_id"] = [f"t{i}" for i in range(len(translations))]
translations_inconsist_old = translations.sample(n=2140, random_state=42)
translations_inconsist_new = translations[~translations["translation_id"].isin(translations_inconsist_old["translation_id"])]

In [None]:
# number of translations not contained in the sampled "old" subset
len(translations_inconsist_new)

In [None]:
# total number of selected translations
len(translations)

In [None]:
# number of translations with distinct text
len(translations.drop_duplicates(subset=["text"]))

In [None]:
# size of the sampled "old" subset
len(translations_inconsist_old)

## Add augmented minority class examples

In [None]:
# Combine the existing confident examples, the newly agreed examples and
# translations into the round-2 dataset.
cols = ["tweet_id", "text", "label"]

if label_condensation == "medium":
    # select_translations lives in the dpr module (a bare call is a NameError)
    translations = dpr.select_translations(new_confident_examples, [0, 2, 3, 4])
    translations_info = translations[translations["label"] == 0].sample(n=500, random_state=42)
    translations_construct = translations[translations["label"] == 2].sample(n=429, random_state=42)
    translations_inconsist = translations[translations["label"] == 3]
    translations_sarc = translations[translations["label"] == 4].sample(n=361, random_state=42)
    #translations_insult = translations[translations["label"] == 5].sample(n=289, random_state=42)
    #translations_other_new = translations[translations["label"] == 6].sample(n=1751, random_state=42)

    # since we don't generate enough new translations for classes 3, 4 and 5
    # but have some translations remaining from the initial confident samples,
    # we get the translations that were not used in the last training pass and
    # add them here as well
    translations = dpr.select_translations(confident_examples, [3, 4, 5])
    translations_inconsistent_old = translations[translations["label"] == 3].sample(n=373, random_state=42)
    # exclude the "old" subset drawn on the line above (the name used here
    # before, translations_neutral_old, is not defined anywhere)
    translations_inconsistent_new = translations[~translations["text"].isin(translations_inconsistent_old["text"])]

    # translations_insult / translations_other_new are not part of the list:
    # their definitions above are commented out, so the names would either be
    # undefined or silently refer to frames left over from an earlier section.
    # The left-over translations are added instead, as described above.
    # NOTE(review): confirm that this is the intended composition.
    new_confident_examples = pd.concat([
        confident_examples,
        new_confident_examples,
        translations_info[cols],
        translations_construct[cols],
        translations_inconsist[cols],
        translations_sarc[cols],
        translations_inconsistent_new[cols],
    ])
else:
    # NOTE(review): no augmentation is defined for this scheme, so
    # new_confident_examples holds only the newly agreed examples and is not
    # merged with confident_examples - confirm that this is intended.
    pass
new_confident_examples = new_confident_examples.reset_index(drop=True)
# Seeded shuffle: the stratified splits created later index rows by position,
# so an unseeded shuffle would make the saved file and every split differ
# from run to run.
new_confident_examples = new_confident_examples.sample(frac=1, replace=False, random_state=42)
new_confident_examples["label"].value_counts()

In [None]:
# Persist the round-2 dataset as a semicolon-separated CSV in dst.
# NOTE(review): this is the same file name the round-1 "inferred" dataset is
# saved under, so the round-1 file is overwritten - confirm this is intended.
fname = "confident_examples_strategy_aug-trans-inferred{}.csv".format(
    condensation_dataset_names[label_condensation]
)
new_confident_examples.to_csv(Path(dst) / fname, sep=";", index=False)

## Create training splits

In [None]:
# Five stratified train / test / eval splits of the round-2 dataset, written
# to dst together with a copy of the full dataset.
fname = "confident_examples_strategy_aug-trans-inferred{}.csv".format(
    condensation_dataset_names[label_condensation]
)
data = pd.read_csv(
    Path(dst) / fname,
    sep=";",
    dtype={"tweet_id": str, "label": int, "text": str},
)
test_frac = 0.15
eval_frac = 0.15
tmp_fname = "confident_examples_strategy_aug-trans-inferred{}".format(
    condensation_dataset_names[label_condensation]
)

for s, seed in enumerate([42, 43, 44, 45, 46]):
    # hold out the eval share of the full dataset
    eval_splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=eval_frac, random_state=seed
    )
    rest_idx, eval_idx = next(eval_splitter.split(data["text"], data["label"]))
    tmp = data.loc[rest_idx, ["text", "label"]].reset_index(drop=True)
    evaldata = data.loc[eval_idx, ["text", "label"]]

    # take the test share out of the remainder (fraction rescaled accordingly)
    test_splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=test_frac / (1 - eval_frac), random_state=s + 10
    )
    train_idx, test_idx = next(test_splitter.split(tmp["text"], tmp["label"]))
    traindata = tmp.loc[train_idx, ["text", "label"]]
    testdata = tmp.loc[test_idx, ["text", "label"]]

    # one file per split, numbered from 1
    for part, frame in (("train", traindata), ("test", testdata), ("eval", evaldata)):
        frame.to_csv(Path(dst, tmp_fname + f"_{part}_{s+1}.csv"), index=False, sep=";")
data.to_csv(Path(dst, tmp_fname + "_full.csv"), index=False, sep=";")

In [None]:
# upload the round-2 datasets to the GPU cluster (rsync over ssh)
! rsync -avze ssh ../../../data/traindata/confident_examples_strategy_aug-trans-inferred* jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/traindata/