In [10]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import StratifiedShuffleSplit

import data_preparation_resources as dpr

# Label condensation stages

In [None]:
# Output directory: every dataset produced in this notebook is written to
# Path(dst, <file name>).
dst = "../../../data/traindata"

In [3]:
# Condensation level of the label set: one of "none", "medium" or "full".
label_condensation = "full"

# File-name suffix that identifies the datasets of each condensation level.
condensation_dataset_names = dict(
    none="",
    medium="_halfcondensed",
    full="_condensed",
)

In [4]:
# Label dictionary after removing "foreign".
label_to_id = {
    "in": 0,
    "out": 1,
    "both": 2,
    "neutral": 3,
    "unint": 4,
}

# Condensed label dictionaries:
#   label_to_condensed_id maps every raw label to the class id used at the
#   selected condensation level (several raw labels may share one id);
#   condensed_id_to_label maps each class id back to a readable class name.
if label_condensation == "none":
    label_to_condensed_id = label_to_id
    condensed_id_to_label = {val: key for key, val in label_to_id.items()}
elif label_condensation == "medium":
    label_to_condensed_id = {
        "in": 0,
        "out": 1,
        "both": 0,
        "neutral": 2,
        "unint": 2,
    }
    condensed_id_to_label = {
        0: "in_both",
        1: "out",
        2: "neutral_unint",
    }
elif label_condensation == "full":
    label_to_condensed_id = {
        "in": 0,
        "out": 1,
        "both": 0,
        "neutral": 0,
        "unint": 0,
    }
    condensed_id_to_label = {
        0: "not_out",
        1: "out",
    }
else:
    # Fail fast: merely printing would leave both dictionaries undefined and
    # surface as a confusing NameError in a later cell.
    raise ValueError(
        f"unknown condensation level: {label_condensation!r} "
        "(expected 'none', 'medium' or 'full')"
    )

# V1: dataset with confident labels

## Load confident human labels

In [11]:
# Build the "confident" dataset: tweets on which both annotators of a pair
# assigned the same (condensed) label for the [GROUP] dimension.
src = "../../../../data/labelled_samples_with_ids"
dimension = "[GROUP]"
fname = "confident_examples_group{}"\
    .format(condensation_dataset_names[label_condensation])
cols = ["tweet_id", "text", dimension]


def _read_annotator_labels(annotator, label_col):
    """Read one annotator's CSV and return its condensed labels.

    Rows labelled "foreign" are removed and the remaining raw labels are
    replaced by their condensed class ids. The label column is renamed to
    `label_col`.
    """
    df = pd.read_csv(
        Path(src, annotator + ".csv"),
        dtype={"tweet_id": str},
        delimiter=";",
        usecols=cols
    ).rename(columns={dimension: label_col})
    # .copy() so the column assignment below does not write into a slice
    df = df[df[label_col] != "foreign"].copy()
    df[label_col] = df[label_col].replace(label_to_condensed_id)
    return df


agreed_frames = []
for pair in dpr.label_pairs:
    df1 = _read_annotator_labels(pair[0], "label_1")
    df2 = _read_annotator_labels(pair[1], "label_2")

    # Pair the two annotations by tweet_id instead of by row position: a
    # positional concat silently misaligns labels as soon as one file
    # contains a tweet_id more than once.
    df = df1.merge(df2[["tweet_id", "label_2"]], on="tweet_id", how="inner")
    df = df.sort_values(by="tweet_id").reset_index(drop=True)

    # keep only tweets on which both annotators agree (NaN never agrees)
    df = df[df["label_1"] == df["label_2"]]
    agreed_frames.append(
        df[["tweet_id", "text", "label_1"]].rename(columns={"label_1": "label"})
    )

# concatenate once instead of growing a DataFrame inside the loop
confident_examples = pd.concat(agreed_frames).reset_index(drop=True)
confident_examples["label"] = confident_examples["label"].astype(int)
confident_examples = dpr.clean_text(confident_examples)
confident_examples.to_csv(Path(dst, fname + ".csv"), index=False, sep=";")

In [12]:
# Class ids follow `condensed_id_to_label` for the selected condensation level:
#    "medium": "in_both":0, "out":1, "neutral_unint":2
#    "full":   "not_out":0, "out":1
confident_examples["label"].value_counts()

1    985
0    679
Name: label, dtype: int64

In [13]:
# Number of rows in the confident dataset.
len(confident_examples)

1664

## Create training splits

In [14]:
# Re-read the confident examples from disk and write five stratified
# train / test / eval splits (one per seed) plus the full dataset.
fname = "confident_examples_group{}.csv".format(
    condensation_dataset_names[label_condensation]
)
data = pd.read_csv(
    Path(dst, fname),
    delimiter=";",
    dtype={"tweet_id": str, "label": int, "text": str},
)
test_frac = 0.15
eval_frac = 0.15
split_cols = ["text", "label"]

for s, seed in enumerate([42, 43, 44, 45, 46]):
    # first cut: hold out the eval fraction of the whole dataset
    eval_splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=eval_frac, random_state=seed
    )
    tmp_index, eval_index = next(eval_splitter.split(data["text"], data["label"]))
    tmp = data.loc[tmp_index, split_cols].reset_index(drop=True)
    evaldata = data.loc[eval_index, split_cols]

    # second cut: carve the test set out of the remainder; the fraction is
    # rescaled so that it corresponds to test_frac of the whole dataset
    test_splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=test_frac / (1 - eval_frac), random_state=s + 10
    )
    train_index, test_index = next(test_splitter.split(tmp["text"], tmp["label"]))
    traindata = tmp.loc[train_index, split_cols]
    testdata = tmp.loc[test_index, split_cols]

    tmp_fname = "confident_examples_group{}".format(
        condensation_dataset_names[label_condensation]
    )
    traindata.to_csv(Path(dst, tmp_fname + f"_train_{s+1}.csv"), index=False, sep=";")
    testdata.to_csv(Path(dst, tmp_fname + f"_test_{s+1}.csv"), index=False, sep=";")
    evaldata.to_csv(Path(dst, tmp_fname + f"_eval_{s+1}.csv"), index=False, sep=";")
data.to_csv(Path(dst, tmp_fname + "_full.csv"), index=False, sep=";")

In [None]:
# Copy the V1 dataset files (confident_examples_group*) to the traindata
# directory on the remote host via rsync over ssh.
! rsync -avze ssh ../../../data/traindata/confident_examples_group* jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/traindata/

# V2: augmented minority class examples

## Add minority example translations

In [18]:
# Label distribution of the confident examples before augmentation.
confident_examples["label"].value_counts()

1    985
0    679
Name: label, dtype: int64

In [16]:
# Fetch translations for the confident examples through the project helper.
# NOTE(review): the second argument is presumably the list of label ids to
# fetch translations for — confirm against dpr.select_translations.
translations = dpr.select_translations(confident_examples, [0, 2])
translations["label"].value_counts()

0    645
Name: label, dtype: int64

In [20]:
# Augment the confident examples with translated minority-class examples so
# that the classes become (more) balanced, then shuffle the result.
cols = ["tweet_id", "text", "label"]
if label_condensation == "none":
    translations = dpr.select_translations(confident_examples, [0, 2])
    translations_in = translations[translations["label"] == 0].sample(n=300, random_state=42)
    translations_both = translations[translations["label"] == 2]
    # NOTE(review): translations are requested for [0, 2] only, yet label 3 is
    # sampled here. If the helper returns just the requested labels this
    # sample() raises on an empty frame — confirm and extend the list to
    # [0, 2, 3] if so.
    translations_neutral = translations[translations["label"] == 3].sample(n=300, random_state=42)
    new_confident_examples = pd.concat([
        confident_examples,
        translations_in[cols],
        translations_both[cols],
        translations_neutral[cols],
    ])
elif label_condensation == "medium":
    translations = dpr.select_translations(confident_examples, [0])
    translations_in_both = translations[translations["label"] == 0]
    # subsample confident examples to reduce class imbalance
    confident_examples_out = confident_examples[confident_examples["label"] == 1].sample(n=485, random_state=42)
    confident_examples_other = confident_examples[confident_examples["label"].isin([0, 2])]
    new_confident_examples = pd.concat([
        confident_examples_out,
        confident_examples_other,
        translations_in_both[cols],
    ])
else:
    # Number of translated class-0 examples needed to match the size of
    # class 1. Derived from the data rather than hard-coded (985 - 679 = 306
    # for the current dataset), and capped at the number of available
    # translations so sample() cannot fail.
    label_counts = confident_examples["label"].value_counts()
    n_missing = max(int(label_counts.get(1, 0)) - int(label_counts.get(0, 0)), 0)
    translation_pool = dpr.select_translations(confident_examples, [0])
    translations = translation_pool.sample(
        n=min(n_missing, len(translation_pool)), random_state=42
    )
    translations_not_out = translations[translations["label"] == 0]
    new_confident_examples = pd.concat([
        confident_examples,
        translations_not_out[cols],
    ])

new_confident_examples = new_confident_examples.reset_index(drop=True)
# Seeded shuffle: the row order is written to disk and feeds the stratified
# splits, so it has to be reproducible across kernel restarts.
new_confident_examples = new_confident_examples.sample(frac=1, replace=False, random_state=42)
new_confident_examples["label"].value_counts()

1    985
0    985
Name: label, dtype: int64

In [21]:
# Write the translation-augmented dataset to the output directory as a
# semicolon-separated CSV without the index column.
fname = "confident_examples_group_aug-trans{}.csv".format(
    condensation_dataset_names[label_condensation]
)
new_confident_examples.to_csv(Path(dst, fname), sep=";", index=False)

## Create training splits

In [22]:
# Re-read the translation-augmented dataset and write five stratified
# train / test / eval splits (one per seed) plus the full dataset.
# The file name must match the one written by the preceding cell
# ("confident_examples_group_aug-trans..."); the former "groupc" spelling
# pointed at a file this notebook never creates.
fname = "confident_examples_group_aug-trans{}.csv"\
    .format(condensation_dataset_names[label_condensation])
data = pd.read_csv(
    Path(dst, fname),
    dtype={"tweet_id":str, "label":int, "text":str},
    delimiter=";"
)
test_frac = 0.15
eval_frac = 0.15

for s, seed in enumerate([42, 43, 44, 45, 46]):
    # get the eval data
    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=eval_frac,
        random_state=seed
    )
    # n_splits=1, so the generator yields exactly one (rest, eval) index pair
    tmp_index, eval_index = next(sss.split(data["text"], data["label"]))
    X_tmp, X_eval = data["text"].loc[tmp_index], data["text"].loc[eval_index]
    y_tmp, y_eval = data["label"].loc[tmp_index], data["label"].loc[eval_index]

    tmp = pd.concat([X_tmp, y_tmp], axis=1).reset_index(drop=True)
    evaldata = pd.concat([X_eval, y_eval], axis=1)

    # get the test data; the fraction is rescaled so that the test set is
    # test_frac of the whole dataset rather than of the remainder
    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_frac / (1 - eval_frac),
        random_state=s + 10
    )
    train_index, test_index = next(sss.split(tmp["text"], tmp["label"]))
    X_train, X_test = tmp["text"].loc[train_index], tmp["text"].loc[test_index]
    y_train, y_test = tmp["label"].loc[train_index], tmp["label"].loc[test_index]

    traindata = pd.concat([X_train, y_train], axis=1)
    testdata = pd.concat([X_test, y_test], axis=1)

    tmp_fname = "confident_examples_group_aug-trans{}"\
        .format(condensation_dataset_names[label_condensation])
    traindata.to_csv(Path(dst, tmp_fname + f"_train_{s+1}.csv"), index=False, sep=";")
    testdata.to_csv(Path(dst, tmp_fname + f"_test_{s+1}.csv"), index=False, sep=";")
    evaldata.to_csv(Path(dst, tmp_fname + f"_eval_{s+1}.csv"), index=False, sep=";")
data.to_csv(Path(dst, tmp_fname + "_full.csv"), index=False, sep=";")

In [None]:
# Copy the V2 dataset files (confident_examples_group_aug-trans*) to the
# traindata directory on the remote host via rsync over ssh.
! rsync -avze ssh ../../../data/traindata/confident_examples_group_aug-trans* jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/traindata/

In [24]:
# Label distribution of the training split produced by the last loop iteration.
traindata["label"].value_counts()

1    689
0    689
Name: label, dtype: int64

# V3: human + inferred labels (round 1)

## Load existing confident examples

In [25]:
# Load the complete translation-augmented dataset written by the V2 split cell.
fname = "confident_examples_group_aug-trans{}_full.csv".format(
    condensation_dataset_names[label_condensation]
)
confident_examples = pd.read_csv(
    Path(dst, fname),
    delimiter=";",
    dtype={"tweet_id": str, "label": int, "text": str},
)

## Load remaining human annotated examples

In [26]:
# Load the labels of the first annotator of every pair for the [GROUP]
# dimension, drop unusable rows and map labels to condensed class ids.
src = "../../../../data/labelled_samples_with_ids"
dimension = "[GROUP]"
cols = ["tweet_id", "text", dimension]
labelled_frames = []
for pair in dpr.label_pairs:
    df = pd.read_csv(
        Path(src, pair[0] + ".csv"),
        dtype={"tweet_id":str},
        delimiter=";",
        usecols=cols
    ).rename(columns={dimension:"label_1"})

    df = df.dropna(subset=["label_1"])
    # Strip padding BEFORE filtering so that labels such as "foreign " are
    # removed as well instead of surviving and breaking the int cast below.
    df = df.assign(label_1=df["label_1"].str.strip(" "))
    df = df[~df["label_1"].isin(["foreign", "unint"])]
    df = df.assign(label_1=df["label_1"].replace(label_to_condensed_id))
    labelled_frames.append(df)

# concatenate once instead of growing a DataFrame inside the loop
labelled = pd.concat(labelled_frames).reset_index(drop=True)
labelled["label_1"] = labelled["label_1"].astype(int)
labelled = dpr.clean_text(labelled)

In [27]:
# Distribution of the condensed human labels.
labelled["label_1"].value_counts()

1    8535
0    3368
Name: label_1, dtype: int64

In [28]:
# Keep only human-labelled rows whose tweet_id is not already part of the
# confident examples; .copy() detaches the result from `labelled`.
remaining_examples = labelled[~labelled["tweet_id"].isin(confident_examples["tweet_id"])].copy()

In [29]:
# Number of remaining (not yet confident) examples.
len(remaining_examples)

10464

In [30]:
# Export the remaining examples for inference; the human label column is
# renamed from "label_1" to "label" in the exported file only.
fname = "remaining_examples_group{}_1.csv".format(
    condensation_dataset_names[label_condensation]
)
export_path = Path(dst, fname)
remaining_examples.rename(columns={"label_1": "label"}).to_csv(
    export_path, sep=";", index=False
)

In [None]:
! rsync -avze ssh remaining_examples_group* jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/traindata/

## Load inferred labels

In [197]:
# python3 infer_group.py ../best_models/model-twitter-xlm-roberta-base_germanhass_epochs-100_batchsize-64_data-confident_examples_group_aug-trans_condensed_split-3 ../data/inference/remaining_examples_group_condensed_1.csv 2

In [None]:
# Download everything in the remote inference directory into the local
# ../../../data/inference directory.
! rsync -avze ssh jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/inference/* ../../../data/inference/

In [37]:
# Load the model-inferred labels of round 1; the "group" column is exposed
# as "label_2" so it can be compared with the human "label_1".
suffix = condensation_dataset_names[label_condensation]
fname = "inferred_group{}_remaining_examples{}_1.csv".format(suffix, suffix)
inferred_labels = pd.read_csv(
    Path("../../../data/inference", fname),
    usecols=["tweet_id", "group"],
    dtype={"tweet_id": str, "group": int},
    delimiter=";",
)
inferred_labels = inferred_labels.rename(columns={"group": "label_2"})

## Determine label agreement

In [38]:
# Attach the inferred label (label_2) to every remaining example; rows
# without an inferred label get NaN (left join).
# A label_2 column left over from an earlier run of this cell is dropped
# first, otherwise re-running would create label_2_x / label_2_y and break
# the following cells.
remaining_examples = pd.merge(
    remaining_examples.drop(columns=["label_2"], errors="ignore"),
    inferred_labels,
    how="left",
    on="tweet_id"
)

In [39]:
# Discard rows without an inferred label and store label_2 as integer.
remaining_examples = (
    remaining_examples
    .dropna(subset=["label_2"])
    .astype({"label_2": int})
)

In [40]:
# New confident examples: rows where the human and the inferred label agree.
labels_agree = remaining_examples["label_1"] == remaining_examples["label_2"]
new_confident_examples = (
    remaining_examples.loc[labels_agree, ["tweet_id", "text", "label_1"]]
    .rename(columns={"label_1": "label"})
)
new_confident_examples["label"].value_counts()

1    5977
0    1535
Name: label, dtype: int64

## Add augmented minority class examples

In [41]:
# Number of translations the project helper returns for the new confident
# examples (second argument presumably the label ids to cover — TODO confirm).
translations = dpr.select_translations(new_confident_examples, [0])
len(translations)

1422

In [42]:
# Label distribution of the existing (V2) confident examples.
confident_examples["label"].value_counts()

1    985
0    985
Name: label, dtype: int64

In [43]:
# Combine the existing confident examples with the newly agreed examples and
# translated minority-class examples, then shuffle the result.
cols = ["tweet_id", "text", "label"]
if label_condensation == "none":
    translations = dpr.select_translations(new_confident_examples, [0, 2])
    translations_in = translations[translations["label"] == 0].sample(n=300, random_state=42)
    translations_both = translations[translations["label"] == 2]
    new_confident_examples = pd.concat([
        confident_examples,
        new_confident_examples,
        translations_in[cols],
        translations_both[cols],
    ])
elif label_condensation == "medium":
    # use only the translated in_both examples to reduce class imbalance
    translations_in_both = dpr.select_translations(new_confident_examples, [0])
    new_confident_examples_in_both = new_confident_examples[new_confident_examples["label"] == 0]

    new_confident_examples = pd.concat([
        confident_examples,
        translations_in_both[cols],
        new_confident_examples_in_both
    ])
elif label_condensation == "full":
    translations = dpr.select_translations(new_confident_examples, [0])
    new_confident_examples_not_out = new_confident_examples[new_confident_examples["label"] == 0]

    # Subsample the outgroup examples to have less class imbalance: take as
    # many as there are new class-0 rows plus their translations. The count
    # is derived from the data (1535 + 1422 for the current dataset) and
    # capped at the number of available rows so sample() cannot fail.
    out_pool = new_confident_examples[new_confident_examples["label"] == 1]
    n_out = min(len(new_confident_examples_not_out) + len(translations), len(out_pool))
    new_confident_examples_out = out_pool.sample(n=n_out, random_state=42)

    new_confident_examples = pd.concat([
        confident_examples,
        new_confident_examples_not_out,
        new_confident_examples_out,
        translations[cols],
    ])
else:
    # Fail fast instead of silently continuing with the un-augmented frame.
    raise ValueError(f"unknown label condensation: {label_condensation!r}")
new_confident_examples = new_confident_examples.reset_index(drop=True)
# Seeded shuffle: the row order is written to disk and feeds the stratified
# splits, so it has to be reproducible across kernel restarts.
new_confident_examples = new_confident_examples.sample(frac=1, replace=False, random_state=42)
new_confident_examples["label"].value_counts()

0    3942
1    3942
Name: label, dtype: int64

In [44]:
# Write the round-1 dataset (human + inferred + translations) to the output
# directory as a semicolon-separated CSV without the index column.
fname = "confident_examples_group_aug-trans-inferred{}.csv".format(
    condensation_dataset_names[label_condensation]
)
new_confident_examples.to_csv(Path(dst, fname), sep=";", index=False)

## Create training splits

In [45]:
# Re-read the round-1 dataset and write five stratified train / test / eval
# splits (one per seed) plus the full dataset.
fname = "confident_examples_group_aug-trans-inferred{}.csv".format(
    condensation_dataset_names[label_condensation]
)
data = pd.read_csv(
    Path(dst, fname),
    delimiter=";",
    dtype={"tweet_id": str, "label": int, "text": str},
)
test_frac = 0.15
eval_frac = 0.15
split_cols = ["text", "label"]

for s, seed in enumerate([42, 43, 44, 45, 46]):
    # first cut: hold out the eval fraction of the whole dataset
    eval_splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=eval_frac, random_state=seed
    )
    tmp_index, eval_index = next(eval_splitter.split(data["text"], data["label"]))
    tmp = data.loc[tmp_index, split_cols].reset_index(drop=True)
    evaldata = data.loc[eval_index, split_cols]

    # second cut: carve the test set out of the remainder; the fraction is
    # rescaled so that it corresponds to test_frac of the whole dataset
    test_splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=test_frac / (1 - eval_frac), random_state=s + 10
    )
    train_index, test_index = next(test_splitter.split(tmp["text"], tmp["label"]))
    traindata = tmp.loc[train_index, split_cols]
    testdata = tmp.loc[test_index, split_cols]

    tmp_fname = "confident_examples_group_aug-trans-inferred{}".format(
        condensation_dataset_names[label_condensation]
    )
    traindata.to_csv(Path(dst, tmp_fname + f"_train_{s+1}.csv"), index=False, sep=";")
    testdata.to_csv(Path(dst, tmp_fname + f"_test_{s+1}.csv"), index=False, sep=";")
    evaldata.to_csv(Path(dst, tmp_fname + f"_eval_{s+1}.csv"), index=False, sep=";")
data.to_csv(Path(dst, tmp_fname + "_full.csv"), index=False, sep=";")

In [None]:
# Copy the V3 dataset files (confident_examples_group_aug-trans-inferred*) to
# the traindata directory on the remote host via rsync over ssh.
! rsync -avze ssh ../../../data/traindata/confident_examples_group_aug-trans-inferred* jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/traindata/

# V4: human + inferred labels (round 2)

## Load existing confident examples

In [47]:
# Load the round-1 dataset (human + inferred + translations) as the existing
# confident examples for round 2.
fname = "confident_examples_group_aug-trans-inferred{}.csv".format(
    condensation_dataset_names[label_condensation]
)
confident_examples = pd.read_csv(
    Path(dst, fname),
    delimiter=";",
    dtype={"tweet_id": str, "label": int, "text": str},
)

## Load remaining human annotated examples

In [48]:
# Load the labels of the first annotator of every pair for the [GROUP]
# dimension, drop unusable rows and map labels to condensed class ids.
src = "../../../../data/labelled_samples_with_ids"
dimension = "[GROUP]"
cols = ["tweet_id", "text", dimension]
labelled_frames = []
for pair in dpr.label_pairs:
    df = pd.read_csv(
        Path(src, pair[0] + ".csv"),
        dtype={"tweet_id":str},
        delimiter=";",
        usecols=cols
    ).rename(columns={dimension:"label_1"})

    df = df.dropna(subset=["label_1"])
    # Strip padding BEFORE filtering so that labels such as "foreign " are
    # removed as well instead of surviving and breaking the int cast below.
    df = df.assign(label_1=df["label_1"].str.strip(" "))
    df = df[~df["label_1"].isin(["foreign", "unint"])]
    df = df.assign(label_1=df["label_1"].replace(label_to_condensed_id))
    labelled_frames.append(df)

# concatenate once instead of growing a DataFrame inside the loop
labelled = pd.concat(labelled_frames).reset_index(drop=True)
labelled["label_1"] = labelled["label_1"].astype(int)
labelled = dpr.clean_text(labelled)

In [49]:
# Distribution of the condensed human labels.
labelled["label_1"].value_counts()

1    8535
0    3368
Name: label_1, dtype: int64

In [50]:
# Keep only human-labelled rows whose tweet_id is not already part of the
# confident examples; .copy() detaches the result from `labelled`.
remaining_examples = labelled[~labelled["tweet_id"].isin(confident_examples["tweet_id"])].copy()

In [51]:
# Number of remaining (not yet confident) examples.
len(remaining_examples)

5972

In [52]:
# Export the remaining examples for inference; the human label column is
# renamed from "label_1" to "label" in the exported file only.
fname = "remaining_examples_group{}_2.csv".format(
    condensation_dataset_names[label_condensation]
)
export_path = Path(dst, fname)
remaining_examples.rename(columns={"label_1": "label"}).to_csv(
    export_path, sep=";", index=False
)

In [None]:
# Copy the remaining-example files from the local traindata directory to the
# traindata directory on the remote host via rsync over ssh.
! rsync -avze ssh ../../../data/traindata/remaining_examples_group* jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/traindata/

## Load inferred labels

In [216]:
# python3 infer_group.py ../best_models/model-twitter-xlm-roberta-base_germanhass_epochs-100_batchsize-64_data-confident_examples_group_aug-trans-inferred_condensed_split-4 ../data/traindata/remaining_examples_condensed_2.csv 2

In [None]:
# Download the inferred_group* files from the remote inference directory
# into the local ../../../data/inference directory.
! rsync -avze ssh jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/inference/inferred_group* ../../../data/inference/

In [55]:
# Load the model-inferred labels of round 2; the "group" column is exposed
# as "label_2" so it can be compared with the human "label_1".
suffix = condensation_dataset_names[label_condensation]
fname = "inferred_group{}_remaining_examples{}_2.csv".format(suffix, suffix)
inferred_labels = pd.read_csv(
    Path("../../../data/inference", fname),
    usecols=["tweet_id", "group"],
    dtype={"tweet_id": str, "group": int},
    delimiter=";",
)
inferred_labels = inferred_labels.rename(columns={"group": "label_2"})

## Determine label agreement

In [56]:
# Attach the inferred label (label_2) to every remaining example; rows
# without an inferred label get NaN (left join).
# A label_2 column left over from an earlier run of this cell is dropped
# first, otherwise re-running would create label_2_x / label_2_y and break
# the following cells.
remaining_examples = pd.merge(
    remaining_examples.drop(columns=["label_2"], errors="ignore"),
    inferred_labels,
    how="left",
    on="tweet_id"
)

In [57]:
# Discard rows without an inferred label and store label_2 as integer.
remaining_examples = (
    remaining_examples
    .dropna(subset=["label_2"])
    .astype({"label_2": int})
)

In [63]:
# New confident examples: rows where the human and the inferred label agree.
labels_agree = remaining_examples["label_1"] == remaining_examples["label_2"]
new_confident_examples = (
    remaining_examples.loc[labels_agree, ["tweet_id", "text", "label_1"]]
    .rename(columns={"label_1": "label"})
)
new_confident_examples["label"].value_counts()

1    3369
0     123
Name: label, dtype: int64

## Add augmented minority class examples

In [59]:
# Label distribution of the existing (round 1) confident examples.
confident_examples["label"].value_counts()

0    3942
1    3942
Name: label, dtype: int64

In [60]:
# Number of translations the project helper returns for the new confident
# examples (second argument presumably the label ids to cover — TODO confirm).
translations = dpr.select_translations(new_confident_examples, [0])
len(translations)

113

In [64]:
# Combine the existing confident examples with the newly agreed examples and
# translated minority-class examples, then shuffle the result.
cols = ["tweet_id", "text", "label"]
if label_condensation == "none":
    translations = dpr.select_translations(new_confident_examples, [0, 2])
    translations_in = translations[translations["label"] == 0].sample(n=300, random_state=42)
    translations_both = translations[translations["label"] == 2]
    new_confident_examples = pd.concat([
        confident_examples,
        new_confident_examples,
        translations_in[cols],
        translations_both[cols],
    ])
elif label_condensation == "medium":
    # use only the translated in_both examples to reduce class imbalance
    translations_in_both = dpr.select_translations(new_confident_examples, [0])
    new_confident_examples_in_both = new_confident_examples[new_confident_examples["label"] == 0]
    # NOTE(review): 712 is a hard-coded subsample size whose derivation is
    # not visible in this notebook; sample() raises if fewer rows exist.
    new_confident_examples_out = new_confident_examples[
        new_confident_examples["label"] == 1
    ].sample(n=712, random_state=42)
    new_confident_examples_neutral_unint = new_confident_examples[
        new_confident_examples["label"] == 2
    ].sample(n=712, random_state=42)

    new_confident_examples = pd.concat([
        confident_examples,
        translations_in_both[cols],
        new_confident_examples_in_both,
        new_confident_examples_out,
        new_confident_examples_neutral_unint
    ])

elif label_condensation == "full":
    translations_not_out = dpr.select_translations(new_confident_examples, [0])
    new_confident_examples_not_out = new_confident_examples[new_confident_examples["label"] == 0]

    # Subsample the outgroup examples to have less class imbalance: take as
    # many as there are new class-0 rows plus their translations. The count
    # is derived from the data (123 + 113 = 236 for the current dataset) and
    # capped at the number of available rows so sample() cannot fail.
    out_pool = new_confident_examples[new_confident_examples["label"] == 1]
    n_out = min(
        len(new_confident_examples_not_out) + len(translations_not_out),
        len(out_pool),
    )
    new_confident_examples_out = out_pool.sample(n=n_out, random_state=42)
    new_confident_examples = pd.concat([
        confident_examples,
        new_confident_examples_not_out,
        new_confident_examples_out,
        translations_not_out[cols]
    ])
else:
    # Fail fast instead of silently continuing with the un-augmented frame.
    raise ValueError(f"unknown label condensation: {label_condensation!r}")

new_confident_examples = new_confident_examples.reset_index(drop=True)
# Seeded shuffle: the row order is written to disk and feeds the stratified
# splits, so it has to be reproducible across kernel restarts.
new_confident_examples = new_confident_examples.sample(frac=1, replace=False, random_state=42)
new_confident_examples["label"].value_counts()

1    4178
0    4178
Name: label, dtype: int64

In [65]:
# Write the round-2 dataset to the output directory as a semicolon-separated
# CSV without the index column.
fname = "confident_examples_group_aug-trans-inferred2{}.csv".format(
    condensation_dataset_names[label_condensation]
)
new_confident_examples.to_csv(Path(dst, fname), sep=";", index=False)

## Create training splits

In [66]:
# Re-read the round-2 dataset and write five stratified train / test / eval
# splits (one per seed) plus the full dataset.
fname = "confident_examples_group_aug-trans-inferred2{}.csv".format(
    condensation_dataset_names[label_condensation]
)
data = pd.read_csv(
    Path(dst, fname),
    delimiter=";",
    dtype={"tweet_id": str, "label": int, "text": str},
)
test_frac = 0.15
eval_frac = 0.15
split_cols = ["text", "label"]

for s, seed in enumerate([42, 43, 44, 45, 46]):
    # first cut: hold out the eval fraction of the whole dataset
    eval_splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=eval_frac, random_state=seed
    )
    tmp_index, eval_index = next(eval_splitter.split(data["text"], data["label"]))
    tmp = data.loc[tmp_index, split_cols].reset_index(drop=True)
    evaldata = data.loc[eval_index, split_cols]

    # second cut: carve the test set out of the remainder; the fraction is
    # rescaled so that it corresponds to test_frac of the whole dataset
    test_splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=test_frac / (1 - eval_frac), random_state=s + 10
    )
    train_index, test_index = next(test_splitter.split(tmp["text"], tmp["label"]))
    traindata = tmp.loc[train_index, split_cols]
    testdata = tmp.loc[test_index, split_cols]

    tmp_fname = "confident_examples_group_aug-trans-inferred2{}".format(
        condensation_dataset_names[label_condensation]
    )
    traindata.to_csv(Path(dst, tmp_fname + f"_train_{s+1}.csv"), index=False, sep=";")
    testdata.to_csv(Path(dst, tmp_fname + f"_test_{s+1}.csv"), index=False, sep=";")
    evaldata.to_csv(Path(dst, tmp_fname + f"_eval_{s+1}.csv"), index=False, sep=";")
data.to_csv(Path(dst, tmp_fname + "_full.csv"), index=False, sep=";")

In [None]:
# Copy the V4 dataset files (confident_examples_group_aug-trans-inferred2*)
# to the traindata directory on the remote host via rsync over ssh.
! rsync -avze ssh ../../../data/traindata/confident_examples_group_aug-trans-inferred2* jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/traindata/