In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from os import listdir
import re

In [None]:
src = "../../../data/"

In [2]:
fname = "flat_trees.csv.gzip"

dtypes = {
    "tweet_id":str, 
    "created_at":str,
    "root_account":str,
    "tree_id":str,
    "in_reply_to":str,
    "tree_nr":int,
    "tweet_nr":int,
    "text":str,
    "hate_score":float,
    "counter_score":float,
    "TOXICITY":float,
}
tweets = pd.read_csv(
    Path(src, fname), 
    compression="gzip", 
    dtype=dtypes,
    usecols=dtypes.keys(),
    parse_dates=["created_at"]
)

# Emotions

In [12]:
# process texts for emotion detection and upload to GPU cluster for inference
'''
tweets["text_preprocessed"] = tweets["text"]\
    .apply(lambda x: re.sub(r"https?:\/\/\S*", "", x, flags=re.MULTILINE))
texts = tweets["text_preprocessed"]
np.savetxt("texts_for_emodetection.txt", texts, fmt="%s")
'''

In [15]:
# ! rsync -avze ssh texts_for_emodetection.txt jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/inference/
# ! rm texts_for_emodetection.txt

sending incremental file list
texts_for_emodetection.txt

sent 51,747,100 bytes  received 35 bytes  20,698,854.00 bytes/sec
total size is 132,136,005  speedup is 2.55


In [None]:
# on nvcluster in /home/jasse/german_emotion_classification/
# prediction_multigpu_german.0.1.1.py ../counterspeech-strategies/data/inference/texts_for_emodetection.txt

In [21]:
# fetch the data from the GPU cluster
#! rsync -avze ssh jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/inference/texts_for_emodetection.txt_german_emodetection ../../../data/inference/
#! mv texts_for_emodetection.txt_german_emodetection emotions.tsv
#! xz emotions.tsv --verbose

receiving incremental file list
texts_for_emodetection.txt_german_emodetection

sent 43 bytes  received 51,219,904 bytes  9,312,717.64 bytes/sec
total size is 119,048,517  speedup is 2.32


In [3]:
# read the emotion labels
dtypes = {
    "anger":float,
    "fear":float,
    "disgust":float,
    "sadness":float,
    "joy":float,
    "enthusiasm":float,
    "pride":float,
    "hope":float
}

emotions = pd.read_csv(
    Path(src, "inference", "emotions.tsv.xz"),
    delimiter="\t",
    compression="xz",
    dtype=dtypes
)
emotions.head(3)

Unnamed: 0,anger,fear,disgust,sadness,joy,enthusiasm,pride,hope
0,0.525099,0.557333,0.01135,0.377578,0.012516,0.022883,0.011487,0.08653
1,0.322441,0.074408,0.003789,0.015819,0.156091,0.224147,0.119278,0.807087
2,0.110118,0.036345,0.009142,0.025803,0.928969,0.049298,0.178908,0.332939


In [4]:
# add emotion labels to tweets data frame. Note that emotion labels are in the
# same order as the tweet texts
assert len(emotions) == len(tweets)
tweets = pd.concat([tweets, emotions], axis=1)

# User IDs

In [5]:
src = "../../../data/"
fname = "flat_trees_ids_only.csv.gzip"
IDs = pd.read_csv(
    Path(src, fname), 
    dtype={"tweet_id":str, "user_twitter_id":str},
    compression="gzip"
)

In [6]:
tweets = pd.merge(
    tweets,
    IDs,
    how="left",
    left_on="tweet_id",
    right_on="tweet_id"
)
del IDs

# Language

**Note:** The foreign tweets are already removed from the inferred tweets.

In [10]:
#! rsync -avze ssh jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/inference/flat_trees_inferred_foreign.csv.gzip . --progress

In [11]:
'''
fname = "flat_trees_inferred_foreign.csv.gzip"
df = pd.read_csv(Path(src, "inference", fname), compression="gzip", dtype={"tweet_id":str})
tweets = pd.merge(tweets, df, how="left", left_on="tweet_id", right_on="tweet_id")
'''

'\nfname = "flat_trees_inferred_foreign.csv.gzip"\ndf = pd.read_csv(fname, compression="gzip", dtype={"tweet_id":str})\ntweets = pd.merge(tweets, df, how="left", left_on="tweet_id", right_on="tweet_id")\n'

# Strategy

In [12]:
#! rsync -avze ssh jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/inference/inferred_strategy_condensed_flat_trees.csv ../../../data/inference --progress
#! xz inferred_strategy_condensed_flat_trees.csv --verbose

In [13]:
fname = "inferred_strategy_condensed_flat_trees.csv.xz"
cols = ["tweet_id", "strategy", "construct", "opin", "sarc",
        "leave_fact", "other_new"]
df = pd.read_csv(
    Path(src, "inference", fname), 
    compression="xz", 
    delimiter=";",
    usecols=cols,
    dtype={
        "tweet_id":str, 
        "strategy":int, 
        "construct":float,
        "opin":float, 
        "sarc":float, 
        "leave_fact":float,
        "other_new":float
    }
)
strategy_dict = {
    0:"construct",
    1:"opin",
    2:"sarc",
    3:"leave_fact",
    4:"other_new",
}
df["strategy"] = df["strategy"].replace(strategy_dict)
df = df.rename(columns={
    "construct":"strategy_construct",
    "opin":"strategy_opin",
    "sarc":"strategy_sarc",
    "leave_fact":"strategy_leave_fact",
    "other_new":"strategy_other_new"
})

In [14]:
df["strategy"].value_counts()

opin          513078
leave_fact    334934
construct     248954
other_new     197881
sarc           17058
Name: strategy, dtype: int64

In [12]:
cols = ["tweet_id", "strategy", "strategy_construct", "strategy_opin", 
        "strategy_sarc", "strategy_leave_fact", "strategy_other_new"]
tweets = pd.merge(
    tweets, 
    df[cols], 
    how="left", 
    left_on="tweet_id", 
    right_on="tweet_id"
)

# Group

In [23]:
# python3 infer_group.py ../best_models/model-twitter-xlm-roberta-base_germanhass_epochs-100_batchsize-64_data-confident_examples_group_aug-trans-inferred3_halfcondensed_split-4 /home/jlasse/counterspeech-strategies/data/inference/flat_trees.csv.gzip 3

In [None]:
#! rsync -avze ssh jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/inference/inferred_group_halfcondensed_flat_trees.csv ../../../data/inference/ --progress
#! xz inferred_group_halfcondensed_flat_trees.csv --verbose

In [None]:
fname = "inferred_group_halfcondensed_flat_trees.csv.xz"
df = pd.read_csv(
    Path(src, "inference", fname), 
    compression="xz",
    delimiter=";",
    dtype={
        "tweet_id":str, 
        "group":int, 
        "in_both":float, 
        "out":float,
        "neutral_unint":float
    }
)
df = df.rename(columns={
    "in_both":"group_in_both",
    "out":"group_out",
    "neutral_unint":"group_neutral_unint"
})

group_dict = {
    0:"in_both",
    1:"out",
    2:"neutral_unint"
}
df["group"] = df["group"].replace(group_dict)

In [None]:
df["group"].value_counts()

In [27]:
cols = ["tweet_id", "group", 
        "group_in_both", "group_out", "group_neutral_unint"]
tweets = pd.merge(
    tweets, 
    df[cols], 
    how="left", 
    left_on="tweet_id", 
    right_on="tweet_id"
)

# Goal

In [33]:
# python3 infer_goal.py ../best_models/model-twitter-xlm-roberta-base_germanhass_epochs-100_batchsize-64_data-confident_examples_goal_aug-trans-inferred2_condensed_split-4 /home/jlasse/counterspeech-strategies/data/inference/flat_trees.csv.gzip 3

In [34]:
#! rsync -avze ssh jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/inference/inferred_goal_condensed_flat_trees.csv ../../../data/inference --progress
#! xz inferred_goal_condensed_flat_trees.csv --verbose

In [35]:
fname = "inferred_goal_condensed_flat_trees.csv.xz"
df = pd.read_csv(
    Path(src, "inference", fname), 
    compression="xz",
    delimiter=";",
    dtype={
        "tweet_id":str, 
        "goal":int, 
        "in_both_positive":float, 
        "out_negative":float,
        "neutral_unint":float
    }
)
# old inference script hat the wrong labels ...
df.columns = ["tweet_id", "text", "goal", "in_both_positive", "out_negative", "neutral_unint"]
df = df.rename(columns={
    "in_both_positive":"goal_in_both_positive",
    "out_negative":"goal_out_negative",
    "neutral_unint":"goal_neutral_unint"
})

goal_dict = {
    0:"in_both_positive",
    1:"out_negative",
    2:"neutral_unint"
}
df["goal"] = df["goal"].replace(goal_dict)

In [36]:
df["goal"].value_counts()

neutral_unint       608085
out_negative        510024
in_both_positive    193796
Name: goal2, dtype: int64

In [37]:
cols = ["tweet_id", "goal", 
        "goal_in_both_positive", "goal_out_negative", "goal_neutral_unint"]
tweets = pd.merge(
    tweets, 
    df[cols],
    how="left",
    left_on="tweet_id",
    right_on="tweet_id"
)

# Hate

In [38]:
#! rsync -avze ssh jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/inference/inferred_hate_condensed_flat_trees.csv ../../../data/inference/ --progress
#! xz inferred_hate_condensed_flat_trees.csv --verbose

In [39]:
fname = "inferred_hate_condensed_flat_trees.csv.xz"
df = pd.read_csv(
    Path(src, "inference", fname), 
    compression="xz",
    delimiter=";",
    dtype={
        "tweet_id":str,
        "hate":int,
        "yes":float,
        "no":float,
    }
)
hate_dict = {
    0:"yes",
    1:"no",
}
df["hate"] = df["hate"].replace(hate_dict)
df = df.rename(columns={
    "yes":"hate_yes",
    "no":"hate_no"
})

In [None]:
df["hate"].value_counts()

In [41]:
cols = ["tweet_id", "hate", "hate_yes", "hate_no"]
tweets = pd.merge(
    tweets, 
    df[cols],
    how="left",
    left_on="tweet_id",
    right_on="tweet_id"
)

# Target

In [42]:
#! rsync -avze ssh jlasse@nvcluster:/home/jlasse/counterspeech-strategies/data/inference/inferred_target_condensed_flat_trees.csv ../../../data/inference/ --progress
#! xz inferred_target_condensed_flat_trees.csv --verbose

In [43]:
fname = "inferred_target_condensed_flat_trees.csv.xz"
df = pd.read_csv(
    Path(src, "inference", fname), 
    compression="xz",
    delimiter=";",
    dtype={
        "tweet_id":str, 
        "target":int,
        "inst":float,
        "right-wing":float,
        "left-wing":float,
        "vulnerable":float,
        "other_new":float
    }
)
target_dict = {
    0:"inst",
    1:"right-wing",
    2:"left-wing",
    3:"vulnerable",
    4:"other_new"
}
df["target"] = df["target"].replace(target_dict)
df = df.rename(columns={
    "inst":"target_inst",
    "right-wing":"target_right-wing",
    "left-wing":"target_left-wing",
    "vulnerable":"target_vulnerable",
    "other_new":"target_other_new"
})

In [None]:
df["target"].value_counts()

In [45]:
cols = ["tweet_id", "target", "target_inst", 
        "target_right-wing", "target_left-wing",
        "target_vulnerable", "target_other_new"]
tweets = pd.merge(
    tweets, 
    df[cols], 
    how="left", 
    left_on="tweet_id",
    right_on="tweet_id"
)

In [46]:
# target is only defined for tweets where hate == "yes"
idx = tweets[tweets["hate"] == "no"].index
tweets.loc[idx, "target"] = np.nan
tweets.loc[idx, "target_inst"] = np.nan
tweets.loc[idx, "target_right-wing"] = np.nan
tweets.loc[idx, "target_left-wing"] = np.nan
tweets.loc[idx, "target_vulnerable"] = np.nan
tweets.loc[idx, "target_other_new"] = np.nan

# Add human labels

## Only confident labels

In [47]:
# data sets with manually re-assigned tweet IDs. The original labelled data sets
# are located in labelled_samples/
! ls ../../../data/labelled_samples_with_ids

batch_10_EM.csv     batch_2_AS.csv     batch_6_AH.csv
batch_10_EM_LT.csv  batch_2_EM.csv     batch_6_AH_LT.csv
batch_10_LT.csv     batch_2_LT.csv     batch_6_AS_AH.csv
batch_10_LT_EM.csv  batch_3_AH.csv     batch_6_AS.csv
batch_11_EM.csv     batch_3_AH_LT.csv  batch_6_LT_AH.csv
batch_11_EM_LT.csv  batch_3_AS_AH.csv  batch_6_LT.csv
batch_11_LT.csv     batch_3_AS.csv     batch_7a_LT_AH.csv
batch_11_LT_EM.csv  batch_3_LT_AH.csv  batch_7a_LT.csv
batch_12_EM.csv     batch_3_LT.csv     batch_7b_LT_AH.csv
batch_12_EM_LT.csv  batch_4_AH_AS.csv  batch_7b_LT.csv
batch_12_LT.csv     batch_4_AH.csv     batch_8_EM.csv
batch_12_LT_EM.csv  batch_4_AS_AH.csv  batch_8_EM_LT.csv
batch_13_EM.csv     batch_4_AS.csv     batch_8_LT.csv
batch_13_EM_LT.csv  batch_4_LT_AH.csv  batch_8_LT_EM.csv
batch_13_LT.csv     batch_4_LT.csv     batch_9_EM.csv
batch_13_LT_EM.csv  batch_5_AH.csv     batch_9_EM_LT.csv
batch_14_EM_AH.csv  batch_5_AH_LT.csv  batch_9_LT.csv
batch_14_EM.csv     batch_5_AS_AH.csv  batch_9_LT_EM.c

In [48]:
label_pairs = [
    ("batch_1_AS", "batch_1_LT"),
    ("batch_3_AH", "batch_3_AH_LT"),
    ("batch_3_AS", "batch_3_AS_AH"),
    ("batch_3_LT", "batch_3_LT_AH"),
    ("batch_4_AH", "batch_4_AH_AS"),
    ("batch_4_AS", "batch_4_AS_AH"),
    ("batch_4_LT", "batch_4_LT_AH"),
    ("batch_5_LT", "batch_5_LT_AH"),
    ("batch_5_AS", "batch_5_AS_AH"),
    ("batch_5_AH", "batch_5_AH_LT"),
    ("batch_6_AH", "batch_6_AH_LT"),
    ("batch_6_AS", "batch_6_AS_AH"),
    ("batch_6_LT", "batch_6_LT_AH"),
    ("batch_7a_LT", "batch_7a_LT_AH"),
    ("batch_7b_LT", "batch_7b_LT_AH"),
    ("batch_8_LT", "batch_8_LT_EM"),
    ("batch_8_EM", "batch_8_EM_LT"),
    ("batch_9_LT", "batch_9_LT_EM"),
    ("batch_9_EM", "batch_9_EM_LT"),
    ("batch_10_LT", "batch_10_LT_EM"),
    ("batch_10_EM", "batch_10_EM_LT"),
    ("batch_11_LT", "batch_11_LT_EM"),
    ("batch_11_EM", "batch_11_EM_LT"),
    ("batch_12_LT", "batch_12_LT_EM"),
    ("batch_12_EM", "batch_12_EM_LT"),
    ("batch_13_LT", "batch_13_LT_EM"),
    ("batch_13_EM", "batch_13_EM_LT"),
    ("batch_14_EM", "batch_14_EM_AH")
]

In [49]:
def add_group_values(df):    
    # replace missing "neutral" and "unint" entries in the [GOAL]
    # category with the corresponding entries in the [GROUP] category
    df["[GROUP]"] = df["[GROUP]"]\
        .apply(lambda x: x if x in ["neutral", "unint"] else np.nan)
    df.loc[df[df["[GOAL]"].isna()].index, "[GOAL]"] = \
        df.loc[df[df["[GOAL]"].isna()].index, "[GROUP]"].values
    df = df.dropna(subset=["[GOAL]"])
    df = df.drop(columns=["[GROUP]"])
    return df

In [50]:
def get_additional_goal_labels(confident_examples):
    # add additional minority class labels drawn from data sets with only a single
    # label to be labelled with a second label
    src = "../../../data/labelled_samples_with_ids"
    fname = "goal_minority_examples_AH.csv"
    df2 = pd.read_csv(
        join(src, fname), 
        dtype={"tweet_id":str},
        delimiter=";"
    ).dropna()
    df2 = df2.drop(columns="[GROUP]")
    df2.columns = ["tweet_id", "text", "label_2"]
    #df2["label_2"] = df2["label_2"].replace(label_to_condensed_id)

    # load all data with only a single label
    src = "../../../data/labelled_samples_with_ids"
    #dimension = "[GOAL]"
    #fname = "confident_examples_goal{}"\
    #    .format(condensation_dataset_names[label_condensation])
    cols = ["tweet_id", "text", "[GOAL]", "[GROUP]"]
    df1 = pd.DataFrame()
    for pair in label_pairs:
        tmp = pd.read_csv(
            join(src, pair[0] + ".csv"),
            dtype={"tweet_id":str},
            delimiter=";",
            usecols=cols
        )
        tmp = add_group_values(tmp).rename(columns={"[GOAL]":"label_1"})

        tmp = tmp[tmp["label_1"] != "foreign"]
        #tmp["label_1"] = tmp["label_1"].replace(label_to_condensed_id)
        df1 = pd.concat([df1, tmp])

    # create a subset of examples that now has two labels and look for confident
    # examples where both labels agree
    shared_ids = df1[df1["tweet_id"].isin(df2["tweet_id"])]["tweet_id"].values
    df1 = df1[df1["tweet_id"].isin(shared_ids)]
    df2 = df2[df2["tweet_id"].isin(shared_ids)]
    df1 = df1.sort_values(by="tweet_id").reset_index(drop=True)
    df2 = df2.sort_values(by="tweet_id").reset_index(drop=True)
    df = pd.concat([df1, df2[["label_2"]]], axis=1)[["tweet_id", "text", "label_1", "label_2"]]
    df = df[df["label_1"] == df["label_2"]]
    df = df.drop(columns=["label_2"]).rename(columns={"label_1":"label"})

    # add the new confident examples to the existing ones
    confident_examples = pd.concat([confident_examples, df])
    confident_examples = confident_examples.reset_index(drop=True)
    #confident_examples["label"] = confident_examples["label"].astype(int)
    
    return confident_examples

In [51]:
def get_confident_examples_traindata(dimension, pair):
    src = "../../../data/labelled_samples_with_ids"
    cols = ["tweet_id", dimension]
    if dimension == "[GOAL]":
        cols = ["tweet_id", dimension, "[GROUP]"]
    df1 = pd.read_csv(
        join(src, pair[0] + ".csv"),
        dtype={"tweet_id":str},
        delimiter=";",
        usecols=cols
    )
    df2 = pd.read_csv(
        join(src, pair[1] + ".csv"),
        dtype={"tweet_id":str},
        delimiter=";",
        usecols=cols
    )
    
    if dimension == "[GOAL]":
        df1 = add_group_values(df1).rename(columns={dimension:"label_1"})
        df2 = add_group_values(df2).rename(columns={dimension:"label_2"})
    else:
        df1 = df1.rename(columns={dimension:"label_1"})
        df2 = df2.rename(columns={dimension:"label_2"})

    df1 = df1[df1["label_1"] != "foreign"]
    df2 = df2[df2["label_2"] != "foreign"]

    shared_ids = df1[df1["tweet_id"].isin(df2["tweet_id"])]["tweet_id"].values
    df1 = df1[df1["tweet_id"].isin(shared_ids)]
    df2 = df2[df2["tweet_id"].isin(shared_ids)]
    df1 = df1.sort_values(by="tweet_id").reset_index(drop=True)
    df2 = df2.sort_values(by="tweet_id").reset_index(drop=True)

    df = pd.concat([df1, df2[["label_2"]]], axis=1)[["tweet_id", "label_1", "label_2"]]
    df = df[df["label_1"] == df["label_2"]]
    df = df.drop(columns=["label_2"]).rename(columns={"label_1":"label"})
    df = df[df["label"] != "foreign"]
    
    if dimension == "[GOAL]":
        df = get_additional_goal_labels(df)
    return df[["tweet_id", "label"]]

In [52]:
def calculate_confidence(row):
    '''Determine the rater agreement across four ratings'''
    labels = list(row[[f"label_{i}" for i in range(1, 5)]].values)
    if len(set(labels)) == 1:
        return "unanimous"
    elif len(set(labels)) == 3:
        return "weak-majority"
    elif len(set(labels)) == 4:
        return "disagreement"
    elif len(set(labels)) == 2 and labels.count(labels[0]) == 2:
        return "split"
    else:
        return "majority"

In [53]:
def get_confidence_label(row, weak_majority=True):
    '''Determine the majority label across four ratings'''
    labels = list(row[[f"label_{i}" for i in range(1, 5)]].values)
    # unanimous agreement
    if len(set(labels)) == 1:
        return labels[0]
    # two raters have the same label, the others are different
    elif len(set(labels)) == 3:
        # if a weak majority is taken as agreement, return the majority label
        if weak_majority:
            return row[[f"label_{i}" for i in range(1, 5)]].value_counts().index[0]
        # if weak majority is taken as disagreement, return NaN
        else:
            return np.nan
    # complete disagreement
    elif len(set(labels)) == 4:
        return np.nan
    # split decision
    elif len(set(labels)) == 2 and labels.count(labels[0]) == 2:
        return np.nan
    # strong majority agreement
    else:
        return row[[f"label_{i}" for i in range(1, 5)]].value_counts().index[0]

In [54]:
def get_confident_examples_testdata(dimension):
    src = "../../../data/labelled_samples_with_ids"
    raters = ["AH", "AS", "EM", "LT"]
    labels = {}
    for i, rater in enumerate(raters):
        tmp = pd.read_csv(
            join(src, f"batch_2_{rater}.csv"),
            dtype={"tweet_id":str},
            delimiter=";",
            usecols=["tweet_id", "text", dimension]
        )
        tmp = tmp.sort_values(by="tweet_id").reset_index(drop=True)
        tmp = tmp.rename(columns={dimension:f"label_{i+1}"})
        labels[rater] = tmp

    df = labels[raters[0]]
    for i, rater in enumerate(raters[1:]):
        df[f"label_{i+2}"] = labels[rater][f"label_{i+2}"]
    df = df[["tweet_id", "text"] + [f"label_{i}" for i in range(1, 5)]].copy()
    
    df["confidence"] = df.apply(calculate_confidence, axis=1)
    df["label"] = df.apply(get_confidence_label, weak_majority=False, axis=1)
    df = df.dropna(subset=["label"])
    df = df[df["label"] != "foreign"]
    
    return df[["tweet_id", "label"]]

In [55]:
src = "../../../../data/labelled_samples_with_ids"
dimensions = ["[STRAGETY]", "[GROUP]", "[GOAL]", 
              "[TARGET]", "[HATE]"]
dimension_map = {
    "[STRATEGY]":"strategy",
    "[GROUP]":"group",
    "[GOAL]":"goal",
    "[HATE]":"hate",
    "[TARGET]":"target"
}

for dimension in dimensions:
    confident_examples = pd.DataFrame()
    for pair in label_pairs:
        # get confident examples from traindata
        confident_examples = pd.concat([
            confident_examples, 
            get_confident_examples_traindata(dimension, pair)
        ])
    # get confident examples from testdata
    confident_examples = pd.concat([
        confident_examples,
        get_confident_examples_testdata(dimension)
    ])
    
    confident_examples = confident_examples.reset_index(drop=True)
    confident_examples = confident_examples.\
        rename(columns={"label":"{}_human_label_confident"\
                        .format(dimension_map[dimension])})
    confident_examples = confident_examples.drop_duplicates()
    
    assert len(confident_examples) == len(confident_examples["tweet_id"].unique())
    
    tweets = pd.merge(
        tweets, 
        confident_examples,
        how="left",
        left_on="tweet_id",
        right_on="tweet_id"
    )

## All labels

In [56]:
src = "../../../data/labelled_samples_with_ids"
raters = {
    "1":["LT"],
    "2":["LT"],
    "3":["AH", "AS", "LT"],
    "4":["AH", "AS", "LT"],
    "5":["AH", "AS", "LT"],
    "6":["AH", "AS", "LT"],
    "7a":["LT"],
    "7b":["LT"],
    "8":["EM", "LT"],
    "9":["EM", "LT"],
    "10":["EM", "LT"],
    "11":["EM", "LT"],
    "12":["EM", "LT"],
    "13":["EM", "LT"],
    "14":["EM"]
}

human_labels = pd.DataFrame()
for batch in raters.keys():#range(1, N_batches + 1):
    labelled = pd.concat([pd.read_csv(
        join(src, f"batch_{batch}_{rater}.csv"),
        delimiter=";", dtype={"tweet_id":str}
    ) for rater in raters[batch]])

    if batch in ["7a", "7b"]: batch = "7"
    batch = int(batch)
    labelled["labelling_batch"] = batch
    labelled = labelled.rename(columns={
        "[STRAGEGY]":"strategy_human_label",
        "[GROUP]":"group_human_label",
        "[GOAL]":"goal_human_label",
        "[HATE]":"hate_human_label",
        "[TARGET]":"target_human_label"
    })
    
    human_labels = pd.concat([human_labels, labelled])
human_labels = human_labels.drop_duplicates(subset=["tweet_id"])
assert len(human_labels) == len(human_labels["tweet_id"].unique())
human_labels = human_labels.drop(columns=["text"])

In [57]:
tweets = pd.merge(
    tweets, 
    human_labels, 
    how="left", 
    left_on="tweet_id", 
    right_on="tweet_id"
)

In [58]:
tweets["labelling_batch"].value_counts()

3.0     1536
5.0     1520
4.0     1508
6.0     1393
1.0     1108
2.0     1070
8.0     1014
12.0    1012
13.0    1009
10.0    1008
9.0     1006
11.0    1005
7.0     1004
14.0     501
Name: labelling_batch, dtype: int64

In [None]:
tweets.columns

# Export validation data for the goal dimension

In [None]:
export_candidates = tweets[(tweets["goal_human_label"].isna()) &\
                           (tweets["goal_human_label_confident"].isna())]
export_candidates["goal"].value_counts()

In [63]:
export_in_both_positive = export_candidates[export_candidates["goal"] == "in_both_positive"]\
    .sample(n=67, random_state=42)
export_out_negative = export_candidates[export_candidates["goal"] == "out_negative"]\
    .sample(n=67, random_state=42)
export_neutral_unint = export_candidates[export_candidates["goal"] == "neutral_unint"]\
    .sample(n=67, random_state=42)

In [64]:
src = "../../../data/additional_samples"
fname = "goal_validation_samples_with_inferred_label.csv"
export = pd.concat([
    export_in_both_positive,
    export_out_negative,
    export_neutral_unint
]).sample(frac=1, random_state=42)
export[["tweet_id", "text", "goal"]].to_csv(join(src, fname), index=False)
fname = "goal_validation_samples.csv"
export[["tweet_id", "text"]].to_csv(join(src, fname), index=False)

# Export inferred data

In [7]:
tweets["strategy_human_label"].value_counts()

opin            4646
unint           1767
sarc            1750
insult-inst     1299
insult-pers     1144
insult-polit     839
insult-ism       835
inconsist        772
info             685
other            622
conseq           385
foreign          348
quest            312
correct          288
Name: strategy_human_label, dtype: int64

In [9]:
tweets["strategy_human_label"].value_counts().sum()

15692

In [60]:
fname = "inferred_data.csv.gzip"
tweets.drop(columns=["text"]).to_csv(
    fname, 
    compression="gzip", 
    index=False
)