In [1]:
import os
import re
import pandas as pd
import numpy as np
from os.path import join
from transformers import XLMRobertaForSequenceClassification
from transformers import XLMRobertaTokenizerFast

# Functions

In [2]:
def get_existing_samples(existing_sample_dir, verbose=False):
    """Load all previously drawn sample batches and determine the batch counter.

    Only regular ``*.csv`` files whose name contains a number are treated as
    batch files. Other directory entries (for example Jupyter's
    ``.ipynb_checkpoints`` folder) are skipped instead of breaking the
    file-name parsing or ``pd.read_csv``.

    Parameters
    ----------
    existing_sample_dir : str
        Directory holding the batch files; the first number in each file
        name is interpreted as the batch number.
    verbose : bool, default False
        If True, print how the loaded samples spread over the labels and
        over the years.

    Returns
    -------
    existing_samples : pandas.DataFrame
        Row-wise concatenation of all batch files (original per-file row
        index is kept) with an added ``label`` column that is 'hate',
        'counter', 'neutral' or NaN, derived from ``hate_score`` and
        ``counter_score``.
    existing_sample_counter : numpy integer
        Highest batch number found in the file names.

    Raises
    ------
    ValueError
        If the directory does not contain any batch file.
    """
    batch_number_pattern = re.compile(r'\d+')

    # sorted() makes the load order independent of the file system
    existing_sample_files = sorted(
        f for f in os.listdir(existing_sample_dir)
        if f.lower().endswith(".csv")
        and batch_number_pattern.search(f) is not None
        and os.path.isfile(join(existing_sample_dir, f))
    )
    if not existing_sample_files:
        raise ValueError(
            f"no batch files (*.csv with a batch number) found in {existing_sample_dir}"
        )

    # the first run of digits in the file name is the batch number
    existing_batches = np.asarray(
        [int(batch_number_pattern.search(f).group()) for f in existing_sample_files]
    )
    existing_sample_counter = existing_batches.max()

    print("loading existing samples", existing_sample_files)
    # read every file once and concatenate in a single call instead of
    # growing a DataFrame (starting from an empty placeholder) in the loop
    frames = [pd.read_csv(join(existing_sample_dir, f)) for f in existing_sample_files]
    existing_samples = pd.concat(frames)

    # object dtype, so the string labels below do not have to be written into
    # a float column; the array is assigned positionally because the
    # concatenated frame carries repeated index values
    existing_samples['label'] = np.full(len(existing_samples), np.nan, dtype=object)

    hate_score = existing_samples['hate_score'].to_numpy()
    counter_score = existing_samples['counter_score'].to_numpy()
    # assignment order matters: a later rule overwrites an earlier one
    existing_samples.loc[hate_score >= 0.8, 'label'] = 'hate'
    existing_samples.loc[counter_score >= 0.8, 'label'] = 'counter'
    existing_samples.loc[(hate_score >= 0.44) & (hate_score <= 0.55), 'label'] = 'neutral'

    if verbose:
        print("existing sample spread to hate / counter / neutral:")
        print(existing_samples['label'].value_counts())
        print()
        print("existing sample spread to the years 2015 / 2016 / 2017 / 2018")
        print(existing_samples["year"].value_counts())
        print()

    return existing_samples, existing_sample_counter


def get_sample_pool(sample_pool_dir):
    """Read the hate, counter and neutral pools and merge them into one frame.

    Each of ``hate.csv``, ``counter.csv`` and ``neutral.csv`` in
    `sample_pool_dir` is read, tagged with its class name in a new
    ``hate_label`` column and de-duplicated on ``tweet_id`` (within the class
    only). The sizes per class and per year are printed.

    Returns the concatenation of the three de-duplicated frames in the order
    hate, counter, neutral.
    """
    sources = (
        ("hate.csv", "hate"),
        ("counter.csv", "counter"),
        ("neutral.csv", "neutral"),
    )

    tagged_frames = []
    for file_name, class_name in sources:
        class_frame = pd.read_csv(join(sample_pool_dir, file_name))
        class_frame["hate_label"] = class_name
        tagged_frames.append(class_frame)

    # keep only the first occurrence of every tweet within each class
    hate_df, counter_df, neutral_df = [
        class_frame.drop_duplicates(subset=["tweet_id"])
        for class_frame in tagged_frames
    ]

    print(f"sample pool hate: {len(hate_df)}")
    print(f"sample pool counter: {len(counter_df)}")
    print(f"sample pool neutral: {len(neutral_df)}")
    print()

    # one shared pool of available samples across all three classes
    pool = pd.concat([hate_df, counter_df, neutral_df])
    pool_years = pool["year"]

    print("sample pool 2015: {}".format(int((pool_years == 2015).sum())))
    print("sample pool 2016: {}".format(int((pool_years == 2016).sum())))
    print("sample pool 2017: {}".format(int((pool_years == 2017).sum())))
    print("sample pool 2018: {}".format(int((pool_years == 2018).sum())))
    print()

    return pool


def create_sample(
    df,
    year_sample_sizes={2015:125, 2016:125, 2017:125, 2018:125},
    years=[2015, 2016, 2017, 2018],
    minority_dimension="strategy",
    minority_classes=[],
    seed=None):
    """Assemble a sample by drawing a fixed number of tweets from each year.

    If `minority_classes` is non-empty, only rows whose value in the
    `minority_dimension` column is one of these classes are eligible.
    For every entry of `years`, `year_sample_sizes[year]` tweets are drawn
    via `get_tweets_by_year` (all draws use the same `seed`), and the yearly
    draws are concatenated in the order of `years`.

    The default dict / lists are shared objects; they are only read here.
    """
    eligible = df
    if len(minority_classes) != 0:
        is_minority = df[minority_dimension].isin(minority_classes)
        eligible = df[is_minority]

    yearly_draws = [
        get_tweets_by_year(
            eligible,
            year,
            year_sample_size=year_sample_sizes[year],
            seed=seed,
        )
        for year in years
    ]

    return pd.concat(yearly_draws)


def get_tweets_by_year(df, year, year_sample_size=100, seed=None):
    """Draw a random sample of tweets from a single year.

    Parameters
    ----------
    df : pandas.DataFrame
        Pool of tweets; must have a ``year`` column.
    year : int
        Year to draw from.
    year_sample_size : int, default 100
        Number of tweets to draw (without replacement).
    seed : int or None, default None
        Passed to ``DataFrame.sample`` as ``random_state``.

    Returns
    -------
    pandas.DataFrame
        Exactly `year_sample_size` rows of `df` with ``year == year``.

    Raises
    ------
    ValueError
        If fewer than `year_sample_size` tweets are available for `year`.
    """
    tweets_of_year = df[df["year"] == year]

    # in cases where there aren't many tweets for a given year, the desired
    # sample size might be larger than the remaining tweets in the sampling
    # pool. This is checked before sampling: DataFrame.sample raises on its
    # own in that situation, so a size check placed after the call would
    # never be reached and the error would not name the affected year.
    available = len(tweets_of_year)
    if year_sample_size > available:
        raise ValueError(
            f"cannot draw {year_sample_size} tweets for year {year}: "
            f"only {available} left in the sampling pool"
        )

    return tweets_of_year.sample(n=year_sample_size, random_state=seed)

# Build sample

## Sampling principles
Starting with batch 7 (the first batch drawn with predictions from transformer models for language and strategy), samples are drawn according to the following principles:
* remove all foreign language tweets
* 20% of samples from year 2016, 40% from 2017 and 40% from 2018
* combine hate, counter and neutral into a single sample pool
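* oversample "construct", "sarc" and "leave_fact" (strategy dimension, batches 2-11); batches 12 and 13 oversample "pose" and "emph" (goal dimension) instead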

## Sampling workflow
* (optional): update label mapping, update training data, copy training data to NVcluster
* train transformer model for strategy on the latest labelled data on the NVcluster
* copy best strategy (or goal) model to `sampling_scripts/strategy_model/` on the NVcluster
* (optional): update label mapping in the output of `sampling_scripts/infer_strategy.py`
* run strategy (or goal) inference on NVcluster
* download the data with the inferred values (rsync command below)
* run rest of this notebook to create new sample batch

In [7]:
# get the best strategy model: copy the contents of the training run's
# results/best_model/ directory on nvcluster into the local best_model/ folder
! rsync -avze ssh jlasse@nvcluster:/home/jlasse/GermanHass/strategy_analysis/roberta/results/best_model/ best_model/ --progress

receiving incremental file list
config.json
          1,050 100%    1.00MB/s    0:00:00 (xfr#1, to-chk=2/4)
pytorch_model.bin
  1,112,278,560 100%  354.29MB/s    0:00:02 (xfr#2, to-chk=1/4)
training_args.bin
          2,799 100%    2.37kB/s    0:00:01 (xfr#3, to-chk=0/4)

sent 233,635 bytes  received 2,754,078 bytes  663,936.22 bytes/sec
total size is 1,112,282,409  speedup is 372.29


In [9]:
# upload the best strategy model: copy the local best_model/ folder into
# sampling_scripts/strategy_model/ on nvcluster
! rsync -avze ssh best_model/ jlasse@nvcluster:/home/jlasse/GermanHass/sampling_scripts/strategy_model/ --progress

sending incremental file list
config.json
          1,050 100%    0.00kB/s    0:00:00 (xfr#1, to-chk=2/4)
pytorch_model.bin
  1,112,278,560 100%   22.53MB/s    0:00:47 (xfr#2, to-chk=1/4)
training_args.bin
          2,799 100%   28.77kB/s    0:00:00 (xfr#3, to-chk=0/4)

sent 1,028,946,087 bytes  received 233,627 bytes  20,379,796.32 bytes/sec
total size is 1,112,282,409  speedup is 1.08


In [10]:
# upload the best language model: copy ../foreign_language_prediction/best_model/
# into sampling_scripts/language_model/ on nvcluster
! rsync -avze ssh ../foreign_language_prediction/best_model/ jlasse@nvcluster:/home/jlasse/GermanHass/sampling_scripts/language_model/

sending incremental file list
./
config.json
pytorch_model.bin
training_args.bin

sent 1,028,873,347 bytes  received 76 bytes  37,413,579.02 bytes/sec
total size is 1,112,269,490  speedup is 1.08


In [3]:
# download the inferred values: copy sampling_scripts/data_split_in_classes/
# from nvcluster into the local data_split_in_classes_inferred_strategy folder
# (the directory that is later used as the sample pool)
! rsync -avze ssh jlasse@nvcluster:/home/jlasse/GermanHass/sampling_scripts/data_split_in_classes/ /home/jana/Projects/CSS_reconquista_internet/analysis/data/tree_samples/data_split_in_classes_inferred_strategy

receiving incremental file list
counter.csv
hate.csv
neutral.csv

sent 67,951 bytes  received 25,317,865 bytes  1,538,534.30 bytes/sec
total size is 58,934,255  speedup is 2.32


In [4]:
# quick check of the downloaded hate pool: number of tweets per value of the
# "strategy" column
df = pd.read_csv("/home/jana/Projects/CSS_reconquista_internet/analysis/data/tree_samples/data_split_in_classes_inferred_strategy/hate.csv")
df["strategy"].value_counts()

1    33339
3    17619
0    13445
2    11133
4     5001
Name: strategy, dtype: int64

In [5]:
# same check for the "goal" column; relies on `df` loaded in the previous cell
df["goal"].value_counts()

2    35705
5    18966
3     9398
1     7352
4     6067
0     3049
Name: goal, dtype: int64

In [3]:
# Sampling configuration for the STRATEGY dimension.
# NOTE: the following cell assigns the same three variables for the goal
# dimension; whichever of the two cells is run last takes effect.
#
# condensed id mapping (fine-grained strategy label -> condensed class id)
#     'info': 0,
#     'opin': 1,
#     'quest': 0,
#     'conseq': 0,
#     'correct': 0,
#     'inconsist': 0,
#     'sarc': 2,
#     'insult-pers': 3,
#     'insult-ism': 3,
#     'insult-polit': 3,
#     'insult-inst': 3,
#     'other': 4,
#     'unint': 4

# name of each condensed class id
condensed_id_to_label = {
    0:"construct",
    1:"opin",
    2:"sarc",
    3:"leave_fact",
    4:"other_new",
}
# column of the sample pool that the sample is restricted to ...
minority_dimension = "strategy"
# ... and the class ids that are kept: construct (0), sarc (2), leave_fact (3)
minority_classes = [0, 2, 3]

In [4]:
# Sampling configuration for the GOAL dimension.
# NOTE: this reassigns the three variables set in the previous (strategy)
# cell; run only the cell for the dimension that should be oversampled, or
# be aware that the cell run last takes effect.
#
# condensed id mapping (fine-grained goal label -> condensed class id)
#     'strength': 0, # pose
#     'just': 0, 
#     'threat': 1,
#     'weak': 2,
#     'emph-ground': 3, # emph
#     'emph-prob': 3,
#     'neutral': 4,
#     'unint': 5

# name of each condensed class id
condensed_id_to_label = {
    0:"pose",
    1:"threat",
    2:"weak",
    3:"emph",
    4:"neutral",
    5:"unint"
}
# column of the sample pool that the sample is restricted to ...
minority_dimension = "goal"
# ... and the class ids that are kept: pose (0), emph (3)
minority_classes = [0, 3]  

In [5]:
# Local directory layout: `existing_sample_dir` holds the batch files that
# were already drawn (new batches are written there as well),
# `sample_pool_dir` holds the downloaded hate / counter / neutral pools.
data_dir = "/home/jana/Projects/CSS_reconquista_internet/analysis/data/"
existing_sample_dir = join(data_dir, "tree_samples/samples")
sample_pool_dir = join(data_dir, "tree_samples/data_split_in_classes_inferred_strategy")

# if True, only pool entries with foreign == 0 are kept when sampling
foreign_language_classification = True
# random seed used by the sampling cell below
seed = 42 # note: batch_1_LT_AS.csv was sampled by Joshua without a seed
# note: batches 2-11 with strategy dimension oversampling (construct, sarc & leave fact)
# note: batches 12, 13 with goal dimension oversampling (pose & emph)

In [6]:
# Draw a new batch: one sample file per rater plus a co-rater subsample.
# Relies on the configuration cells above (existing_sample_dir,
# sample_pool_dir, foreign_language_classification, seed, minority_dimension,
# minority_classes, condensed_id_to_label).
raters = ["EM"]
co_raters = {"EM":"AH"}  # rater -> co-rater who receives a subsample of the rater's batch
year_sample_sizes = {2015:0, 2016:100, 2017:200, 2018:200}
years = [2015, 2016, 2017, 2018]
co_rater_frac = 0.1  # fraction of each rater's sample that is also given to the co-rater

# following the naming convention, the batch counter does not increase for
# batches which are only rated by a single rater. I.e. there will be a
# batch_3_AS, batch_3_LT and batch_3_AH file for the three raters AS, LT and AH
# for the third batch. To ensure this behaviour, we get the batch counter before
# we iterate over the raters to create new files.
_, existing_sample_counter = get_existing_samples(existing_sample_dir)

for rater in raters:
    print(f"*** drawing samples for batch {existing_sample_counter + 1} for rater {rater} ***")

    # load the existing samples including samples that were created for the
    # current batch, but do not increase the batch counter
    existing_samples, _ = get_existing_samples(existing_sample_dir, verbose=True)

    # load the available pool of examples
    sample_pool = get_sample_pool(sample_pool_dir)

    # remove foreign language entries from the available pool of samples
    if foreign_language_classification:
        sample_pool = sample_pool[sample_pool["foreign"] == 0]

    # remove the existing samples from the available pool of examples
    df = sample_pool[~sample_pool["tweet_id"].isin(existing_samples["tweet_id"])].copy()
    for hate_label in ["hate", "counter", "neutral"]:
        n_remaining = len(df[df["hate_label"] == hate_label])
        print(f"remaining samples {hate_label}: {n_remaining}")
    print()
    for year in years:
        n_remaining = len(df[df["year"] == year])
        print(f"remaining samples {year}: {n_remaining}")
    print()

    # use the configured seed (not a literal), so that changing `seed` in the
    # configuration cell affects the main sample and the co-rater sample alike
    sample = create_sample(
        df,
        year_sample_sizes=year_sample_sizes,
        years=years,
        minority_dimension=minority_dimension,
        minority_classes=minority_classes,
        seed=seed)

    # replace semicolons with commas, because we use semicolons as delimiters
    sample["text"] = sample["text"].apply(lambda x: x.replace(";", ","))

    # sanity check ensuring that no Tweets in the newly created sample are
    # already included in the existing samples
    assert len(set(sample["tweet_id"])\
               .intersection(set(existing_samples["tweet_id"]))) == 0

    # sanity check that there are no duplicated tweets in the sample
    assert len(sample) == len(sample["tweet_id"].drop_duplicates())

    # save the new sample to disk. The file name encodes the batch number and
    # rater
    sample_name = f"batch_{existing_sample_counter + 1}_{rater}.csv"
    # drop the model-derived columns before the sample is written out.
    # NOTE(review): "strategy" is hard-coded here while `minority_dimension`
    # may be "goal" - confirm which columns the pool files contain and whether
    # the "goal" column should be dropped as well.
    sample = sample.drop(columns=["foreign", "hate_label", "strategy"] + \
                         list(condensed_id_to_label.values()))
    sample.to_csv(join(existing_sample_dir, sample_name), index=False)

    # size of the co-rater subsample, based on the years that were actually
    # sampled (not on every entry of `year_sample_sizes`)
    total_sample_size = sum(year_sample_sizes[year] for year in years)
    co_rater_sample_size = int(total_sample_size * co_rater_frac)
    co_rater_sample = sample.sample(n=co_rater_sample_size, random_state=seed)
    co_rater_sample_name = f"batch_{existing_sample_counter + 1}_{rater}_{co_raters[rater]}.csv"
    co_rater_sample.to_csv(join(existing_sample_dir, co_rater_sample_name),
                           index=False)

    print()
    print("****************************")
    print()

loading existing samples ['batch_3_AH_LT.csv', 'batch_4_AH.csv', 'batch_9_LT_EM.csv', 'batch_9_EM.csv', 'batch_5_LT_AH.csv', 'batch_6_AS_AH.csv', 'batch_7b_LT.csv', 'batch_8_EM_LT.csv', 'batch_11_EM.csv', 'batch_5_LT.csv', 'batch_5_AH.csv', 'batch_13_LT.csv', 'batch_12_LT_EM.csv', 'batch_3_AH.csv', 'batch_12_EM.csv', 'batch_10_LT_EM.csv', 'batch_9_EM_LT.csv', 'batch_4_AS_AH.csv', 'batch_7_LT2_AH.csv', 'batch_8_EM.csv', 'batch_3_AS_AH.csv', 'batch_3_LT_AH.csv', 'batch_11_LT.csv', 'batch_2_LT_AS_AH.csv', 'batch_10_EM.csv', 'batch_13_EM_LT.csv', 'batch_10_LT.csv', 'batch_4_LT_AH.csv', 'batch_4_AH_LT.csv', 'batch_11_LT_EM.csv', 'batch_7_LT1_AH.csv', 'batch_6_AH_LT.csv', 'batch_7a_LT.csv', 'batch_5_AH_LT.csv', 'batch_8_LT_EM.csv', 'batch_6_AH.csv', 'batch_13_EM.csv', 'batch_13_LT_EM.csv', 'batch_3_LT.csv', 'batch_4_LT.csv', 'batch_1_LT_AS.csv', 'batch_6_LT.csv', 'batch_8_LT.csv', 'batch_10_EM_LT.csv', 'batch_9_LT.csv', 'batch_12_EM_LT.csv', 'batch_6_AS.csv', 'batch_4_AS.csv', 'batch_12_LT.c