In [1]:
import os
import re
import pandas as pd
import numpy as np
from os.path import join
from joblib import load

# Functions

In [2]:
def get_existing_samples(existing_sample_dir, verbose=False):
    """Load all previously drawn sample batches from a directory.

    Sample files are expected to be named "batch_<number>_<raters>.csv" and
    to contain the columns "hate_score", "counter_score" (and "year" if
    verbose output is requested).

    Parameters
    ----------
    existing_sample_dir : str
        Directory containing the sample CSV files.
    verbose : bool
        If True, print how the loaded samples spread over the labels and
        over the years.

    Returns
    -------
    existing_samples : pandas.DataFrame
        Concatenation of all sample files with an additional "label" column
        ("hate", "counter", "neutral" or NaN) derived from the scores.
    existing_sample_counter : int
        Highest batch number found in the file names.

    Raises
    ------
    IndexError
        If the directory contains no CSV files.
    """
    # only CSV files are samples; other directory entries (e.g. notebook
    # checkpoint folders) must not be handed to the CSV reader
    existing_sample_files = sorted(
        f for f in os.listdir(existing_sample_dir) if f.endswith(".csv")
    )
    if not existing_sample_files:
        raise IndexError(f"no sample files found in {existing_sample_dir}")

    # The batch number is compared numerically. Relying on the last entry of
    # the lexicographically sorted list breaks from batch 10 on, because
    # "batch_10_..." sorts before "batch_2_...".
    existing_sample_counter = max(
        int(f.split("_")[1]) for f in existing_sample_files
    )

    print("loading existing samples", existing_sample_files)
    # read everything first and concatenate once instead of growing the
    # frame file by file
    existing_samples = pd.concat(
        [pd.read_csv(join(existing_sample_dir, f)) for f in existing_sample_files]
    )

    # Derive the label from the scores. Later assignments overwrite earlier
    # ones, i.e. the precedence is neutral > counter > hate. The column is
    # built as an object array so that it can hold both NaN and strings.
    is_hate = (existing_samples['hate_score'] >= 0.8).to_numpy()
    is_counter = (existing_samples['counter_score'] >= 0.8).to_numpy()
    is_neutral = ((existing_samples['hate_score'] >= 0.44) &
                  (existing_samples['hate_score'] <= 0.55)).to_numpy()

    labels = np.full(len(existing_samples), np.nan, dtype=object)
    labels[is_hate] = 'hate'
    labels[is_counter] = 'counter'
    labels[is_neutral] = 'neutral'
    existing_samples['label'] = labels

    if verbose:
        print("existing sample spread to hate / counter / neutral:")
        print(existing_samples['label'].value_counts())
        print()
        print("existing sample spread to the years 2015 / 2016 / 2017 / 2018")
        print(existing_samples["year"].value_counts())
        print()

    return existing_samples, existing_sample_counter


def get_sample_pool(sample_pool_dir):
    """Load the hate, counter and neutral pools and merge them into one frame.

    Reads "hate.csv", "counter.csv" and "neutral.csv" from sample_pool_dir,
    tags every row with its class in a new "hate_label" column and removes
    rows with a duplicated "tweet_id" within each class. Pool sizes per class
    and per year (2015-2018) are printed.

    Parameters
    ----------
    sample_pool_dir : str
        Directory containing the three class CSV files.

    Returns
    -------
    pandas.DataFrame
        All three class pools stacked in the order hate, counter, neutral.
    """
    pools = {}
    for label in ("hate", "counter", "neutral"):
        pool = pd.read_csv(join(sample_pool_dir, f"{label}.csv"))
        pool["hate_label"] = label
        # duplicates are only removed within a class, not across classes
        pools[label] = pool.drop_duplicates(subset=["tweet_id"])

    for label, pool in pools.items():
        print(f"sample pool {label}: {len(pool)}")
    print()

    # combine hate, counter and neutral into one pool of available samples
    df = pd.concat(list(pools.values()))

    for year in (2015, 2016, 2017, 2018):
        print("sample pool {}: {}".format(year, len(df[df["year"] == year])))
    print()

    return df


def clean_text(df):
    """Add a "text_clean" column with URLs removed and text lowercased.

    The column is added to the passed frame in place; the same frame is
    returned for convenience. The "text" column itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "text" column holding strings.

    Returns
    -------
    pandas.DataFrame
        The input frame with the additional "text_clean" column.
    """
    url_pattern = re.compile(r"https?:\/\/\S*", flags=re.MULTILINE)

    # Both steps are applied to the same value: the URL-free text is what
    # gets lowercased. Lowercasing the raw "text" column in a second,
    # separate assignment would throw the URL removal away again.
    df["text_clean"] = df["text"]\
        .apply(lambda text: url_pattern.sub("", text).lower())

    return df


def create_sample(
    df, 
    year_sample_sizes={2015:125, 2016:125, 2017:125, 2018:125},
    years=[2015, 2016, 2017, 2018],
    classifier_src="../strategy_analysis_roberta",
    classifier_model="MultinomialNB",
    embedding="TfidfVectorizer",
    ensemble_raters=["AS", "LT"],
    N=1000,
    class_label_dict=None,
    class_bias=False,
    seed=None):
    """Draw a per-year sample from the tweets the classifier ensemble agrees on.

    Parameters
    ----------
    df : pandas.DataFrame
        Pool of candidate tweets; passed on to ensemble_prediction and
        get_tweets_by_year.
    year_sample_sizes : dict
        Maps each year to the number of tweets to draw for that year.
    years : list
        Years to draw from; every entry must be a key of year_sample_sizes.
    classifier_src, classifier_model, embedding, ensemble_raters, N
        Passed on to get_classifiers to locate the stored models.
    class_label_dict : dict or None
        Mapping used to translate the values of the "pred" column into
        labels. Only used if class_bias is set.
    class_bias : str or False
        If truthy, only tweets whose translated prediction equals this label
        are kept before sampling.
    seed : int or None
        Random state used for sampling.

    Returns
    -------
    pandas.DataFrame
        The samples of all requested years, stacked in the order of years.
    """
    models = get_classifiers(
        classifier_src, classifier_model, embedding, ensemble_raters, N)

    predicted = ensemble_prediction(df, models, ensemble_raters)

    if class_bias:
        # restrict the pool to the class that should be oversampled
        predicted["pred_label"] = predicted["pred"].replace(class_label_dict)
        predicted = predicted[predicted["pred_label"] == class_bias]

    per_year_samples = [
        get_tweets_by_year(
            predicted, year,
            year_sample_size=year_sample_sizes[year], seed=seed)
        for year in years
    ]

    return pd.concat(per_year_samples)


def ensemble_prediction(df, classifiers, ensemble_raters):
    """Predict with one classifier per rater and keep the unanimous rows.

    For every rater the "text_clean" column is embedded and classified; the
    prediction is stored in a "pred_<rater>" column of the passed frame (the
    input frame is modified in place by this).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "text_clean" column.
    classifiers : dict
        Maps each rater to a dict with the entries "embedding" (offering
        transform) and "classifier" (offering predict).
    ensemble_raters : list
        Raters whose classifiers form the ensemble.

    Returns
    -------
    pandas.DataFrame
        Rows for which all classifiers predicted the same value. The shared
        prediction is stored in the column "pred"; the remaining per-rater
        columns and "text_clean" are removed.
    """
    pred_columns = []
    for rater in ensemble_raters:
        model = classifiers[rater]
        features = model["embedding"].transform(df["text_clean"]).toarray()
        column = f"pred_{rater}"
        df[column] = model["classifier"].predict(features)
        pred_columns.append(column)

    # retain only entries for which all classifiers agree
    unanimous = df[pred_columns]\
        .apply(lambda row: len(set(row.values)) == 1, axis=1)
    agreed = df[unanimous]

    # the first rater's column carries the shared prediction, all other
    # helper columns are no longer needed
    agreed = agreed.drop(columns=pred_columns[1:] + ["text_clean"])
    return agreed.rename(columns={pred_columns[0]: "pred"})


def get_classifiers(src, classifier_model, embedding, ensemble_raters, N):
    """Load the stored embedding and classifier of every ensemble rater.

    The models are read from
    "<src>/models/<embedding>/rater-<rater>_N-<N>.joblib" and
    "<src>/models/<classifier_model>/rater-<rater>_N-<N>.joblib".

    Parameters
    ----------
    src : str
        Base directory containing the "models" folder.
    classifier_model : str
        Name of the sub-folder holding the classifiers.
    embedding : str
        Name of the sub-folder holding the embeddings.
    ensemble_raters : list
        Raters for which models are loaded.
    N : int
        Value of the "N" part of the model file names.

    Returns
    -------
    dict
        Maps each rater to a dict with the entries "embedding" and
        "classifier".
    """
    model_dir = join(src, "models")
    classifiers = {}
    for rater in ensemble_raters:
        file_name = f"rater-{rater}_N-{N}.joblib"
        # NOTE: joblib files are pickle-based; only load files from a
        # trusted source
        classifiers[rater] = {
            "embedding": load(join(model_dir, embedding, file_name)),
            "classifier": load(join(model_dir, classifier_model, file_name)),
        }

    return classifiers


def get_tweets_by_year(df, year, year_sample_size=100, seed=None):
    """Draw a random sample of tweets from a single year.

    Parameters
    ----------
    df : pandas.DataFrame
        Pool of tweets with a "year" column.
    year : int
        Year to sample from.
    year_sample_size : int
        Number of tweets to draw (without replacement).
    seed : int or None
        Random state passed to the sampling.

    Returns
    -------
    pandas.DataFrame
        Exactly year_sample_size rows of the requested year.

    Raises
    ------
    ValueError
        If fewer than year_sample_size tweets of the requested year are
        available.
    """
    year_df = df[df["year"] == year]

    # In cases where there aren't many tweets for a given year, the desired
    # sample size might be larger than the remaining tweets in the sampling
    # pool. This is checked before sampling: the sampling call itself would
    # raise as well, but without naming the year or the available count.
    if len(year_df) < year_sample_size:
        raise ValueError(
            f"cannot draw {year_sample_size} samples for year {year}: "
            f"only {len(year_df)} tweets are available")

    return year_df.sample(n=year_sample_size, random_state=seed)

# Build sample

## Sampling principles:
* by default, 25% of samples from each of the four years (2015, 2016, 2017, 2018); the per-year sample sizes can be changed via `year_sample_sizes`
* combine hate, counter and neutral into a single sample pool
* oversample a given class (most likely the minority class)

In [3]:
# general settings of the sampling run
check_existing_samples = True
seed = 42 # note: batch_1_LT_AS.csv was sampled by Joshua without a seed

# directory with the already drawn batches and directory with the pool of
# available tweets (one file per class)
data_dir = "/home/jana/Projects/CSS_reconquista_internet/analysis/data/"
existing_sample_dir = join(data_dir, "tree_samples/samples")
sample_pool_dir = join(data_dir, "tree_samples/data_split_in_classes")

# labels of the condensed classes, keyed by their integer id
#
# setup used for drawing batch 3:
#     0:"minority", 1:"opin", 2:"other", 3:"foreign"
#
# setup used for drawing batch 4 & 5:
condensed_id_to_label = dict(enumerate([
    "minority",
    "opin",
    "sarc",
    "other",
    "unint",
    "foreign",
]))

In [4]:
# raters for which a batch is drawn and the co-rater that double-rates a
# fraction of each rater's batch
raters = ["AS", "LT", "AH"]
co_raters = {"AS":"AH", "LT":"AH", "AH":"LT"}

# stored models used to pre-classify the pool
classifier_src = "../strategy_analysis/tfidf"
classifier_model = "LinearSVC"
embedding = "TfidfVectorizer"
N_labelled_samples = 5500

# number of tweets drawn per year and share of each batch that is also
# handed to the co-rater
year_sample_sizes = {2015:0, 2016:100, 2017:200, 2018:200}
years = [2015, 2016, 2017, 2018]
co_rater_frac = 0.1

# following the naming convention, the batch counter does not increase for
# batches which are only rated by a single rater. I.e. there will be a
# batch_3_AS, batch_3_LT and batch_3_AH file for the three raters AS, LT and AH
# for the third batch. To ensure this behaviour, we get the batch counter before
# we iterate over the raters to create new files.
_, existing_sample_counter = get_existing_samples(existing_sample_dir)
batch_number = existing_sample_counter + 1

for rater in raters:
    print(f"*** drawing samples for batch {batch_number} for rater {rater} ***")
    
    # load the existing samples including samples that were created for the
    # current batch, but do not increase the batch counter. This has to be
    # repeated for every rater, because the files written for the previous
    # rater must be excluded from the pool as well.
    existing_samples, _ = get_existing_samples(existing_sample_dir, verbose=True)

    # load the available pool of examples
    sample_pool = get_sample_pool(sample_pool_dir)

    # remove the existing samples from the available pool of examples
    df = sample_pool[~sample_pool["tweet_id"].isin(existing_samples["tweet_id"])].copy()
    for hate_label in ("hate", "counter", "neutral"):
        print("remaining samples {}: {}".format(
            hate_label, len(df[df["hate_label"] == hate_label])))
    print()
    for year in years:
        print("remaining samples {}: {}".format(
            year, len(df[df["year"] == year])))
    print()
    # the full pool is not needed anymore, free the memory
    del sample_pool

    # clean the text (necessary to embed it)
    df = clean_text(df)

    # the configured seed is passed on (instead of a literal), so that the
    # main draw and the co-rater draw below always use the same setting
    sample = create_sample(
        df, 
        year_sample_sizes=year_sample_sizes,
        years=years,
        classifier_src=classifier_src,
        classifier_model=classifier_model,
        embedding=embedding,
        ensemble_raters=["AS", "LT"],
        N=N_labelled_samples,
        class_label_dict=condensed_id_to_label,
        class_bias="minority",
        seed=seed)
    
    # replace semicolons with commas, because we use semicolons as delimiters
    sample["text"] = sample["text"].apply(lambda x: x.replace(";", ","))
    
    # sanity check ensuring that no Tweets in the newly created sample are
    # already included in the existing samples
    assert len(set(sample["tweet_id"])\
               .intersection(set(existing_samples["tweet_id"]))) == 0
    
    # sanity check that there are no duplicated tweets in the sample
    assert len(sample) == len(sample["tweet_id"].drop_duplicates())
    
    # save the new sample to disk. The file name encodes the batch number and
    # rater. Helper columns created during sampling are not stored.
    sample_name = f"batch_{batch_number}_{rater}.csv"
    sample = sample.drop(columns=["pred", "hate_label", "pred_label"])
    sample.to_csv(join(existing_sample_dir, sample_name), index=False)
    
    # draw the subset of the new sample that is additionally rated by the
    # co-rater and store it in its own file
    total_sample_size = sum(year_sample_sizes.values())
    co_rater_sample_size = int(total_sample_size * co_rater_frac)
    co_rater_sample = sample.sample(n=co_rater_sample_size, random_state=seed)
    co_rater_sample_name = f"batch_{batch_number}_{rater}_{co_raters[rater]}.csv"
    co_rater_sample.to_csv(join(existing_sample_dir, co_rater_sample_name),
                           index=False)
    
    print()
    print("****************************")
    print()

loading existing samples ['batch_1_LT_AS.csv', 'batch_2_LT_AS_AH.csv', 'batch_3_AH.csv', 'batch_3_AH_LT.csv', 'batch_3_AS.csv', 'batch_3_AS_AH.csv', 'batch_3_LT.csv', 'batch_3_LT_AH.csv', 'batch_4_AH.csv', 'batch_4_AH_LT.csv', 'batch_4_AS.csv', 'batch_4_AS_AH.csv', 'batch_4_LT.csv', 'batch_4_LT_AH.csv', 'batch_5_AH.csv', 'batch_5_AH_LT.csv', 'batch_5_AS.csv', 'batch_5_AS_AH.csv', 'batch_5_LT.csv', 'batch_5_LT_AH.csv']
*** drawing samples for batch 6 for rater AS ***
loading existing samples ['batch_1_LT_AS.csv', 'batch_2_LT_AS_AH.csv', 'batch_3_AH.csv', 'batch_3_AH_LT.csv', 'batch_3_AS.csv', 'batch_3_AS_AH.csv', 'batch_3_LT.csv', 'batch_3_LT_AH.csv', 'batch_4_AH.csv', 'batch_4_AH_LT.csv', 'batch_4_AS.csv', 'batch_4_AS_AH.csv', 'batch_4_LT.csv', 'batch_4_LT_AH.csv', 'batch_5_AH.csv', 'batch_5_AH_LT.csv', 'batch_5_AS.csv', 'batch_5_AS_AH.csv', 'batch_5_LT.csv', 'batch_5_LT_AH.csv']
existing sample spread to hate / counter / neutral:
hate       3589
neutral    2029
counter    1332
Name: l