In [3]:
import os,sys
sys.path.append("../") 
from germanhass.ReplyTrees.ReplyTreeWalker import ReplyTreeWalker
from germanhass.DBCode.HassDBAdapter import HassDBAdapter
from tqdm import tqdm
import pandas as pd
import numpy as np
from os.path import join



In [4]:
# Handle to the project database; the reply trees are read from
# db.ReplyTree_coll further down.
db = HassDBAdapter()
# Minimum number of nodes a reply tree must have to be used; trees with fewer
# nodes are skipped when the dataframes are built.
node_cut_off = 50

# Functions

In [5]:
def count_nodes(node,count):
    """Tree-walk callback that advances a running node counter.

    The visited `node` is not inspected; the function returns `count`
    increased by one.
    """
    return count + 1

def print_nodes(node):
    """Print `node` to standard output.

    Debugging helper; its only visible use is the commented-out
    `RT.walk(print_nodes)` call in the dataframe-building cell.
    """
    print(node)

def build_dict(node,parameters):
    """Tree-walk callback that appends one tweet to the column lists.

    `parameters` is a two-item sequence: the id of the tree being walked and
    a dict mapping column names to lists. One value per column is appended
    for `node`, and the same `parameters` object is returned.
    """
    tree_id, columns = parameters[0], parameters[1]

    # The year is stored as the last four characters of `created_at`,
    # i.e. as a string.
    columns["year"].append(node["created_at"][-4:])
    columns["tree_id"].append(tree_id)
    columns["tweet_id"].append(node["_id"])

    scores = node['scores']
    columns["hate_score"].append(scores['hate'])
    columns["counter_score"].append(scores['counter'])

    # Line breaks and commas are stripped from the text before it is stored.
    without_newlines = node["full_text"].replace("\n","")
    columns["text"].append(without_newlines.replace(",",""))
    return parameters

def remove_root_nodes(all_tweets):
    """Turn the collected columns into a DataFrame without the root tweets.

    A row counts as a root tweet when its `tweet_id` equals its `tree_id`;
    every other row is kept.
    """
    tweets = pd.DataFrame(all_tweets)
    is_reply = tweets.tree_id != tweets.tweet_id
    return tweets[is_reply]

def stable_user(username):
    """Return True if `username` (compared case-insensitively) is one of the
    accounts on the fixed list below, otherwise False."""
    stable_accounts = {
        "diezeit", "derspiegel", "spiegelonline",
        "faznet", "morgenmagazin", "tagesschau",
        "tagesthemen", "zdfheute", "annewilltalk",
        "augstein", "dunjahayali", "janboehm",
        "jkasek", "maischberger", "nicolediekmann",
        "cem_oezdemir", "c_lindner", "goeringeckardt",
        "heikomaas", "olafscholz", "regsprecher",
        "renatekuenast",
    }
    return username.lower() in stable_accounts


def create_sample(hate_df, counter_df, neutral_df, sample_size=10, 
                  neutral_sample_size=1, seed=None,
                  years=(2015, 2016, 2017, 2018)):
    """Draw a sample that is balanced over years and tweet classes.

    For every year in `years`, `sample_size` rows are drawn from `hate_df`,
    `sample_size` rows from `counter_df` and `neutral_sample_size` rows from
    `neutral_df` (all via `get_tweets_by_year`, all with the same `seed`).

    Parameters
    ----------
    hate_df, counter_df, neutral_df : DataFrame
        Class pools; each needs a `year` column.
    sample_size : int
        Rows per year drawn from the hate pool and from the counter pool.
    neutral_sample_size : int
        Rows per year drawn from the neutral pool.
    seed : int or None
        Passed on as `random_state` for every draw.
    years : iterable of int
        Years to sample from. Defaults to the four years used so far, so
        existing calls behave as before.

    Returns
    -------
    DataFrame
        All draws concatenated in the order year -> hate, counter, neutral.

    Note: `get_tweets_by_year` returns the complete year slice instead of a
    sample when the requested size is falsy (None or 0).
    """
    frames = []
    for year in years:
        for pool, n_rows in ((hate_df, sample_size),
                             (counter_df, sample_size),
                             (neutral_df, neutral_sample_size)):
            frames.append(
                get_tweets_by_year(pool, year=year, sample_size=n_rows, seed=seed)
            )

    return pd.concat(frames)


def get_tweets_by_year(df, year=2015, sample_size=None, seed=None):
    """Return the rows of `df` from `year`, optionally as a random sample.

    Parameters
    ----------
    df : DataFrame
        Needs a `year` column.
    year : int or str
        Year to select.
    sample_size : int or None
        If truthy, that many rows are drawn without replacement; otherwise
        (None or 0) every row of the year is returned.
    seed : int or None
        `random_state` for the draw.

    Returns
    -------
    DataFrame
        The selected rows.
    """
    year_column = df["year"]
    # build_dict() stores the year as a 4-character string, while frames read
    # back from CSV hold integers. A plain `==` against an int silently
    # matches nothing on the string variant, so compare on the text form too.
    in_year = (year_column == year) | (year_column.astype(str) == str(year))
    selected = df[in_year]
    if sample_size:
        return selected.sample(n=sample_size, random_state=seed)
    return selected

def get_hate_tweets_df(df,sample_size=None, threshold=0.8):
    """Select the rows of `df` whose `hate_score` is at least `threshold`.

    `sample_size` is accepted but not used.
    """
    is_hate = df.hate_score >= threshold
    return df[is_hate]
def get_counter_tweets_df(df,sample_size=None, threshold=0.8):
    """Select the rows of `df` whose `counter_score` is at least `threshold`.

    `sample_size` is accepted but not used.
    """
    is_counter = df.counter_score >= threshold
    return df[is_counter]
def get_neutral_tweets_df(df, sample_size=None, 
                          threshold1=0.44, threshold2=0.55):
    """Select the rows of `df` whose `hate_score` lies between `threshold1`
    and `threshold2`, both bounds included.

    `sample_size` is accepted but not used.
    """
    above_lower = df.hate_score >= threshold1
    below_upper = df.hate_score <= threshold2
    return df[above_lower & below_upper]

# Build dataframes

In [6]:
# Column-oriented accumulator: one list per output column. build_dict()
# appends one value to every list for each tree node it visits.
all_tweets = {"year":[],
              "tree_id":[],
              "tweet_id":[],
              "hate_score":[],
              "counter_score":[],
              "text":[]
              }

# Number of stored reply trees; only used as the progress-bar length.
total = db.ReplyTree_coll.count_documents({})
for tree in tqdm(db.ReplyTree_coll.find({}),total=total):
    _id  = tree["_id"]
    RT = ReplyTreeWalker(tree)
    # Count the nodes of this tree (count_nodes adds one per call).
    # NOTE(review): assumes walk_w_parameters threads the running value
    # through every node and returns it -- confirm against ReplyTreeWalker.
    num_nodes =RT.walk_w_parameters(count_nodes,parameters=0)
    # Only trees with at least node_cut_off nodes ...
    if num_nodes >= node_cut_off:
        # ... whose screen_name is on the stable_user() list are collected.
        if stable_user(tree["screen_name"]):
            
            # Optional: dump the tree to disk. This needs `import json`,
            # which the import cell shown above does not include.
            #with open("../results/colored_trees/"+_id+".json", 'w') as outfile:
            #    json.dump(RT.doc, outfile)
            #RT.walk(print_nodes)
            #print(parameters[0])
            # Second walk: build_dict appends this tree's tweets to all_tweets.
            # NOTE(review): the extra positional None is only passed in this
            # call; its meaning is defined by ReplyTreeWalker -- confirm.
            parameters = RT.walk_w_parameters(build_dict,None,parameters=[_id,all_tweets])
            all_tweets=parameters[1]
        #else:
        #    print("Skipping",tree["screen_name"])

# Remove the root nodes from the sample (rows whose tweet_id equals tree_id)
df = remove_root_nodes(all_tweets)

# Split the dataframe by tweet type; the three pools use the default
# thresholds of the respective helper functions.
hate_df = get_hate_tweets_df(df)
counter_df = get_counter_tweets_df(df)
neutral_df = get_neutral_tweets_df(df)



100%|█████████████████████████████████████████████████████████| 204544/204544 [00:29<00:00, 6969.98it/s]


In [10]:
# only select the most extreme Tweets for the sampling
# see also the thresholds in get_hate_tweets_df(), get_counter_tweets_df()
# and get_neutral_tweets_df()
#
# The label column starts out as an object column filled with NaN. Creating
# it with `df['label'] = np.nan` would give a float column, and writing
# strings into that relies on silent upcasting, which pandas deprecated in 2.1.
df['label'] = np.full(len(df), np.nan, dtype=object)
# Later assignments win: a row that satisfies several conditions keeps the
# label of the last matching line.
df.loc[df['hate_score'] >= 0.8, 'label'] = 'hate'
df.loc[df['counter_score'] >= 0.8, 'label'] = 'counter'
df.loc[(df['hate_score'] >= 0.44) & (df['hate_score'] <= 0.55), 'label'] = 'neutral'
# Rows without a label stay NaN and are not counted here.
df['label'].value_counts()

hate       80537
neutral    59149
counter    22237
Name: label, dtype: int64

In [12]:
# Share of all non-root tweets that ended up in one of the three class pools.
# A tweet that is in two pools is counted twice by this sum.
# The fraction is computed first and the f-string kept on one line: the
# former backslash continuation inside the string literal leaked the source
# indentation into the printed text.
labelled_fraction = (len(hate_df) + len(counter_df) + len(neutral_df)) / len(df)
print(f"fraction of labelled tweets: {labelled_fraction}")

fraction of labelled tweets:     0.45488716523908385


In [38]:
# Write each class pool to its own CSV file (without the index column) so
# that the sampling cells below can load the pools from disk.
dst = "/home/jana/Projects/CSS_reconquista_internet/analysis/data/tree_samples/data_split_in_classes"
for pool, filename in ((hate_df, "hate.csv"),
                       (counter_df, "counter.csv"),
                       (neutral_df, "neutral.csv")):
    pool.to_csv(join(dst, filename), index=False)

# Build sample

## Sampling principles:
* 25% of samples from each of the four years (2015, 2016, 2017, 2018)
* 40% of samples hatespeech, 40% of samples counterspeech, 20% of samples neutral speech

In [6]:
# Whether previously drawn batches are read from existing_sample_dir.
check_existing_samples = True
# random_state for all draws of a batch
seed = 42 # note: 1_tree_sample.csv was sampled by Joshua without a seed
# Directory with the batches drawn so far; new batches are written here too.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
existing_sample_dir = "/home/jana/Projects/CSS_reconquista_internet/analysis/data/tree_samples/samples"

In [8]:
# Sample files present in the target directory. Only CSV files are
# considered, so stray entries in the directory are not read as batches.
existing_sample_files = sorted(
    name for name in os.listdir(existing_sample_dir) if name.endswith(".csv")
)

# Number of the latest batch = largest integer token in any file-name stem,
# e.g. "1_tree_sample.csv" -> 1 and "batch_12.csv" -> 12. Taking the maximum
# over all files avoids relying on lexicographic order ("batch_10" sorts
# before "batch_2"), and parsing the stem avoids int("12.csv").
# The counter is determined even when the samples are not loaded, so that a
# new batch never reuses the name of an existing file.
batch_numbers = [
    int(token)
    for name in existing_sample_files
    for token in os.path.splitext(name)[0].split("_")
    if token.isdecimal()
]
existing_sample_counter = max(batch_numbers) if batch_numbers else 0

if check_existing_samples and existing_sample_files:
    # Read every file once and concatenate in a single call.
    existing_samples = pd.concat(
        [pd.read_csv(join(existing_sample_dir, name)) for name in existing_sample_files]
    )
else:
    # Nothing to load: keep an empty frame with the columns of the tweet
    # frames built in this notebook, so the cells below still run.
    existing_samples = pd.DataFrame(
        columns=["year", "tree_id", "tweet_id", "hate_score", "counter_score", "text"]
    )

# Re-derive the class labels with the same thresholds that define the pools.
# The column is created with object dtype: writing strings into a float
# column relies on silent upcasting, which pandas deprecated in 2.1.
existing_samples['label'] = np.full(len(existing_samples), np.nan, dtype=object)
existing_samples.loc[existing_samples['hate_score'] >= 0.8, 'label'] = 'hate'
existing_samples.loc[existing_samples['counter_score'] >= 0.8, 'label'] = 'counter'
existing_samples.loc[(existing_samples['hate_score'] >= 0.44) & \
                     (existing_samples['hate_score'] <= 0.55), 'label'] = 'neutral'

In [10]:
# Class balance of the samples drawn so far; the sampling principles above
# call for 40% hate, 40% counter and 20% neutral.
existing_samples['label'].value_counts()

hate       800
counter    800
neutral    400
Name: label, dtype: int64

In [11]:
# Year balance of the samples drawn so far; the sampling principles above
# call for 25% from each of 2015, 2016, 2017 and 2018.
existing_samples["year"].value_counts()

2015    500
2016    500
2017    500
2018    500
Name: year, dtype: int64

In [13]:
# load the available pool of examples (one CSV file per tweet class)
src = "/home/jana/Projects/CSS_reconquista_internet/analysis/data/tree_samples/data_split_in_classes"
hate_df, counter_df, neutral_df = (
    pd.read_csv(join(src, filename))
    for filename in ("hate.csv", "counter.csv", "neutral.csv")
)

# report how many candidate tweets each pool holds
print(f"sample pool hate: {len(hate_df)}")
print(f"sample pool counter: {len(counter_df)}")
print(f"sample pool neutral: {len(neutral_df)}")

sample pool hate: 80537
sample pool counter: 22237
sample pool neutral: 59149


In [14]:
# remove the existing samples from the available pool of examples, so that
# no tweet can be drawn a second time
already_sampled = existing_samples["tweet_id"]
hate_df = hate_df[~hate_df["tweet_id"].isin(already_sampled)]
counter_df = counter_df[~counter_df["tweet_id"].isin(already_sampled)]
neutral_df = neutral_df[~neutral_df["tweet_id"].isin(already_sampled)]

# report what is left in each pool
print(f"remaining samples hate: {len(hate_df)}")
print(f"remaining samples counter: {len(counter_df)}")
print(f"remaining samples neutral: {len(neutral_df)}")

remaining samples hate: 79779
remaining samples counter: 21379
remaining samples neutral: 58851


In [15]:
# Rows per year left in the hate pool. create_sample() draws sample_size
# rows from each of 2015-2018, so the scarcest of those years limits how
# many further batches are possible.
hate_df["year"].value_counts()

2018    56304
2017    17500
2016     4175
2019     1052
2015      499
2014      152
2013       97
Name: year, dtype: int64

In [16]:
# Rows per year left in the counter pool.
# NOTE(review): the recorded output shows 133 rows for 2015; with
# sample_size=100 that year can supply only one more batch before
# DataFrame.sample fails for lack of rows -- confirm before drawing more.
counter_df["year"].value_counts()

2018    15096
2017     3942
2016     1593
2019      519
2015      133
2014       71
2013       25
Name: year, dtype: int64

In [17]:
# Rows per year left in the neutral pool; each batch takes
# neutral_sample_size rows from each of 2015-2018.
neutral_df["year"].value_counts()

2018    36896
2017    14967
2016     4766
2015      946
2019      896
2014      233
2013      147
Name: year, dtype: int64

In [92]:
# Draw the next batch: per year 100 hate, 100 counter and 50 neutral tweets.
sample = create_sample(
    hate_df, counter_df, neutral_df,
    sample_size=100, neutral_sample_size=50, seed=seed,
)

# The new batch must not contain any tweet from an earlier batch.
assert set(sample["tweet_id"]).isdisjoint(existing_samples["tweet_id"])

# Store the batch next to the earlier ones under the next free batch number.
next_batch_file = f"batch_{existing_sample_counter + 1}.csv"
sample.to_csv(join(existing_sample_dir, next_batch_file), index=False)