In [22]:
import os,sys
from germanhass.ReplyTrees.ReplyTreeWalker import ReplyTreeWalker
from germanhass.DBCode.HassDBAdapter import HassDBAdapter
from tqdm import tqdm
import pandas as pd
import numpy as np

In [9]:
# Project database adapter; its `ReplyTree_coll` collection is what the
# collection loop further down iterates over.
db = HassDBAdapter()
# Threshold compared against the per-tree result of walking with `count_nodes`:
# trees whose count is below this value are skipped.
node_cut_off = 50

In [13]:
def count_nodes(node, count):
    """Visitor callback that tallies visited nodes.

    `node` is ignored; the running total `count` is returned increased by one.
    """
    return count + 1

def print_nodes(node):
    """Debugging visitor callback: print the given node to standard output."""
    print(node)

def build_dict(node,parameters):
    """Append one tweet node's fields to the column-oriented accumulator.

    Parameters
    ----------
    node : mapping
        Tweet node; must provide "created_at", "_id", "full_text" and a
        "scores" mapping with "hate" and "counter" entries.
    parameters : sequence
        ``parameters[0]`` is the id of the tree being walked and
        ``parameters[1]`` is a dict of lists keyed by "year", "tree_id",
        "tweet_id", "hate_score", "counter_score" and "text".

    Returns
    -------
    The same ``parameters`` object, after one value has been appended to
    each of the six lists.

    Raises
    ------
    KeyError
        If the node or the accumulator lacks a required key. Nothing is
        appended in that case, so the lists always keep equal lengths.
    """
    tree_id = parameters[0]
    columns = parameters[1]
    scores = node["scores"]

    # Read every field BEFORE touching the accumulator. Appending while
    # reading would leave the lists with unequal lengths if a later field is
    # missing, and pd.DataFrame() cannot be built from ragged columns.
    row = {
        # Last four characters of the created_at string, kept as a string.
        "year": node["created_at"][-4:],
        "tree_id": tree_id,
        "tweet_id": node["_id"],
        "hate_score": scores["hate"],
        "counter_score": scores["counter"],
        # Newlines and commas are removed from the tweet text.
        "text": node["full_text"].replace("\n","").replace(",",""),
    }

    # Resolve all target lists first so a missing column also fails cleanly.
    targets = [(columns[name], value) for name, value in row.items()]
    for column, value in targets:
        column.append(value)
    return parameters

def remove_root_nodes(all_tweets):
    """Build a DataFrame from `all_tweets` and drop the root tweet rows.

    A row counts as a root when its `tweet_id` equals its `tree_id`; only
    the remaining rows are returned, with their original index labels.
    """
    tweets = pd.DataFrame(all_tweets)
    is_reply = tweets["tree_id"] != tweets["tweet_id"]
    return tweets.loc[is_reply]

def stable_user(username):
    """Tell whether `username` is on the hard-coded list of screen names.

    The comparison is case-insensitive: the name is lower-cased before the
    lookup. Returns True for a listed account, False otherwise.
    """
    tracked_accounts = (
        "diezeit", "derspiegel", "spiegelonline", "faznet",
        "morgenmagazin", "tagesschau", "tagesthemen", "zdfheute",
        "annewilltalk", "augstein", "dunjahayali", "janboehm",
        "jkasek", "maischberger", "nicolediekmann", "cem_oezdemir",
        "c_lindner", "goeringeckardt", "heikomaas", "olafscholz",
        "regsprecher", "renatekuenast",
    )
    return username.lower() in tracked_accounts


def create_sample(hate_df, counter_df, neutral_df, sample_size=10, 
                  neutral_sample_size=1, seed=None):
    """Assemble a per-year sample from the three class frames.

    For each of the years "2015" to "2018", `sample_size` rows are drawn
    from `hate_df` and from `counter_df`, and `neutral_sample_size` rows from
    `neutral_df`, each via `get_tweets_by_year` with the same `seed`.
    The pieces are concatenated year by year, in the order hate, counter,
    neutral.
    """
    sources = (
        (hate_df, sample_size),
        (counter_df, sample_size),
        (neutral_df, neutral_sample_size),
    )
    pieces = [
        get_tweets_by_year(source, year=yr, sample_size=n_rows, seed=seed)
        for yr in ("2015", "2016", "2017", "2018")
        for source, n_rows in sources
    ]
    return pd.concat(pieces)


def get_tweets_by_year(df, year=2015, sample_size=None, seed=None):
    """Return the rows of `df` for one year, optionally down-sampled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `year` column.
    year : int or str
        Year to keep. Both sides are compared as strings, so ``2015`` and
        ``"2015"`` select the same rows. The notebook stores years as
        four-character strings, which the integer default never matched
        under a plain ``==`` comparison.
    sample_size : int, optional
        If truthy, draw this many rows at random; otherwise return all rows
        of the year.
    seed : int, optional
        Passed to ``DataFrame.sample`` as ``random_state``.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when the year has fewer rows than
        `sample_size`.
    """
    in_year = df[df["year"].astype(str) == str(year)]
    if sample_size:
        return in_year.sample(sample_size, random_state=seed)
    return in_year

def get_hate_tweets_df(df,sample_size=None, threshold=0.8):
    """Select the rows whose `hate_score` is at least `threshold`.

    `sample_size` is accepted but not used by this function.
    """
    is_hate = df["hate_score"] >= threshold
    return df.loc[is_hate]
def get_counter_tweets_df(df,sample_size=None, threshold=0.8):
    """Select the rows whose `counter_score` is at least `threshold`.

    `sample_size` is accepted but not used by this function.
    """
    is_counter = df["counter_score"] >= threshold
    return df.loc[is_counter]
def get_neutral_tweets_df(df, sample_size=None, 
                          threshold1=0.44, threshold2=0.55):
    """Select the rows whose `hate_score` lies in [threshold1, threshold2].

    Both bounds are inclusive. Only `hate_score` is inspected, and
    `sample_size` is accepted but not used by this function.
    """
    in_band = df["hate_score"].between(threshold1, threshold2)
    return df[in_band]

In [14]:
# Column-oriented accumulator: one list per output column. build_dict appends
# one value to every list for each node it is called on.
all_tweets = {"year":[],
              "tree_id":[],
              "tweet_id":[],
              "hate_score":[],
              "counter_score":[],
              "text":[]
              }

# Number of stored reply trees; used only as the total for the progress bar.
total = db.ReplyTree_coll.count_documents({})
for tree in tqdm(db.ReplyTree_coll.find({}),total=total):
    _id  = tree["_id"]
    RT = ReplyTreeWalker(tree)
    # Walk the tree with count_nodes, starting from 0 — presumably this yields
    # the number of nodes in the tree; confirm against ReplyTreeWalker.
    num_nodes =RT.walk_w_parameters(count_nodes,parameters=0)
    # Keep only trees that reach the size cut-off and belong to a listed account.
    if num_nodes >= node_cut_off:
        if stable_user(tree["screen_name"]):

            # Optional: save the tree to disk (needs `import json`, which the
            # import cell of this notebook does not provide).
            #with open("../results/colored_trees/"+_id+".json", 'w') as outfile:
            #    json.dump(RT.doc, outfile)
            #RT.walk(print_nodes)
            #print(parameters[0])
            # NOTE(review): this call passes an extra positional `None` that the
            # count_nodes call above does not — confirm the walk_w_parameters signature.
            parameters = RT.walk_w_parameters(build_dict,None,parameters=[_id,all_tweets])
            # Assumes the walk returns the [tree id, accumulator] list that
            # build_dict returns — TODO confirm.
            all_tweets=parameters[1]
        #else:
        #    print("Skipping",tree["screen_name"])

# Build the DataFrame and drop the root tweet of every tree (tweet_id == tree_id).
df = remove_root_nodes(all_tweets)

# Split the frame by tweet type, using each selector's default thresholds.
hate_df = get_hate_tweets_df(df)
counter_df = get_counter_tweets_df(df)
neutral_df = get_neutral_tweets_df(df)



100%|█████████████████████████████████████████████████████████| 204544/204544 [00:26<00:00, 7721.88it/s]


In [25]:
# Preview the first two reply rows.
# NOTE(review): the recorded output already contains a `label` column, which is
# only added by a cell with a later execution count — this output presumably
# comes from stale kernel state; re-run top to bottom to refresh it.
df.head(2)

Unnamed: 0,year,tree_id,tweet_id,hate_score,counter_score,text,label
1,2018,1000007502136860672,1000027660238671872,0.93581,0.06419,+++ACHUNG Falschmeldung des BRD Staatsfunks+++,
2,2018,1000007502136860672,1000351861331955712,0.974365,0.025635,Tuet Buße! Und immer wieder diser Ärger darübe...,


In [27]:
# Add a categorical `label` column mirroring the three class selections.
# The column starts out as object dtype filled with NaN: writing strings into
# a float64 column (what a plain `= np.nan` creates) is a deprecated silent
# upcast in pandas >= 2.1 and an error in later versions.
df['label'] = pd.Series(np.nan, index=df.index, dtype=object)
# Later assignments override earlier ones, so the order is significant:
# hate, then counter, then the neutral band on hate_score.
df.loc[df['hate_score'] >= 0.8, 'label'] = 'hate'
df.loc[df['counter_score'] >= 0.8, 'label'] = 'counter'
df.loc[(df['hate_score'] >= 0.44) & (df['hate_score'] <= 0.55), 'label'] = 'neutral'

In [29]:
# Rows per label; value_counts() leaves out rows whose label is still missing.
df['label'].value_counts()

hate       80537
neutral    59149
counter    22237
Name: label, dtype: int64

In [30]:
# Number of rows selected as hate (hate_score >= 0.8 by default).
len(hate_df)

80537

In [32]:
# Number of rows selected as counter speech (counter_score >= 0.8 by default).
len(counter_df)

22237

In [33]:
# Number of rows selected as neutral (0.44 <= hate_score <= 0.55 by default).
len(neutral_df)

59149

In [34]:
# Total number of rows left after the root tweets were removed.
len(df)

355963

In [37]:
# Combined size of the three class frames relative to all reply rows
# (a row matching more than one class would be counted more than once).
(len(hate_df) + len(counter_df) + len(neutral_df)) / len(df)

0.45488716523908385

In [19]:
# Draw the seeded sample: for each year handled by create_sample, 100 hate,
# 100 counter and 50 neutral tweets.
sample = create_sample(hate_df, counter_df, neutral_df ,sample_size=100,
                             neutral_sample_size=50, seed=1)
# NOTE(review): the disabled export below refers to `final_sample`, which is not
# defined in the cells shown; the frame created above is named `sample`.
#final_sample.to_csv("test_w_New_db.csv",index=False)

In [20]:
# Display the drawn sample; rows keep their index labels from `df`.
sample

Unnamed: 0,year,tree_id,tweet_id,hate_score,counter_score,text
170312,2015,655342271462666240,655355959603306496,0.820311,0.179689,@HonigumdenBart @faznet @HeikoMaas 2/2genauso ...
166671,2015,555056951931850753,555060252610424833,0.896558,0.103442,@SPIEGELONLINE Hat Ihm wohl der Maas geflüstert?
168501,2015,614858643348062208,614877294985891840,0.814365,0.185635,@DJanecek @W_SK @RenateKuenast @sven_kindler D...
166829,2015,557150869133983744,557198522282704896,0.871440,0.128560,@cem_oezdemir Warum? Darf dort nur staatstrage...
169887,2015,641847826041970689,642738330933792768,0.830976,0.169024,@tagesschau Die können sich selbst helfen: Put...
...,...,...,...,...,...,...
86976,2018,1040610669543010304,1041121937416744960,0.523125,0.476875,Vielen Dank
74922,2018,1037080812053778432,1037210378781028353,0.484187,0.515813,multichannelmarketing
355490,2018,996345814942396416,996350864863318016,0.463835,0.536165,Geht echt nicht
344989,2018,986552735070937088,986631672358670336,0.524306,0.475694,Endlich mal mit dem BlaBla aufhören und dagege...
