# Label Twitter accounts

This notebook exemplifies how we labelled users based on the predicted tweet labels from the supervised learning classification. We ran this code on all three datasets.

In [None]:
# load modules 
import pandas as pd
import numpy as np
import os
import re

In [None]:
# move into the folder that holds the labelled tweet CSVs, so the
# read_csv calls that follow can use bare file names
labelled_dir = r'C:\Users\maril\Documents\20-21 KU\block 4\DM\twitter\network_analysis\automated_network\labelled_datasets'
os.chdir(labelled_dir)

In [None]:
### read the three labelled tweet tables and stack them into one dataframe

# manual labels (from AL); the saved row index ('Unnamed: 0') is not needed
dataM = (
    pd.read_csv(r'pl_labels.csv', lineterminator='\r', dtype={'id':str})
    .drop(columns=['Unnamed: 0'])
)

# manual labels (from testing the clf): keep only the manual 'target' column,
# renamed to 'label' so it lines up with the other tables
dataMT = (
    pd.read_csv(r'pl_labels_done.csv', lineterminator='\r', dtype={'id':str})
    .drop(columns=['Unnamed: 0', 'prediction', 'evaluation'])
    .rename(columns={'target':'label'})
)

# predicted labels: 'prediction' becomes 'label'
dataP = (
    pd.read_csv(r'pl_labels_pred.csv', lineterminator='\r', dtype={'id':str})
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'prediction':'label'})
)

# stack the tables row-wise; the order matters for the de-duplication below
dfs = [dataM, dataMT, dataP]
df = pd.concat(dfs, ignore_index=True)

# keep one row per tweet id: drop_duplicates keeps the first occurrence, so a
# manual label takes precedence over a prediction for the same tweet
# (to look at the duplicated rows first: df[df.duplicated(subset='id', keep=False)])
# the 'index' column is dropped as well -- presumably an old row index stored
# in the CSVs (TODO confirm)
df = df.drop_duplicates(subset='id').drop(columns=['index'])

In [None]:
# sanity check: number of rows left after de-duplication, plus the last rows
n_rows = df.shape[0]
print(n_rows)
df.tail()

In [None]:
# switch to the project root; the relative paths used from here on
# (input table and output file) are resolved against this folder
project_dir = r'C:\Users\maril\Documents\20-21 KU\block 4\DM\twitter'
os.chdir(project_dir)

In [None]:
# load the full tweet table and keep only the rows flagged as retweets
# NOTE: the path is a raw string so that the backslash is taken literally;
# without the r-prefix '\p' is an invalid escape sequence, which Python
# warns about and will eventually reject
df_rt = pd.read_csv(r'final_data_prepare1\pl_all_text.csv', dtype={'id':str})
df_rt = df_rt[df_rt['is_retweet'] == 1]
df_rt.head()

In [None]:
# Look up a label for every retweet: a retweet gets the label of the first
# labelled tweet in `df` whose text contains the retweet text; if no labelled
# tweet matches, the placeholder 'error' is stored instead.
rt_labels = []

# the same retweet text can occur on many rows, so every distinct text is
# searched only once and the result is reused
label_for_text = {}
labelled_text = df['text']

for c, text in enumerate(df_rt['text'], start=1):

    if not isinstance(text, str):
        # a missing text (NaN) cannot be matched against anything
        rt_labels.append('error')

    else:
        if text not in label_for_text:
            # literal substring search (regex=False) -- same matches as an
            # escaped regex, without compiling a pattern per retweet;
            # na=False treats labelled tweets without text as "no match"
            is_match = labelled_text.str.contains(text, regex=False, na=False)
            matches = df.loc[is_match, 'label']

            if matches.empty:
                label_for_text[text] = 'error'
                # reported once per distinct unmatched text
                print('no labelled tweet contains:', text)
            else:
                label_for_text[text] = matches.tolist()[0]

        rt_labels.append(label_for_text[text])

    # progress: number of retweets processed so far
    print(c)

In [None]:
# attach the looked-up labels as a new 'label' column (one entry per retweet,
# in row order); assign() builds a new dataframe instead of writing into the
# filtered slice, which avoids pandas' SettingWithCopyWarning
df_rt = df_rt.assign(label=rt_labels)
df_rt.head()

In [None]:
# stack the labelled retweets on top of the labelled tweets; columns that
# exist in only one of the two tables are kept and filled with NaN
frames = [df_rt, df]
df_final = pd.concat(frames, axis=0, join='outer', ignore_index=True)
df_final.head()

### Pick labels by majority

In [None]:
### basis for the decision rule (vaxx/anti-vaxx/neutral/trash)

# number of tweets per (user, label) pair
tweet_counts = df_final.groupby(['user','label'])['text'].count()

# pivot to one row per user and one column per label;
# combinations that never occur become NaN
group_df = tweet_counts.to_frame().unstack()

# unstack leaves a two-level column index ('text', <label>); keep only the label level
group_df.columns = group_df.columns.droplevel(0)
group_df.head()

In [None]:
# majority vote: for every user (one row of group_df) take the label whose
# tweet count is highest; idxmax() returns the column name (= label) of the
# largest value in the row, and the first such column if there is a tie
labels = [counts.idxmax() for _, counts in group_df.iterrows()]

In [None]:
# one row per user: the user (taken from the index of group_df) next to the
# majority label; `labels` is a plain list in the same row order as group_df
user_labels = pd.DataFrame(group_df.index).assign(labels=labels)

# check the results
print(user_labels.shape)
user_labels.head()

In [None]:
# write the user/label table to disk (relative to the current working directory);
# the row index is written too, as a first column without a header
out_path = r'network_analysis\automated_network\pl_labelled_users.csv'
user_labels.to_csv(out_path)