<a href="https://colab.research.google.com/github/KatrinaLiqy/SocialBotDetection/blob/main/DataCombinationsAndRandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; force_remount=True remounts even when it is already mounted.
drive.mount('/content/drive', force_remount=True)
import os

# Folder, relative to the root of "My Drive", that is listed below and read from later in the notebook.
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'botsdata/data_csv/final' # change this to the Drive folder that holds your data CSV files
# NOTE(review): this is a relative path ('drive/My Drive/...'), so it resolves against the
# current working directory -- presumably /content on Colab; confirm before running elsewhere.
GOOGLE_DRIVE_PATH = os.path.join('drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)

# Sanity check: show the files available in the data folder.
print(os.listdir(GOOGLE_DRIVE_PATH))

Mounted at /content/drive
['stream_account_scores.csv', 'botwiki-2019.csv', 'pronbots-2019.csv', 'celebrity-2019.csv', 'botometer-feedback-2019.csv', 'vendor-purchased-2019.csv', 'political-bots-2019.csv', 'gilani-2017.csv', 'verified-2019.csv', 'cresci-17.csv', 'varol-17.csv', 'cresci-rtbust-2019.csv', 'midterm-2018.csv', 'stream_users.csv', 'botwiki-verified.csv']


In [None]:
import sys
# Add the Drive data folder to Python's module search path.
sys.path.append(GOOGLE_DRIVE_PATH)
import pandas as pd
import numpy as np

### 119 Dataset Combinations

In [None]:
def _read_final_csv(file_suffix):
  """Read one CSV from the Drive data folder.

  `file_suffix` is appended verbatim to GOOGLE_DRIVE_PATH, so it must start
  with the path separator (e.g. '/varol-17.csv').
  """
  return pd.read_csv(GOOGLE_DRIVE_PATH + file_suffix)


# Seven training datasets
varol = _read_final_csv('/varol-17.csv')
cresci_17 = _read_final_csv('/cresci-17.csv')
pronbots = _read_final_csv('/pronbots-2019.csv')
celebrity = _read_final_csv('/celebrity-2019.csv')
vendor = _read_final_csv('/vendor-purchased-2019.csv')
botometer = _read_final_csv('/botometer-feedback-2019.csv')
political = _read_final_csv('/political-bots-2019.csv')

# Testing datasets
botwiki = _read_final_csv('/botwiki-verified.csv')
midterm = _read_final_csv('/midterm-2018.csv')
gilani = _read_final_csv('/gilani-2017.csv')
c_rtbust = _read_final_csv('/cresci-rtbust-2019.csv')

In [None]:
# Training datasets in the order used by the 7-character combination codes:
# character j of a code refers to dataset_list[j].
# (B = number of bot accounts, H = number of human accounts)
dataset_list = [
  varol,      # 0. varol-icwsm - B733 H1495
  cresci_17,  # 1. cresci-17 - B7049 H2764
  pronbots,   # 2. pronbots - Only Bots 17882
  celebrity,  # 3. celebrity - Only Humans 5918
  vendor,     # 4. vendor-purchased - Only Bots 1087
  botometer,  # 5. botometer-feedback - B139 H380
  political,  # 6. political-bots - Only Bots 62
]

In [None]:
# Every non-empty subset of the seven training datasets, encoded as a
# zero-padded 7-character binary string ('0000001' ... '1111111').
all_combinations = [format(i, '07b') for i in range(1, 128)]

# Manually remove the 8 combinations that are only bots or humans
single_class_combinations = {
  '0010101',
  '0010100',
  '0010001',
  '0000101',
  '0000001',
  '0000100',
  '0010000',
  '0001000',
}
all_combinations = [
  code for code in all_combinations if code not in single_class_combinations
]

print(len(all_combinations))

119


In [None]:
# Build one training frame per combination code by stacking the selected datasets.
# The frames are collected first and concatenated in a single call, instead of
# repeatedly concatenating onto an initially empty DataFrame: that pattern copies
# the growing frame on every step and mixes an empty, column-less frame into the
# concat, which can affect the resulting dtypes.
all_combinations_data = [] # for real data
for combination in all_combinations:
  # Character j of the code is '1' when dataset_list[j] belongs to the combination.
  # Every code has at least one '1' (codes start at 1), so `selected` is never empty.
  selected = [frame for flag, frame in zip(combination, dataset_list) if flag == '1']
  all_combinations_data.append(pd.concat(selected, axis=0, ignore_index=True))

print(len(all_combinations_data))

119


### Functions for Training & Evaluation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from joblib import dump, load
from google.colab import files
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
import csv
from scipy.stats import randint
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Split each of the four testing sets into features (every column but the last)
# and labels (the last column).

column_number = len(botwiki.columns)  # all data frames should have 21 columns.
label_position = column_number - 1    # index of the label column, taken from botwiki

X_test_botwiki, X_test_midterm, X_test_gilani, X_test_rtbust = (
  frame.iloc[:, 0:label_position] for frame in (botwiki, midterm, gilani, c_rtbust)
)

y_test_botwiki, y_test_midterm, y_test_gilani, y_test_rtbust = (
  frame.iloc[:, label_position] for frame in (botwiki, midterm, gilani, c_rtbust)
)


In [None]:
def train(all_combinations_data, model_path, random_state_value):
  """Fit and persist one random forest per dataset combination.

  For every frame in `all_combinations_data` a RandomForestClassifier is fitted
  on columns 0-19 (features) against column 20 (label) and dumped to
  `model_path + <combination code> + '.joblib'`. The combination code is taken
  from the module-level `all_combinations` list, so that list must be aligned
  index-by-index with `all_combinations_data`.

  A second, unfitted classifier with the same settings is then scored with
  5 stratified shuffle splits (20% held out each) using ROC AUC.

  Args:
    all_combinations_data: list of training DataFrames, one per combination.
    model_path: path prefix for the saved models (string concatenation is used,
      so it should end with a path separator).
    random_state_value: seed for the classifiers and for the CV splitter.

  Returns:
    List with the mean cross-validation ROC AUC for each combination, in the
    same order as `all_combinations_data`.
  """
  cv_auc_list = []
  for i, data in enumerate(all_combinations_data):
    X_train = data.iloc[:, 0:20]
    y_train = data.iloc[:, 20]

    clf = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=random_state_value)
    print(i)
    clf.fit(X_train, y_train)
    dump(clf, model_path + all_combinations[i]+'.joblib')

    # Do cross validation on training datasets
    clf2 = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=random_state_value)
    # Seed the splitter as well: without it the splits differ on every run, so the
    # reported CV score is not reproducible even though the forest itself is seeded.
    ss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state_value)
    cv_auc_scores = cross_val_score(clf2, X_train, y_train, cv=ss, scoring='roc_auc')
    cv_auc_list.append(sum(cv_auc_scores)/len(cv_auc_scores))
  return cv_auc_list

In [None]:
def evaluation(cv_auc_list, result_path, results_file):
  """Evaluate every saved model and write one CSV row per dataset combination.

  For each code in the module-level `all_combinations`, the model stored at
  `model_path + <code> + '.joblib'` (module-level `model_path`) is loaded and
  scored in three ways:
    * ROC AUC on the four held-out test sets (botwiki, midterm, gilani, rtbust),
    * the cross-validation score passed in through `cv_auc_list`,
    * Spearman correlation between the model's bot probability on the streaming
      accounts and the `overall` column of stream_account_scores.csv.

  Args:
    cv_auc_list: cross-validation scores, aligned index-by-index with
      `all_combinations` (as returned by `train`).
    result_path: path prefix of the output file (string concatenation is used).
    results_file: file name of the CSV to write.
  """
  #get the test streaming/botometer correlation data
  stream_df = pd.read_csv(GOOGLE_DRIVE_PATH + "/stream_users.csv")
  # Everything after the first column is fed to the model as features.
  stream_test = stream_df.iloc[:, 1:]

  #get botometer predictions
  boto_df = pd.read_csv(GOOGLE_DRIVE_PATH + "/stream_account_scores.csv").drop("cap", axis=1)

  # Held-out test sets, in the order of the CSV columns written below.
  test_sets = [
    (X_test_botwiki, y_test_botwiki),
    (X_test_midterm, y_test_midterm),
    (X_test_gilani, y_test_gilani),
    (X_test_rtbust, y_test_rtbust),
  ]

  # newline="" is what the csv module expects for files handed to csv.writer.
  with open(result_path + results_file, "w", newline="") as csvfile:
    csv_writer = csv.writer(csvfile)

    csv_writer.writerow(["name", "botwiki-verified", "midterm-18", "gilani-17", "cresci-rtbust", "5-fold_cross-validation", "spearman_r"])

    for i, binary_str in enumerate(all_combinations):
      model = load(model_path + binary_str +'.joblib')

      #make predictions on the streaming twitter account data
      # The last predict_proba column is used as the bot probability, matching the
      # ROC computations below.
      name_prediction = pd.DataFrame({
        "screen_name": stream_df["screen_name"],
        "bot_prob": model.predict_proba(stream_test)[:, -1],
      })
      combined_data = boto_df.merge(name_prediction, on="screen_name")

      # Correlate the two numeric columns directly. Calling DataFrame.corr() on the
      # merged frame would also see the text column `screen_name`, which newer
      # pandas versions refuse to process.
      spearman_r = combined_data["overall"].corr(combined_data["bot_prob"], method="spearman")

      #do cross domain validation on new datasets
      botwiki_roc, midterm_roc, gilani_roc, rtbust_roc = (
        roc_auc_score(y_test, model.predict_proba(X_test)[:, -1])
        for X_test, y_test in test_sets
      )

      cross_validation_score = cv_auc_list[i]
      csv_writer.writerow([binary_str, botwiki_roc, midterm_roc, gilani_roc, rtbust_roc, cross_validation_score, spearman_r])

      print(f"Evaluated {binary_str}")

In [None]:
import json
import math

In [None]:
def save_params(all_combinations):
  """Print and store the hyperparameters of the first combination's model.

  Loads the model saved for `all_combinations[0]` from the module-level
  `model_path`, prints its `get_params()` dictionary and writes it as JSON to
  `hyperparameters.json` inside the module-level `result_path`.
  """
  first_model = load(model_path + all_combinations[0] +'.joblib')

  hyperparameters = first_model.get_params()
  print(hyperparameters)

  with open(os.path.join(result_path, "hyperparameters.json"), "w") as outfile:
    json.dump(hyperparameters, outfile)

### Training and Testing Function Calls

In [None]:
# Path prefix for the saved models (one '<combination code>.joblib' file each).
# It is joined by plain string concatenation, so the trailing '/' is required.
model_path = 'drive/My Drive/botsdata/models/base_shuffle_CV/'

In [None]:
# Path prefix for the result CSVs and hyperparameters.json.
# The CSV paths are built by plain string concatenation, so the trailing '/' is required.
result_path = 'drive/My Drive/botsdata/results/5_final_results/'

In [None]:
# Train and evaluate the models once per random seed (0, 1, 2, 3, 4); each run
# writes results_<seed>.csv. The previous range(1, 5) skipped seed 0, so
# results_0.csv -- which the aggregation cell reads -- was never produced.
# Note: every seed writes its models to the same files under model_path, so the
# evaluation must run right after the training of the same seed (as it does here).
for seed in range(5):
  cv_list = train(all_combinations_data, model_path, random_state_value=seed)
  evaluation(cv_list, result_path, "results_" + str(seed) + ".csv")
  save_params(all_combinations)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
Evaluated 0000010
Evaluated 0000011
Evaluated 0000110
Evaluated 0000111
Evaluated 0001001
Evaluated 0001010
Evaluated 0001011
Evaluated 0001100
Evaluated 0001101
Evaluated 0001110
Evaluated 0001111
Evaluated 0010010
Evaluated 0010011
Evaluated 0010110
Evaluated 0010111
Evaluated 0011000
Evaluated 0011001
Evaluated 0011010
Evaluated 0011011
Evaluated 0011100
Evaluated 0011101
Evaluated 0011110
Evaluated 0011111
Evaluated 0100000
Evaluated 0100001
Evaluated 0100010
Evaluated 0100011
Evaluated 0100100
Evaluated 0100101
Evaluated 0100110
Evaluated 0100111
Evaluated 0101000
Evaluated 0101001
Evaluated 0101010
Evaluated 0101011
Eval

In [None]:
# Load the five per-seed result tables written by evaluation().
# `name` is read as text: left to type inference, a code such as '0000010' is
# parsed as the integer 10 and loses its leading zeros.
result1, result2, result3, result4, result5 = (
  pd.read_csv(result_path + "results_" + str(seed) + ".csv", dtype={"name": str})
  for seed in range(5)
)

# Aggregate over ALL five runs (previously only the first three were concatenated).
df_concat = pd.concat((result1, result2, result3, result4, result5))

# Group by the combination code itself rather than by row position, so the
# aggregation stays correct even if the files list the combinations in a
# different order. sort=False keeps the order of first appearance.
per_combination = df_concat.groupby("name", sort=False)

# Mean, number of runs and standard deviation of every metric per combination.
# A 95% confidence interval can be derived from it as mean +/- 1.96 * std / sqrt(count).
df_combined = per_combination.agg(['mean','count','std'])

# mean.csv / std.csv: first column `name`, followed by one column per metric.
df_combined_mean = per_combination.mean().reset_index()
df_combined_mean.to_csv(result_path+"mean.csv", index=False)

df_combined_std = per_combination.std().reset_index()
df_combined_std.to_csv(result_path+"std.csv", index=False)