<a href="https://colab.research.google.com/github/KatrinaLiqy/SocialBotDetection/blob/main/DataCombinationsAndRandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd

# Folder the dataset CSVs are read from; file names are appended as '/<file>.csv'.
dataset_folder = 'data_csv/final'

# Output directory for the per-run metrics CSVs, the mean/std tables and the
# hyperparameter dump produced by this notebook. The trailing slash matters:
# result file names are appended to it by plain string concatenation.
results_path = "./results/5_final_results/"
# Directory the fitted models are dumped to as '<combination>.joblib'
# (also concatenated directly, so the trailing slash is required here too).
models_path = "./models/5_final_results/"

### 119 Dataset Combinations

In [6]:
# Seven training datasets
varol = pd.read_csv(f"{dataset_folder}/varol-17.csv")
cresci_17 = pd.read_csv(f"{dataset_folder}/cresci-17.csv")
pronbots = pd.read_csv(f"{dataset_folder}/pronbots-2019.csv")
celebrity = pd.read_csv(f"{dataset_folder}/celebrity-2019.csv")
vendor = pd.read_csv(f"{dataset_folder}/vendor-purchased-2019.csv")
botometer = pd.read_csv(f"{dataset_folder}/botometer-feedback-2019.csv")
political = pd.read_csv(f"{dataset_folder}/political-bots-2019.csv")

# Four testing datasets (never used for training)
botwiki = pd.read_csv(f"{dataset_folder}/botwiki-verified.csv")
midterm = pd.read_csv(f"{dataset_folder}/midterm-2018.csv")
gilani = pd.read_csv(f"{dataset_folder}/gilani-2017.csv")
c_rtbust = pd.read_csv(f"{dataset_folder}/cresci-rtbust-2019.csv")

In [7]:
# Training datasets in bit-string order: character j of a combination string
# switches dataset_list[j] on or off. The trailing notes are the author's
# per-dataset counts (B / H presumably bot / human accounts).
dataset_list = [
    varol,      # 0. varol-icwsm - B733 H1495
    cresci_17,  # 1. cresci-17 - B7049 H2764
    pronbots,   # 2. pronbots - Only Bots 17882
    celebrity,  # 3. celebrity - Only Humans 5918
    vendor,     # 4. vendor-purchased - Only Bots 1087
    botometer,  # 5. botometer-feedback - B139 H380
    political,  # 6. political-bots - Only Bots 62
]

In [8]:
# The 8 masks that would select only-bot or only-human datasets and therefore
# cannot be used to train a two-class model.
single_class_masks = {
    '0010101', '0010100', '0010001', '0000101',
    '0000001', '0000100', '0010000', '0001000',
}

# Every non-empty subset of the seven training datasets, written as a
# zero-padded 7-character binary string, in ascending numeric order.
all_combinations = [
    mask
    for mask in (format(number, '07b') for number in range(1, 128))
    if mask not in single_class_masks
]

print(len(all_combinations))

119


In [9]:
# Build one training frame per combination string: character j == '1' means
# dataset_list[j] is part of that combination. The selected frames are stacked
# with a single pd.concat call rather than grown from an empty DataFrame, which
# avoids repeated copying and does not rely on how pandas treats empty frames
# when it resolves the result dtypes.
all_combinations_data = [] # for storing all combined data
for combination in all_combinations:
  selected = [dataset for bit, dataset in zip(combination, dataset_list) if bit == '1']
  # pd.concat rejects an empty list; an all-zero mask keeps yielding an empty frame.
  data = pd.concat(selected, axis=0, ignore_index=True) if selected else pd.DataFrame()
  all_combinations_data.append(data)

print(len(all_combinations_data))

119


### Functions for Training & Evaluation

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from joblib import dump, load
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
import csv
from sklearn.model_selection import StratifiedShuffleSplit

In [11]:
# Feature / label splits for the four testing sets.

column_number = len(botwiki.columns)  # all data frames should have 21 columns.


def _split_features_label(frame):
  """Return (features, label) for a test frame.

  The label is the column at position ``column_number - 1`` (the last column
  as counted on ``botwiki``); the features are every column before it.
  """
  label_position = column_number - 1
  return frame.iloc[:, 0:label_position], frame.iloc[:, label_position]


X_test_botwiki, y_test_botwiki = _split_features_label(botwiki)
X_test_midterm, y_test_midterm = _split_features_label(midterm)
X_test_gilani, y_test_gilani = _split_features_label(gilani)
X_test_rtbust, y_test_rtbust = _split_features_label(c_rtbust)


In [12]:
def train(all_combinations_data, model_path, random_state_value):
  """Fit, save and cross-validate one random forest per dataset combination.

  Parameters
  ----------
  all_combinations_data : list of DataFrame
      Training frames. Columns 0-19 are used as features and column 20 as the
      label. Frame ``i`` must correspond to the module-level
      ``all_combinations[i]``, which supplies the model file name.
  model_path : str
      Prefix the models are dumped to as ``model_path + <combination> + '.joblib'``
      (plain concatenation, so it should end with a path separator).
  random_state_value : int
      Seed used for both forests and for the cross-validation splitter, so a
      run is reproducible for a given seed.

  Returns
  -------
  list of float
      Mean ROC AUC over the 5 stratified shuffle splits, one entry per frame,
      in input order.
  """
  cv_auc_list = []
  for i, data in enumerate(all_combinations_data):
    X_train = data.iloc[:, 0:20]
    y_train = data.iloc[:, 20]

    print(i)  # progress indicator: index of the combination being trained

    # Model trained on the full combination; this is the one persisted to disk.
    clf = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=random_state_value)
    clf.fit(X_train, y_train)
    dump(clf, model_path + all_combinations[i]+'.joblib')

    # Do cross validation on training datasets with a fresh, unfitted estimator.
    clf2 = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=random_state_value)
    # Seed the splitter too: without it the 80/20 splits (and the reported
    # scores) change on every call even when random_state_value is fixed.
    ss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state_value)
    cv_auc_scores = cross_val_score(clf2, X_train, y_train, cv=ss, scoring='roc_auc')
    cv_auc_list.append(sum(cv_auc_scores)/len(cv_auc_scores))
  return cv_auc_list

In [13]:
def evaluation(cv_auc_list, model_path, result_path, results_file):
  """Score every saved model and write one CSV row per dataset combination.

  For each bit string in the module-level ``all_combinations`` the model saved
  at ``model_path + <bits> + '.joblib'`` is loaded and the following are written:
  ROC AUC on the four test sets (module-level ``X_test_*`` / ``y_test_*``),
  the cross-validation score ``cv_auc_list[i]``, and the Spearman correlation
  between the model's bot probability and the ``overall`` column of
  ``stream_account_scores.csv`` for accounts matched on ``screen_name``.

  Parameters
  ----------
  cv_auc_list : sequence of float
      Cross-validation scores, aligned with ``all_combinations``.
  model_path : str
      Prefix the ``.joblib`` files are loaded from (plain concatenation).
  result_path : str
      Prefix of the output file (plain concatenation).
  results_file : str
      Name of the CSV file to create; an existing file is overwritten.
  """
  #get the test streaming/botometer correlation data
  stream_df = pd.read_csv(dataset_folder + "/stream_users.csv")
  # Every column after the first is fed to the model as features.
  stream_test = stream_df.iloc[:, 1:]

  #get botometer predictions
  boto_df = pd.read_csv(dataset_folder + "/stream_account_scores.csv").drop("cap", axis=1)

  # newline="" is what the csv module expects of files handed to csv.writer;
  # without it, platforms that translate "\n" emit a blank line after each row.
  with open(result_path + results_file, "w", newline="") as csvfile:
    csv_writer = csv.writer(csvfile)

    csv_writer.writerow(["name", "botwiki-verified", "midterm-18", "gilani-17", "cresci-rtbust", "5-fold_cross-validation", "spearman_r"])

    for i, binary_str in enumerate(all_combinations):
      model = load(model_path + binary_str +'.joblib')

      #make predictions on the streaming twitter account data
      prediction = pd.DataFrame(model.predict_proba(stream_test), columns = ["human_prob", "bot_prob"]).drop("human_prob", axis=1)
      name_prediction = pd.concat([stream_df["screen_name"], prediction], axis=1)
      combined_data = boto_df.merge(name_prediction, on="screen_name")

      # Correlate only the two columns that are read: combined_data also holds
      # the string column screen_name, which DataFrame.corr refuses to process
      # on recent pandas versions. The pairwise coefficient itself is unchanged.
      spearman_r = combined_data[["overall", "bot_prob"]].corr(method="spearman")["overall"]["bot_prob"]

      #do cross domain validation on new datasets
      # [:, -1] takes the last predict_proba column (the same one named bot_prob above).
      botwiki_roc = roc_auc_score(y_test_botwiki, model.predict_proba(X_test_botwiki)[:, -1])
      midterm_roc = roc_auc_score(y_test_midterm, model.predict_proba(X_test_midterm)[:, -1])
      gilani_roc = roc_auc_score(y_test_gilani, model.predict_proba(X_test_gilani)[:, -1])
      rtbust_roc = roc_auc_score(y_test_rtbust, model.predict_proba(X_test_rtbust)[:, -1])

      cross_validation_score = cv_auc_list[i]
      csv_writer.writerow([binary_str, botwiki_roc, midterm_roc, gilani_roc, rtbust_roc, cross_validation_score, spearman_r])

      print(f"Evaluated {binary_str}")

In [15]:
import json
import os

In [16]:
def save_params(model_path, result_path, all_combinations):
  """Write the hyperparameters of the first combination's saved model to JSON.

  Loads ``model_path + all_combinations[0] + '.joblib'``, prints the dict
  returned by its ``get_params()`` and stores that dict as
  ``hyperparameters.json`` inside ``result_path`` (overwriting any old file).
  """
  first_model = load(model_path + all_combinations[0] +'.joblib')

  hyperparameters = first_model.get_params()
  print(hyperparameters)

  target_file = os.path.join(result_path, "hyperparameters.json")
  serialized = json.dumps(hyperparameters)
  with open(target_file, "w") as handle:
    handle.write(serialized)

### Training and Testing Function Calls

In [None]:
# Train and evaluate once per seed: random_state = 0,1,2,3,4 produces
# results_0.csv ... results_4.csv, the five files read back for aggregation.
# (range(1, 5) skipped seed 0, so results_0.csv was never written.)
# Each pass overwrites the .joblib models and hyperparameters.json of the
# previous one, so only the last seed's artifacts remain on disk.
for seed in range(5):
  cv_list = train(all_combinations_data, models_path, random_state_value=seed)
  evaluation(cv_list, models_path, results_path, "results_" + str(seed) + ".csv")
  save_params(models_path, results_path, all_combinations)

In [None]:
# Load the five per-seed result tables. 'name' is read as text: the default
# parser would turn bit strings such as 0000010 into the integer 10, dropping
# the leading zeros that identify which datasets a combination contains.
run_results = [
    pd.read_csv(results_path + "results_" + str(seed) + ".csv", dtype={"name": str})
    for seed in range(5)
]
result1, result2, result3, result4, result5 = run_results

# Stack all five runs (previously only the first three were included). Rows
# that share an index value describe the same combination in different runs.
df_concat = pd.concat(run_results)

# Aggregate the numeric metric columns only; 'name' is an identifier, not a metric.
metrics_by_combination = df_concat.drop(columns="name").groupby(level=0)
df_combined = metrics_by_combination.agg(['mean','count','std'])

df_combined_mean = metrics_by_combination.mean()
df_combined_mean.insert(0, 'name', result1['name'])  # add the name column to the left
df_combined_mean.to_csv(results_path+"mean.csv", index=False)

df_combined_std = metrics_by_combination.std()
df_combined_std = df_combined_mean['name'].to_frame().join(df_combined_std) # add the name column to the left
df_combined_std.to_csv(results_path+"std.csv", index=False)