# <u>Bot Detection</u>

# 1. Data Collection and Integration

## 1.1. Imports

In [None]:
import datetime
import warnings
import numpy as np
import pandas as pd
import os,glob,sys,nltk
from sklearn.utils import shuffle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import KFold

## 1.2. Globals

In [None]:
DATASETS_DIR = 'Datasets'
SRC_DIR = 'src'
TWEETS_DIR = 'Tweets'
BOT_TWEETS_DIR = 'Bot_accounts_tweets'
GENUINE_TWEETS_DIR = 'Genuine_accounts_tweets'

In [None]:
# for organized printing only
class txt_format:
    """ANSI escape sequences used to style text printed by the notebook.

    Each attribute is a plain string; prepend one or more of them to the
    text and append ``END`` afterwards, e.g.
    ``txt_format.BOLD + 'title' + txt_format.END``.
    """
    # foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    # text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # resets every colour/attribute set by the codes above
    END = '\033[0m'

## 1.3. Loading Data and Modules

In [None]:
# Label convention: genuine users => 0 , bots => 1
cwd = os.getcwd()
dataset_dir = os.path.join(cwd, DATASETS_DIR)

# Loading users data.
# The files are globbed by full path instead of os.chdir()-ing into the
# dataset directory: if reading a file fails half-way, the notebook is not
# left stranded in the wrong working directory.
users_frames = []
users_pattern = os.path.join(glob.escape(dataset_dir), "*_users.csv")
for users_file_path in glob.glob(users_pattern):
    File_Name = os.path.basename(users_file_path)
    df = pd.read_csv(users_file_path)
    if File_Name == 'genuine_accounts_users.csv':
        df['bot'] = 0
    elif File_Name == 'E13_users.csv' or File_Name == 'TFP_users.csv':
        df['bot'] = 0
        df['test_set_1'] = 0
        df['test_set_2'] = 0
    elif File_Name == 'fake_followers_users.csv':
        df['bot'] = 1
    elif 'social_spambots' in File_Name:
        df['bot'] = 1
    elif 'traditional_spambots' in File_Name:
        df['bot'] = 1
    else:
        # Unrecognised file: report it. It is still concatenated below,
        # but without a 'bot' label.
        print(File_Name)
    users_frames.append(df)

# Concatenate once at the end rather than re-copying the growing frame on
# every iteration; an empty frame is kept when no file matched.
if users_frames:
    total_data = pd.concat(users_frames, ignore_index = True, sort=False)
else:
    total_data = pd.DataFrame()

### 1.3.1. Loading Tweets and Description

'run_full_process' is a binary flag that decides whether the long-running steps are executed (running all notebook cells then takes a few hours). If set to 1, the notebook will run:
* translation process using Yandex service - also limited by characters per day
* language detection process using Yandex service - also limited by characters per day
* BoW process
* finding most important words in description text field
* calculating the tweet features (not including the variation on Levenshtein distances. Takes some time)

otherwise, will skip it and use pre-made files.

'run_tweet_var_calculation' allows you to calculate the variance of Levenshtein Distance between tweets of a user. To do this, change value to 1. **Note:** This calculation takes hours

In [None]:
run_full_process = 0

In [None]:
run_tweet_var_calculation = 0

In [None]:
user_data_file_path = os.path.join(dataset_dir, 'tweet_features_filled1.csv')
user_data = pd.read_csv(user_data_file_path)
user_data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [None]:
levenshtein_data_file_path = os.path.join(dataset_dir, 'tweet_var_data_full_with_threshold.csv')
levenshtein_data = pd.read_csv(levenshtein_data_file_path)

In [None]:
description_data_file_path = os.path.join(dataset_dir, 'description_data_set2.csv')
description_df = pd.read_csv(description_data_file_path)
if(run_full_process):
    description_df = description_df[['Unnamed: 0', 'id', 'lang', 'description', 'bot','test_set_1', 'test_set_2']]    

In [None]:
if(not run_full_process):
    # Names of every pre-computed count-vector CSV found directly inside
    # the dataset directory.
    count_vectors_files_names = [entry
                                 for entry in os.listdir(dataset_dir)
                                 if os.path.isfile(os.path.join(dataset_dir, entry)) and
                                 'count_df_' in entry
                                ]

    # NOTE(review): os.listdir() returns entries in arbitrary order, so the
    # row order of count_vectors_df depends on the filesystem - confirm that
    # nothing downstream relies on positional alignment.
    count_frames = []
    for count_vectors_file_name in count_vectors_files_names:
        count_vectors_data_file_path = os.path.join(dataset_dir, count_vectors_file_name)
        count_df = pd.read_csv(count_vectors_data_file_path)
        count_frames.append(count_df)

    # Concatenate once instead of re-copying the accumulated frame for
    # every file that is read.
    if count_frames:
        count_vectors_df = pd.concat(count_frames, ignore_index = True)
    else:
        count_vectors_df = pd.DataFrame()
    count_vectors_df.drop('Unnamed: 0.1', axis=1, inplace=True)

### 1.3.2.  Loading Modules

In [None]:
modules_path = os.path.join(os.getcwd(),SRC_DIR)
if modules_path not in sys.path:
    sys.path.append(modules_path)

## 1.4. Data Understanding

In [None]:
# understanding data types
total_data.dtypes

In [None]:
#training size:
len(total_data.index)

In [None]:
total_data.head(10)

In [None]:
# # of basic user features (including target)
total_data.columns.size

### 1.4.1. Preliminary Data Visualization and Statistics

#### 1.4.1.1. Class Distribution

In [None]:
total_data['bot'].value_counts()

#### 1.4.1.2. Language Histogram

In [None]:
print ("-------------------------------")
print ("Histogram for 'lang'")
print ("-------------------------------")
print (total_data['lang'].value_counts())
print ("")

# 2. Data Preparation and Cleaning

## 2.1. Unique Values
* id, screen_name are unique - **different value for each record**
* contributors_enabled, follow_request_sent , following , notifications - **zero unique values** (NULL)
* verified, protected, profile_use_background_image, profile_background_tile, is_translator, geo_enabled, default_profile, default_profile_image - **single unique value**

In [None]:
total_data.nunique()

### Remove columns with zero unique values (NULL values only )
* contributors_enabled 
* follow_request_sent 
* following 
* notifications 

In [None]:
# Walk over a snapshot of the column names and remove every column that
# has no non-NULL value at all (nunique() ignores NULLs).
for col_name in list(total_data.columns):
    if total_data[col_name].nunique() >= 1:
        continue
    print('removing ', col_name)
    total_data.drop(columns=[col_name], inplace=True)

### Removing irrelevant columns : 
* dataset (dataset name) 
* crawled_at (date the data was crawled)

In [None]:
total_data.drop(['dataset','crawled_at'], axis = 1, inplace = True)

## 2.2. NULL Values

Visualizing NULL count for each column

In [None]:
import Missing_Values_Lib

In [None]:
try:
    import missingno as msno
except ModuleNotFoundError as e:
    !{sys.executable} -m pip install missingno --user
    import missingno as msno       

In [None]:
msno.bar(total_data)

In the graph above, each number represents the non-null count for a certain column

### Total missing values

In [None]:
np.sum(np.sum(total_data.isnull()))

### Removing columns with 16,000 or more null values : 
* default_profile_image
* is_translator
* protected
* verified

In [None]:
threshold = 16000
# Walk over a snapshot of the column names and remove every column whose
# NULL count reaches the threshold.
for col_name in list(total_data.columns):
    null_count = total_data[col_name].isnull().sum()
    if null_count < threshold:
        continue
    print('removing ', col_name)
    total_data.drop(columns=[col_name], inplace=True)

After understanding the data type in each column, we replace the Nulls with the appropriate value

In [None]:
Missing_Values_Lib.Fill_Missing(total_data)
np.sum(np.sum(total_data.isnull()))

This is a preliminary filling. More precise work will be done in the feature engineering section.

# 3. Feature Engineering

## 3.1. Numeric Features

In [None]:
import General_Lib

In [None]:
# The following features are the numeric features from user data (before engineering)
numeric_features_list = ['favourites_count',
                 'followers_count',
                 'friends_count',
                 'listed_count',
                 'statuses_count',
                 'utc_offset',
                 'bot']

corr_mtx = General_Lib.Plot_Correlation_Matrix(total_data, numeric_features_list)

**No apparent connection between main numeric features to the target**<br>
**We can see there is a high correlation between:**
1. listed_count - followers count 
2. listed_count - friends count 
3. friends count - followers count

In [None]:
corr_mtx['bot']

**Each numeric feature is binned in order to get better correlation with the target feature on the following section. Features getting the highest correlation will replace current used features and will be added to the list below for future use:**

In [None]:
new_numeric_features = []

### 3.1.1. favourites_count

In [None]:
%matplotlib inline

sns.distplot(total_data[total_data.favourites_count<=150].favourites_count, bins = 10, kde = False)
plt.title('Histogram of Favourites Count')
plt.xlabel('Favourites')
plt.ylabel('Count')

**'0' has the highest frequency. We'll bin binarily if the value is 0 or not**

In [None]:
total_data['is_favourite'] = total_data['favourites_count'] > 0
print("new feature - target correlation is:",
      total_data['is_favourite'].corr(total_data['bot']))
print("old feature - target correlation is:",
        total_data['favourites_count'].corr(total_data['bot']))

In [None]:
new_numeric_features.append('is_favourite')

### 3.1.2. followers_count

In [None]:
%matplotlib inline

sns.distplot(total_data[total_data.followers_count<=800].followers_count, bins = 20, kde = False)
plt.title('Histogram of Followers Count')
plt.xlabel('Followers')
plt.ylabel('Count')

In [None]:
df_followers_count = total_data[total_data.followers_count<=300]
df_count = pd.DataFrame({'bot_followers': df_followers_count[df_followers_count.bot==1].followers_count,
                   'genuine_users_followers': df_followers_count[df_followers_count.bot==0].followers_count},
                   columns=['bot_followers', 'genuine_users_followers'])
plt.figure();
df_count.plot.hist(title = 'Bot Followers vs. Genuine Users Followers (<300)',bins= 10,alpha=0.5)  

In [None]:
print("Feature-target correlation:",total_data['followers_count'].corr(total_data['bot']))

**Binning to 5 equal width buckets**

In [None]:
bin_followers_count= General_Lib.Binning(total_data['followers_count'],5)
print("Feature-target correlation:",bin_followers_count.corr(total_data['bot']))

**Binning to 5 <u>un</u>equal width buckets**

In [None]:
total_data['bin_followers_count'] = General_Lib.Binning(total_data['followers_count'],
                                                        -1, 
                                                        bin_seq = [0,12,20,30,100,np.inf])

total_data.bin_followers_count = total_data.bin_followers_count.astype(int)
print("Feature-target correlation:",total_data['bin_followers_count'].corr(total_data['bot']))

**We chose the bins by choosing the binning that maximizes the correlation to the target.**

In [None]:
warnings.filterwarnings("ignore", category=FutureWarning) 
# Silence FutureWarning for the plotting cell that follows. Per the original
# author's note, only that cell emits it, reportedly because of an upcoming
# change in scipy.stats behaviour that cannot be fixed from this notebook.

In [None]:
%matplotlib inline

sns.distplot(total_data['bin_followers_count'], kde = False)
plt.title('Histogram of Followers Count Binning')
plt.xlabel('Followers Count Bins')
plt.ylabel('Count')

In [None]:
# Restore FutureWarning to the standard "default" action (shown once per
# location). The previous "always" action is not the default: it prints the
# warning on every occurrence.
warnings.filterwarnings("default", category=FutureWarning)

In [None]:
new_numeric_features.append('bin_followers_count')

### 3.1.3. friends_count

In [None]:
%matplotlib inline

sns.distplot(total_data[total_data.friends_count<=800].friends_count, bins = 20, kde = False)
plt.title('Histogram of Friends Count')
plt.xlabel('Friends')
plt.ylabel('Count')

In [None]:
# Compare the friends_count distribution of bots and genuine users,
# restricted to accounts with at most 500 friends.
df_friends_count = total_data[total_data.friends_count<=500]
# Label convention of this notebook: bot == 1 is a bot account and
# bot == 0 a genuine user (the two series used to be swapped).
df_count = pd.DataFrame({'bot_friends': df_friends_count[df_friends_count.bot==1].friends_count,
                   'genuine_users_friends': df_friends_count[df_friends_count.bot==0].friends_count},
                   columns=['bot_friends', 'genuine_users_friends'])
plt.figure();
# The title states the cut-off that is actually applied above.
df_count.plot.hist(title = 'Bot Friends vs. Genuine Users Friends (<=500)',bins= 20,alpha=0.5)

In [None]:
print("Feature-target correlation:",total_data['friends_count'].corr(total_data['bot']))

**Binning to 20 equal width buckets**

In [None]:
bin_friends_count = General_Lib.Binning(total_data['friends_count'],20)
print("Feature-target correlation:",bin_friends_count.corr(total_data['bot']))

**Binning to 5 <u>un</u>equal width buckets**

In [None]:
total_data['bin_friends_count'] = General_Lib.Binning(total_data['friends_count'],
                                                      -1, 
                                                      bin_seq = [0,2,10,50,90,np.inf])
print("Feature-target correlation:",total_data['bin_friends_count'].corr(total_data['bot']))

In [None]:
total_data['bin_friends_count'].value_counts(dropna=False)

In [None]:
new_numeric_features.append('bin_friends_count')

### 3.1.4. listed_count

In [None]:
%matplotlib inline

sns.distplot(total_data[total_data.listed_count<=10].listed_count, bins = 10, kde = False)
plt.title('Histogram of Listed Count')
plt.xlabel('Listed')
plt.ylabel('Count')

In [None]:
print("Feature-target correlation:",total_data['listed_count'].corr(total_data['bot']))

**Binary binning if the value of listed_count feature is zero or not** 

In [None]:
total_data['is_listed_count'] = total_data['listed_count'] > 0
print("Feature-target correlation:",total_data['is_listed_count'].corr(total_data['bot']))

**Binning to 8 equal width buckets**

In [None]:
listed_count_binning = General_Lib.Binning(total_data['listed_count'],8)
print("Feature-target correlation:",listed_count_binning.corr(total_data['bot']))

**Binning to 8 <u>un</u>equal width buckets**

In [None]:
total_data['listed_count_binning'] = General_Lib.Binning(total_data['listed_count'],
                                                         -1, 
                                                         bin_seq = [0,2,10,50,200,300,500,1000,np.inf])
total_data.listed_count_binning = total_data.listed_count_binning.astype(int)
print("Feature-target correlation:",total_data['listed_count_binning'].corr(total_data['bot']))

In [None]:
new_numeric_features.append('is_listed_count')

### 3.1.5. statuses_count

In [None]:
%matplotlib inline

sns.distplot(total_data[total_data.statuses_count<=500].statuses_count, bins = 10, kde = False)
plt.title('Histogram of Statuses Count')
plt.xlabel('Status')
plt.ylabel('Count')

In [None]:
print("Feature-target correlation:",total_data['statuses_count'].corr(total_data['bot']))

**Binning to 3 unequal width buckets**

In [None]:
total_data['statuses_count_binning'] = General_Lib.Binning(total_data['statuses_count'],
                                                           -1, 
                                                           bin_seq = [0,80,300, np.inf])
total_data.statuses_count_binning = total_data.statuses_count_binning.astype(int)
print("Feature-target correlation:",total_data['statuses_count_binning'].corr(total_data['bot']))

**Binning to 50 equal width buckets**

In [None]:
statuses_count_binning = General_Lib.Binning(total_data['statuses_count'],50)
print("Feature-target correlation:",statuses_count_binning.corr(total_data['bot']))

**As we can see binning 3 buckets has given us great correlation to the target**

In [None]:
new_numeric_features.append('statuses_count_binning')

### 3.1.6. utc_offset

In [None]:
%matplotlib inline

sns.distplot(total_data[total_data.utc_offset<=20000].utc_offset, bins = 10, kde = False)
plt.title('Histogram of Utc Count')
plt.xlabel('utc')
plt.ylabel('Count')

In [None]:
# Compare the utc_offset distribution of bots and genuine users,
# restricted to offsets of at most 20000.
df_utc_offset = total_data[total_data.utc_offset<=20000]
# Label convention of this notebook: bot == 1 is a bot account and
# bot == 0 a genuine user (the two series used to be swapped).
df_count = pd.DataFrame({'bot_utc_offset': df_utc_offset[df_utc_offset.bot==1].utc_offset,
                   'genuine_users_utc_offset': df_utc_offset[df_utc_offset.bot==0].utc_offset},
                   columns=['bot_utc_offset', 'genuine_users_utc_offset'])
plt.figure();
df_count.plot.hist(title = 'Bot UTC vs. Genuine Users UTC (<20000)',bins= 20,alpha=0.5)

In [None]:
print("Feature-target correlation:",total_data['utc_offset'].corr(total_data['bot']))

**Binning to 20 equal width buckets**

In [None]:
utc_offset_binning_1 = General_Lib.Binning(total_data['utc_offset'],20)
print("Feature-target correlation:",utc_offset_binning_1.corr(total_data['bot']))

**Binning to 3 unequal width buckets**

In [None]:
utc_offset_binning_2 = General_Lib.Binning(total_data['utc_offset'],
                                           -1, 
                                           bin_seq = [-30000, 0, 10000, 20000])
print("Feature-target correlation:",utc_offset_binning_2.corr(total_data['bot']))

**Binary binning if utc_offset is greater than zero or not**

In [None]:
total_data['is_utc_offset'] = total_data['utc_offset'] > 0
print("Feature-target correlation:",total_data['is_utc_offset'].corr(total_data['bot']))

**Binary binning has provided the best correlation to the target**

In [None]:
new_numeric_features.append('is_utc_offset')

### 3.1.7. New Numeric Features Correlation

Correlation matrix of high correlated features

In [None]:
new_numeric_features.append('bot')
corr_mtx = General_Lib.Plot_Correlation_Matrix(total_data, new_numeric_features)

In [None]:
corr_mtx['bot']

## 3.2. Date and Time

**Time and date features:** 
* created_at
* timestamp 
* updated 

We'll parse time and dates features to int features.

In [None]:
import Date_Parser_Lib

### 3.2.1. created_at feature

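#### Created_at feature contains two date formats:
1. Day_of_week Month Day_in_month Hour:Minute:Seconds Time_zone Year.
 * For example: Wed Jul 04 12:30:03 +0000 2007
2. Seconds from epoch (the stored values are in milliseconds and carry a trailing `L`)
 * For example: 1183552203000L, which is the same moment as the example above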

**We will convert the seconds-from-epoch values to the format described in bullet number 1. <br>
Then we will parse the converted feature to create 7 new numeric features:**
1. day of the week
2. month
3. day in month
4. hour
5. minute
6. second
7. year


In [None]:
feature_name = 'created_at'

**Check_For_Date_Format:** analyze the feature date format. <br>
<u>Return value: </u>
* seconds_from_epoch_data - samples contains seconds from epoch date format. <br>
* regular_date_format_data - all the remains samples.

In [None]:
seconds_from_epoch_data,regular_date_format_data = Date_Parser_Lib.Check_For_Date_Format(total_data, 
                                                                                         feature_name)

#####  parse seconds from epoch date format 
* ignoring time zone (+0000 on every row).
* Convert seconds from epoch to regular date format
* Example of seconds from epoch date format: 1183552203000L

In [None]:
seconds_from_epoch_data['created_at'].head(5)

**Convert seconds from epoch date format to regular format**

In [None]:
date_format_ceated_at = '%a %b %d %H:%M:%S +0000 %Y'
seconds_from_epoch_data[feature_name] = seconds_from_epoch_data[feature_name].apply(
                                        Date_Parser_Lib.convert_seconds_from_epoch_to_date_format,
                                        args = (date_format_ceated_at,))

seconds_from_epoch_data['created_at'].head(5)

**Concatenate the records**

In [None]:
total_data = pd.concat([seconds_from_epoch_data,regular_date_format_data])

**Validate that no value in seconds-from-epoch format remains**

In [None]:
Date_Parser_Lib.Rows_Contain_Seconds_From_Epoch_Format(total_data,feature_name)

**Parse created_at column to create new numeric features**

In [None]:
new_features_names,features_corr = Date_Parser_Lib.Parse_Feature_and_Print_Corr(total_data, 
                                                                  feature_name, 
                                                                  'bot', 
                                                                  date_format_ceated_at)

**New features correlation**

In [None]:
features_corr['bot']

* **We cannot rely on the creation year, as new bots are created all the time.**<br>
* **We will check the created_at_month feature.**<br>
 * We assume there may be a connection between the creation month and bot accounts. <br>
* **There isn't a distinct connection between the remaining new features and the target feature.**

In [None]:
total_data[total_data['bot']==1]['created_at_month'].value_counts(dropna = False)

In [None]:
total_data[total_data['bot']==0]['created_at_month'].value_counts(dropna = False)

**February and March months are the most frequent months for bot accounts creation** <br>
**Binning according to those months**

In [None]:
total_data['is_february_march'] = (total_data['created_at_month'] == 2) | (total_data['created_at_month'] == 3)
print("Feature-target correlation:",total_data['is_february_march'].corr(total_data['bot']))

**Binning according to top 4 months for bot accounts creation**

In [None]:
top_bot_months = [1,2,3,6]
# 1 when the account was created in one of the top bot-creation months,
# 0 otherwise. A vectorised membership test replaces the row-by-row
# DataFrame.apply(axis=1), which built a Series object for every record.
total_data['is_top_month'] = total_data['created_at_month'].isin(top_bot_months).astype(int)
print("Feature-target correlation:",total_data['is_top_month'].corr(total_data['bot']))

**We managed to create a new feature with a correlation slightly higher than that of the month feature.**

#### Remove created_at feature

In [None]:
total_data.drop([feature_name], axis = 1, inplace = True)

### 3.2.2. Updated column

**This column was added by the researchers and it is the same value at each dataset file.<br> 
We will remove this feature.**

In [None]:
total_data.drop(['updated'], axis = 1, inplace = True)

### 3.2.3. timestamp column

**This column was added by the researchers and isn't part of twitter's API.<br> 
We will remove this feature.**

In [None]:
total_data.drop(['timestamp'], axis = 1, inplace = True)

## 3.3. Textual User Features (without description)

 We divided the text features into 3 groups:
* Color features
* Location feature
* General text features

each group was treated in a slightly different way.

#### Text Features List

In [None]:
# Group the column names of total_data by dtype and keep the object-dtype
# (textual) ones. The dtypes must come from total_data itself: the previous
# code used `df`, a leftover loop variable holding only the last CSV that
# was read, so columns were classified by the dtypes of that single file.
groups_types = total_data.columns.to_series().groupby(total_data.dtypes).groups
str_cols_names = [v 
                  for k, v in groups_types.items() 
                  if k.name == 'object'][0]

# profile_text_color is always handled as a text (color) feature, even if
# it was not detected as an object column.
if 'profile_text_color' not in str_cols_names:
    str_cols_names = str_cols_names.append(pd.Index(['profile_text_color']))

print('{0}{1}{2}{3}{3}'.format(txt_format.BOLD,
                               txt_format.UNDERLINE,
                               'Text features:',
                               txt_format.END))

for col_name in str_cols_names:
    print(col_name)

### 3.3.1. General Text Features

For each general text feature we added 5 new numeric features:
* <u>Replace a string with unique value:</u> 
  * defined unique mapping from feature values to int and apply to the feature values.
* <u>Replace missing values by unique mapping by mode:</u>  
  * For the features created by the unique values, there are missing values (marked as empty strings). We will replace those values by the mode. 
  * We split the mode replacement into two options: 
   * If all the missing values belong to bot accounts: replace the missing values with the bot mode value of the feature.    
   * Otherwise, replace them with the feature mode. 
* <u>Replace missing values by unique mapping by distribution:</u> 
 * same as for the mode case, only we replace the missing values from feature distribution. 
* <u>Binning by is most common:</u>  
 * For features we detected a single value that is the most common value, we created a new feature indicating if the sample contains the most common value or not.  
* <u>Replace a string with length:</u>
 * Replace each string with its length.

In [None]:
import Text_Features_Lib


**Detect whether all missing values belong to bot accounts.**

In [None]:
def Is_All_Nan_of_Bot(df, feature_name):
    """Tell whether every missing value of a feature belongs to a bot.

    A value counts as missing when it equals the empty string.

    Args:
        df: data frame holding a 'bot' column (1 marks a bot account)
            and the column named by ``feature_name``.
        feature_name: name of the column to inspect.

    Returns:
        False when no bot record has a missing value; otherwise whether
        the number of missing values among bots equals the number of
        missing values in the whole column.
    """
    is_missing = df[feature_name] == ''
    missing_in_bots = np.sum(is_missing[df['bot'] == 1])
    if missing_in_bots == 0:
        return False
    missing_overall = np.sum(is_missing)
    return missing_overall == missing_in_bots

**most common values dictionary**

In [None]:
features_most_common_values = {
    'lang': 'en',
    'time_zone': '',
    'profile_banner_url': '',
    'profile_background_image_url': 'http://a0.twimg.com/images/themes/theme1/bg.png',
    'profile_background_image_url_https': 'https://si0.twimg.com/images/themes/theme1/bg.png',
    'profile_background_color': 'C0DEED',
    'profile_link_color': '0084B4',
    'profile_sidebar_border_color': 'C0DEED',
    'profile_sidebar_fill_color': 'DDEEF6',
    'profile_text_color':'333333',
    'url': 'https://t.co/DByWt45HZj'
}

**General features list**

In [None]:
g_txt_features = [feature 
                for feature in str_cols_names 
                if 'color' not in feature
                 and feature != 'description']
for feature in g_txt_features:
    print(feature)

**Create new features**

In [None]:
# The raw description column is removed from the data and from the list of
# text columns; g_txt_features (built earlier) already excludes it.
total_data.drop('description', axis = 1, inplace = True)
str_cols_names = str_cols_names.drop('description')

# Derive numeric features from every general text feature.
for feature_name in g_txt_features:
    # Decided before any normalisation of the column below: True when all
    # empty-string values of this feature belong to bot accounts.
    use_bot_mode = Is_All_Nan_of_Bot(total_data, feature_name)
    
    if feature_name == 'lang':
        # unite language codes that differ only by letter case
        total_data[feature_name] = total_data[feature_name].str.lower()
    
    # Receives total_data itself - presumably adds the new numeric columns
    # in place; verify against Text_Features_Lib.
    Text_Features_Lib.Add_Numeric_Features_From_Str(total_data, 
                                                    feature_name, 
                                                    features_most_common_values, 
                                                    use_bot_mode)
    
    # The original text column is dropped once processed, except for
    # 'location', which is still read by the location binning that follows.
    if feature_name != 'location':
        total_data.drop(feature_name, axis = 1, inplace = True)
        str_cols_names = str_cols_names.drop(feature_name)

### 3.3.2. Location Features

We created location bins for this feature, and assigned each location bin a unique number.
The bins were chosen by the frequency of appearance of locations - U.S., Italy, Europe without Italy, Asia + Oceania, NaN, and the rest of the world.

In [None]:
import Location_Lib

In [None]:
total_data['location_binning'] = total_data['location'].apply(Location_Lib.checkCountry)

In [None]:
print("Feature-target correlation:",total_data['location_binning'].corr(total_data['bot']))

In [None]:
total_data.drop('location', axis = 1, inplace = True)
str_cols_names = str_cols_names.drop('location')

### 3.3.3. Color Features
For each color feature we added 10 new numeric features:
* Creating 4 general numeric features from text features (mentioned on 3.3.1), except for the length feature
 * The color features contain 6 characters, hence the length feature is meaningless.
* <u> Binning by main colors: </u>
 * We defined 12 main colors: black, white, blue, red, green, pink, brown, purple, grey, yellow, orange, turquoise. We mapped the main colors to int values. We performed the binning by calculating the nearest color (defined as the color with the minimum distance to the current color). 
* <u>Replace missing values in color binning by mode:</u>  
 * Same as in the general case
* <u>Replace missing values in color binning by distribution:</u>  
 * Same as in the general case
* <u> Binning by top 3 colors: </u>
 * After binning by main colors, we will perform a more specific binning to the top 3 colors only for each feature. The top 3 colors do not include the NaN values, hence the total binning is of four colors (0 for the missing values). 
   * We tested several options for the top colors number, and discovered that 3 is the most effective number to choose. 
* <u>Replace missing values in top color binning by mode:</u>  
 * Same as in the general case
* <u>Replace missing values in top color binning by distribution:</u> 
 * Same as in the general case

In [None]:
import Color_Features_Lib

**Main colors dict**

In [None]:
main_colors = {
    (0,0,0): 1, # black
    (255,255,255): 2, # white
    (0,0,255): 3, # blue
    (255,0,0): 4, # red
    (0,255,0): 5, # green
    (255,192,203): 6, # pink
    (165,42,42): 7, # brown
    (128,0,128): 8, # purple
    (128,128,128): 9, # grey
    (255,255,0): 10, # yellow
    (255,165,0): 11, # orange
    (64,224,208): 12 # Turquoise
}

In [None]:
Color_Features_Lib.main_colors = main_colors

**Color features list**

In [None]:
color_features = [feature for feature in str_cols_names if 'color' in feature]
for feature in color_features: 
    print(feature)

**Create new features**

In [None]:
# Derive the numeric color features for every color column, then remove
# the raw column from the data and from the list of text columns.
for feature_name in color_features:
    # only profile_link_color is processed with the apply_str option on
    apply_str = (feature_name == 'profile_link_color')

    Color_Features_Lib.Prepare_Color_Features(
        total_data, feature_name, features_most_common_values, apply_str)

    total_data.drop(columns=[feature_name], inplace=True)
    str_cols_names = str_cols_names.drop(feature_name)

In [None]:
print(len(str_cols_names))

**All text features have been parsed**

In [None]:
from importlib import reload

### Correlation between new features to bot

In [None]:
for (key,value) in Text_Features_Lib.correlation_dict.items():
    print('{0}{1}{2}{3}{3}'.format(txt_format.BOLD,
                                   txt_format.UNDERLINE,
                                   key,
                                   txt_format.END))
    
    for (sub_key,sub_value) in value.items():
        print('{0} : {1}'.format(sub_key,sub_value))
    print('\n')

## 3.4. Tweet Features

The following blocks in comment describe how tweet_features_filled1.csv and tweet_var_data_full_with_threshold.csv were created

In [None]:
import Tweet_Features_Lib

To calculate the tweet variance and the range-variance feature, set the 'run_tweet_var_calculation' parameter at the beginning of the notebook to 1.

### Calculating tweet_var and 500<var<750

In [None]:
# Optional step, gated by the notebook flag `run_tweet_var_calculation`.
if run_tweet_var_calculation: #data is read in the functions
    # Generate the tweet datasets from the genuine / bot tweet directories.
    Tweet_Features_Lib.generate_all_tweets_datasets(DATASETS_DIR,TWEETS_DIR,
                                                   GENUINE_TWEETS_DIR,BOT_TWEETS_DIR)
    print('done generating files')
    # NOTE: this calls the library function that happens to share its name with the flag tested above.
    tweet_var_df = Tweet_Features_Lib.run_tweet_var_calculation()

### Loading Tweets Data - for tweet-metadata features calculation

In [None]:
if run_full_process:
    import Tweet_Features_Lib

    # Columns needed for the tweet-metadata features and the dtype each is read with.
    # NOTE(review): the key 'user_is' looks like a typo for 'user_id', so that column is
    # currently read with pandas' inferred dtype. It is deliberately left as-is because
    # the later merges may depend on the inferred type -- confirm before changing it.
    tweet_columns = ['id','text','user_id','retweet_count',
                     'favorite_count','num_hashtags','num_urls',
                     'num_mentions','created_at']
    tweet_dtypes = {'id':str,'text':str,'user_is':str,'retweet_count':float,
                    'favorite_count':float,'num_hashtags':float,'num_urls':float,
                    'num_mentions':float,'created_at':str}

    ### loading the tweet data
    tweets_dir = os.path.join(os.getcwd(), DATASETS_DIR, TWEETS_DIR)
    genuine_tweets_files,bot_tweets_files = Tweet_Features_Lib.get_tweets_files_names(DATASETS_DIR,
                                                                                      TWEETS_DIR,
                                                                                      GENUINE_TWEETS_DIR,
                                                                                      BOT_TWEETS_DIR
                                                                                     )

    # Turn the bare file names into full paths inside their respective directories.
    genuine_tweets_files = [os.path.join(tweets_dir, GENUINE_TWEETS_DIR, file)
                            for file in genuine_tweets_files]
    bot_tweets_files = [os.path.join(tweets_dir, BOT_TWEETS_DIR, file)
                        for file in bot_tweets_files]

    tweets_datasets = bot_tweets_files + genuine_tweets_files
    tweet_frames = []
    for file_name in tweets_datasets:
        print('loading ' + file_name)
        # Match dataset markers against the file name only, so that an unrelated
        # directory in the working path (e.g. one containing 'TFP') cannot flip a label.
        base_name = os.path.basename(file_name)

        df_t = pd.read_csv(file_name, usecols=tweet_columns, dtype=tweet_dtypes)

        if 'traditional_spambots_1_tweets_' in base_name:
            # This dataset needs its 'created_at' values converted by Date_Parser_Lib
            # (presumably from epoch seconds, going by the helper's name) before parsing.
            temp_date = df_t['created_at'].apply(Date_Parser_Lib.convert_seconds_from_epoch_to_date_format,
                                                 args = (date_format_ceated_at,))
            df_t['created_at'] = pd.to_datetime(temp_date)
        else:
            df_t['created_at'] = pd.to_datetime(df_t['created_at'])

        # genuine users => 0 , bots => 1
        if 'genuine' in base_name or 'E13' in base_name or 'TFP' in base_name:
            df_t['bot'] = 0
        else:
            df_t['bot'] = 1
        tweet_frames.append(df_t)

    # Concatenate once at the end: growing a DataFrame with pd.concat inside the
    # loop re-copies all previously loaded rows on every iteration.
    if tweet_frames:
        tweet_data = pd.concat(tweet_frames, ignore_index = True, sort=False)
    else:
        tweet_data = pd.DataFrame()

### Calculating features based on tweet metadata

In [None]:
if run_full_process:
    ### check for nulls
    # A bare expression nested inside an `if` is not echoed by the notebook,
    # so the diagnostics are printed explicitly.
    null_columns = tweet_data.columns[tweet_data.isnull().any()]
    print(tweet_data[null_columns].isnull().sum())

    ## remove tweets with no text
    tweet_data.dropna(subset=['text'], inplace=True)

    ## fill nan favorite count with 0
    tweet_data.fillna(value={'favorite_count': 0}, inplace=True)

    ### adding the proportion features
    user_data_partial = Tweet_Features_Lib.calc_prop_tweet_features(tweet_data)
    print('users with tweet features:', user_data_partial.shape[0])

**Since not all users in the dataset have tweets, we fill the nulls with the mean values of the matching class: human users get the mean values for humans and bots get the mean values for bots (done for each new feature). The mean values are computed in the following cell.**

In [None]:
if run_full_process:
    ### we want to fill nulls with mean according to bot/human:
    cols = ['p_retweet','p_favorites','p_hashtags','p_urls','p_mentions','avg_tweets_per_hour']

    # Only rows that have every tweet feature present contribute to the class means.
    human_data = user_data_partial[user_data_partial['bot']==0].dropna(subset=cols)
    bot_data = user_data_partial[user_data_partial['bot']==1].dropna(subset=cols)

    # these contain the mean value of each column, per class
    human_dict = {col: human_data[col].mean() for col in cols}
    bot_dict = {col: bot_data[col].mean() for col in cols}

    ### merge with total data
    # No `on=` is given, so pandas joins on every column the two frames share.
    temp_total = pd.merge(user_data_partial, total_data, how = 'outer')
    temp_total.dropna(subset=['id'], inplace=True) #remove lines without id, if exist

    # Per-column null counts. Printed explicitly because an expression inside an
    # `if` is not echoed by the notebook; .isnull().sum() is used rather than
    # np.sum(DataFrame) so the reduction axis is unambiguous.
    print(temp_total.isnull().sum())

**fill nulls**

In [None]:
if run_full_process:
    ### filling nulls and saving
    # Each class is filled with its own means. Rows whose 'bot' label is neither
    # 0 nor 1 match neither selection and are therefore absent from `tot`.
    human_data = temp_total[temp_total['bot'] == 0].fillna(human_dict)
    bot_data = temp_total[temp_total['bot'] == 1].fillna(bot_dict)
    tot = pd.concat([human_data, bot_data], ignore_index=True)

    # Only the id and the tweet-metadata features are written out.
    features = [
        'id',
        'p_retweet',
        'p_favorites',
        'p_hashtags',
        'p_urls',
        'p_mentions',
        'avg_tweets_per_hour',
    ]
    tot.to_csv('tweet_features_filled.csv', columns=features) #add Datasets

In [None]:
# Drop the index column left over from the CSV export, if it is (still) present.
# errors='ignore' keeps the cell re-runnable: a second run no longer raises
# KeyError once the column is gone.
levenshtein_data = levenshtein_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# after filling with mean
df_tweet_var = pd.DataFrame({'bot_tweet_var':levenshtein_data[levenshtein_data.bot==1].tweet_var,
                             'genuine_users_tweet_var': levenshtein_data[levenshtein_data.bot==0].tweet_var})
plt.figure()
df_tweet_var.plot.hist(title='Bot vs. Genuine Users Aerage Variation in Tweets')

### Visualization of Tweet Variance for Human Users and the Bot Users Types

In [None]:
# Load the per-group sample files (*_lev.csv), keeping only 'id' and 'tweet_var'.
trad_samp_file = os.path.join(DATASETS_DIR,'traditional_spambots_1_sample_lev.csv')
trad_samp = pd.read_csv(trad_samp_file, usecols=['id','tweet_var'])

soc1_samp_file = os.path.join(DATASETS_DIR,'social_spambots_1_sample_lev.csv')
soc1_samp = pd.read_csv(soc1_samp_file, usecols=['id','tweet_var'])

soc2_samp_file = os.path.join(DATASETS_DIR,'social_spambots_2_sample_lev.csv')
soc2_samp = pd.read_csv(soc2_samp_file, usecols=['id','tweet_var'])

soc3_samp_file = os.path.join(DATASETS_DIR,'social_spambots_3_sample_lev.csv')
# Read through soc3_samp_file like the other samples, so DATASETS_DIR is honoured
# rather than a hard-coded 'Datasets/...' path.
soc3_samp = pd.read_csv(soc3_samp_file, usecols=['id','tweet_var'])

fake_samp_file = os.path.join(DATASETS_DIR,'fake_followers_sample_lev.csv')
fake_samp = pd.read_csv(fake_samp_file, usecols=['id','tweet_var'])

human_samp_file = os.path.join(DATASETS_DIR,'human_100_frac0.1_lev.csv')
human_samp = pd.read_csv(human_samp_file, usecols=['id','tweet_var'])

In [None]:
# Histogram of tweet_var for each bot sample, drawn one after another after a
# single plt.figure() call; only the first call sets the title.
# NOTE(review): no labels/legend are passed, so the individual bot types cannot be told apart in the plot.
plt.figure()
trad_samp['tweet_var'].plot.hist(title='Variation in Tweets in Spambots')
fake_samp['tweet_var'].plot.hist()
soc1_samp['tweet_var'].plot.hist()
soc2_samp['tweet_var'].plot.hist()
soc3_samp['tweet_var'].plot.hist()

As we can see from the diagram above, most of the bots fall outside the range of 500 to 750 (roughly).

**statistical information of each bot type**

In [None]:
# Summary statistics of tweet_var for every bot type.
# Each summary is printed explicitly: only the last expression of a cell is
# echoed, so a bare describe() in the middle of the cell produces no output.
bot_type_samples = (
    ('traditional spambots 1', trad_samp),
    ('fake followers', fake_samp),
    ('social spambots 1', soc1_samp),
    ('social spambots 2', soc2_samp),
    ('social spambots 3', soc3_samp),
)
for bot_type_label, bot_type_sample in bot_type_samples:
    print(bot_type_label + ':')
    print(bot_type_sample['tweet_var'].describe())
    print(' ')

In [None]:
# Histogram of tweet_var for the genuine-user sample; the describe() summary is
# the cell's last expression, so the notebook echoes it below the plot.
plt.figure()
human_samp['tweet_var'].plot.hist(title='Variation in Tweets in Genuine Users')
human_samp['tweet_var'].describe()

**Adding tweet features to the total data**

In [None]:
total_data = pd.merge(user_data,total_data, how = 'outer')

In [None]:
total_data = pd.merge(levenshtein_data,total_data, how = 'outer')

In [None]:
# Rows without an id cannot be tied to an account; drop them first.
total_data = total_data.dropna(subset=['id'])
# Then drop rows missing the label or the tweet variance, restricted to the
# columns that actually exist: passing an absent column name to dropna(subset=...)
# raises KeyError, which an `or` test on the two names does not prevent.
required_cols = [col for col in ('bot', 'tweet_var') if col in total_data.columns]
if required_cols:
    total_data = total_data.dropna(subset=required_cols)

In [None]:
tweet_features = ['p_retweet','p_favorites','p_hashtags','p_urls','p_mentions','avg_tweets_per_hour','500<var<750']

### Tweets - Target Feature Correlation

In [None]:
corr_mtx = General_Lib.Plot_Correlation_Matrix(total_data, tweet_features + ['bot'])

In [None]:
corr_mtx['bot']

In [None]:
tweet_features_for_models = ['p_hashtags','p_mentions','500<var<750']

As we can see, p_hashtags, p_mentions and 500<var<750 are highly correlated with our target feature.

## 3.5. Description Feature

Handling the 'description' field required translating the descriptions to English in order to create a BoW - a character-limited process. In addition, dictionary creation involves a long preprocessing step that takes a long time to run.
Therefore, in some sections below we use the 'run_full_process' variable to control running time.

In [None]:
nltk.download('stopwords')

In [None]:
# Availability checks: try each import and, if the module is missing, install
# the package into the user site with the kernel's own interpreter.
# NOTE(review): the import is not retried after the install, so the imported
# names stay unbound in this cell on a first run -- re-run the cell (or restart
# the kernel) if they are needed directly.
try:
    from yandex.Translater import Translater
except ModuleNotFoundError as e:
    #see if need user flag in university
    !{sys.executable} -m pip install yandex-translater --user 

try:
    from textblob import TextBlob, Word
except ModuleNotFoundError as e:
    !{sys.executable} -m pip install textblob --user
    
# NOTE(review): nltk is already used earlier in the notebook (nltk.download above),
# so this check presumably never reaches the install branch -- confirm.
try:
    import nltk
except ModuleNotFoundError as e:
    !{sys.executable} -m pip install nltk --user

In [None]:
import Description

In [None]:
if run_full_process:
    # NLTK resources fetched before the description pipeline runs.
    for nltk_resource in ('punkt', 'wordnet'):
        nltk.download(nltk_resource)

    # Step 1: translate the descriptions.
    description_df = Description.description_trans(description_df)
    # Step 2: add language detection.
    description_df = Description.add_language_detection(description_df)
    # Step 3: build the BoW and extract the most important words from it.
    count_vectors_df, important_words = Description.find_important_words_from_bow(description_df)


### 3.5.1. Description Data

Description data includes original description column, translated description ('translation') and language detection ('description_lang'). We uploaded it, removed unnecessary columns added in csv creation and filled NA with empty string.

In [None]:
len(description_df)

In [None]:
# Remove the index column added by the CSV export; a no-op when it is absent.
description_df = description_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Replace missing texts with empty strings so the string operations below are safe.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form may act on a temporary copy and leave
# description_df unchanged (pandas warns about it, and Copy-on-Write makes it a no-op).
description_df['description'] = description_df['description'].fillna('')
description_df['translation'] = description_df['translation'].fillna('')

In [None]:
description_df.head(5)

### 3.5.2. Count Vectors

count vectors data frame includes count of each word in each description (each column is a word in the dictionary). In this section we removed unnecessary column that was created while exporting to csv.

In [None]:
count_vectors_df.head(5)

In [None]:
# Remove the index column added by the CSV export; a no-op when it is absent.
count_vectors_df = count_vectors_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
count_vectors_df.head(5)

### 3.5.3. Extracting Features From Description

#### Basic Feature Extraction

Features extracted from the description are:

* Count Number of Words In a Description
* Count Number of characters In a Description
* Find Average Word Length in a Description
* Count Number of Stopwords in a Description
* Count Number of Hashtags in a Description
* Count Number of Numerics in a Description
* Count Number of Uppercase Words in a Description

In [None]:
def avg_word(sentence):
    """Return the mean length of the whitespace-separated words in *sentence*.

    A sentence with no words (empty or whitespace only) yields 0.
    """
    tokens = sentence.split()
    if not tokens:
        return 0

    total_chars = 0
    for token in tokens:
        total_chars += len(token)
    return total_chars / len(tokens)

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [None]:
# Set lookup makes the per-word stopword test O(1) instead of scanning the list.
stop_set = set(stop)

# str.split() without an argument ignores repeated whitespace and returns [] for
# an empty string, so an empty description counts 0 words (split(" ") reports 1)
# and the count agrees with the tokenisation used by avg_word.
description_df['word_count'] = description_df['description'].apply(lambda text: len(str(text).split()))
description_df['char_count'] = description_df['description'].str.len() ## this also includes spaces
description_df['avg_word'] = description_df['description'].apply(avg_word)
# Stopwords and upper-case words are counted on the English translation.
description_df['stopwords'] = description_df['translation'].apply(
    lambda text: sum(1 for word in text.split() if word in stop_set))
description_df['hashtags'] = description_df['description'].apply(
    lambda text: sum(1 for word in text.split() if word.startswith('#')))
description_df['numerics'] = description_df['description'].apply(
    lambda text: sum(1 for word in text.split() if word.isdigit()))
description_df['upper'] = description_df['translation'].apply(
    lambda text: sum(1 for word in text.split() if word.isupper()))
description_df[['translation','word_count','char_count','avg_word','stopwords','hashtags','numerics','upper']].head()

#### Important Words

Important words is a list of the 10 most important words that were extracted by the BoW process.

In [None]:
# Fixed list of the important words used from here on.
# NOTE: this assignment replaces any `important_words` value computed earlier
# by Description.find_important_words_from_bow when run_full_process is set.
important_words = ['job','lover','love','student','life','follow','instagram','thing','italy','live']

In [None]:
# Keep the account id plus one count column per important word.
important_words_df = count_vectors_df[['main_id', *important_words]]

In [None]:
important_words_df.head(10)

Merging word count vectors with basic description features

In [None]:
description_df = description_df.merge(important_words_df, left_on="id",right_on="main_id")

In [None]:
description_df.drop(["main_id"],axis=1,inplace=True)

In [None]:
# Give every word-count column an explicit '<word>_appearances' name.
# Only the columns are renamed: passing index=str as well would convert every
# row label to a string, which silently breaks index alignment with other frames.
description_df = description_df.rename(columns={
    "job": "job_appearances",
    "lover": "lover_appearances",
    "love": "love_appearances",
    "student": "student_appearances",
    "life": "life_appearances",
    "follow": "follow_appearances",
    "instagram": "instagram_appearances",
    "thing": "thing_appearances",
    "italy": "italy_appearances",
    "live": "live_appearances",
})

#### Add 'contains_url'

If 'http' appeared in a description, we marked the description as one that contains a URL.

In [None]:
# Index juggling so that the 'http' counts and description_df can be matched
# row by row: reset the index of the 'http' column, create 'contains_url'
# initialised to 0, reset its index too, then copy description_df's index
# onto the 'http' column.
# NOTE(review): this assumes count_vectors_df and description_df hold the same
# rows in the same order -- confirm; matching on the account id would be safer.
count_vectors_df['http'] = count_vectors_df['http'].reset_index(drop=True)
description_df['contains_url'] = 0
description_df['contains_url'] = description_df['contains_url'].reset_index(drop=True)
count_vectors_df['http'].index = description_df['contains_url'].index

In [None]:
# 1 when the description contains 'http' at least once, else 0.
# The values are assigned positionally (.values drops the index), so the result
# does not depend on the two frames carrying identical index labels; a
# mismatched index would otherwise turn the whole column into NaN.
# NOTE(review): still assumes both frames hold the same rows in the same order.
description_df['contains_url'] = (count_vectors_df['http'].values > 0).astype(int)

In [None]:
description_df['contains_url'].unique()

#### Add Language Difference

Using 'description_lang' which was added to 'description_df' with Yandex service of language detection. If 'description_lang' is different from an account's language ('lang') it was marked with '1', otherwise '0'.

In [None]:
# 1 when the detected description language differs from the account's declared
# language ('lang'), otherwise 0. The comparison is `!=`: testing equality would
# flag the accounts whose languages MATCH, the opposite of what the column name says.
# Rows where either language is missing are set to 0, since no mismatch can be
# established for them.
lang_known = description_df['description_lang'].notna() & description_df['lang'].notna()
lang_differs = description_df['description_lang'] != description_df['lang']
description_df['different_lang'] = (lang_known & lang_differs).astype(int)

In [None]:
description_df['different_lang'].unique()

### 3.5.4. Description Features Visualization

**Calculating Pearson Correlation**

In [None]:
description_df.columns

In [None]:
# All description-derived features plus the target, for the correlation matrix.
features_for_correlation = ['word_count','char_count','avg_word','stopwords','hashtags','numerics',
                        'upper','job_appearances','lover_appearances','love_appearances',
                        'student_appearances','life_appearances', 'follow_appearances', 'instagram_appearances',
                        'thing_appearances', 'italy_appearances', 'live_appearances', 'contains_url',
                        'different_lang','bot']

# Plot the matrix and keep the returned object; its 'bot' entry is read in the next cell.
corr_mtx = General_Lib.Plot_Correlation_Matrix(description_df,features_for_correlation)

In [None]:
corr_mtx['bot']

#### Important Features Distribution

In [None]:
# Number of accounts, per class, whose description mentions 'job' at least once.
print("Number of spambots containing the word 'job' is",
      (description_df.loc[description_df.bot == 1, 'job_appearances'] > 0).sum(),
      "and number of genuine accounts containing 'job' is",
      (description_df.loc[description_df.bot == 0, 'job_appearances'] > 0).sum())

In [None]:
# Number of accounts, per class, whose description mentions 'follow' at least once.
print("Number of spambots containing the word 'follow' is",
      (description_df.loc[description_df.bot == 1, 'follow_appearances'] > 0).sum(),
      "and number of genuine accounts containing 'follow' is",
      (description_df.loc[description_df.bot == 0, 'follow_appearances'] > 0).sum())

In [None]:
# Report the count for both classes, like the 'job' / 'follow' cells above;
# a single figure labelled "accounts" but filtered on bot==0 covers genuine users only.
print("Number of spambots with different language than declared is",
      description_df[description_df.bot==1]['different_lang'].sum(),
      "and number of genuine accounts with different language than declared is",
      description_df[description_df.bot==0]['different_lang'].sum())

#### Merge Final Description Features with Total Data

In [None]:
# Account id followed by every description-derived feature column.
description_features = [
    'id',
    # basic text statistics
    'word_count',
    'char_count',
    'avg_word',
    'stopwords',
    'hashtags',
    'numerics',
    'upper',
    # important-word counts
    'job_appearances',
    'lover_appearances',
    'love_appearances',
    'student_appearances',
    'life_appearances',
    'follow_appearances',
    'instagram_appearances',
    'thing_appearances',
    'italy_appearances',
    'live_appearances',
    # binary flags
    'contains_url',
    'different_lang',
]

In [None]:
description_features_for_model = ['word_count','avg_word','stopwords','job_appearances','follow_appearances','different_lang']

In [None]:
description_df[description_features].head(5)

In [None]:
total_data = pd.merge(description_df[description_features],total_data, how = 'outer')

In [None]:
total_data.shape

In [None]:
total_data.dtypes

# 4. Data Visualization & Analysis

**Most visualization work is done on each part above**

## 4.1. Correlations

In the section below we choose the user features with the highest correlation to the target. For tweet and description features, we chose the highly correlated features above.

### Target - Feature Correlations: User Features

In [None]:
# Correlation methods passed to the library helper.
methods = ['pearson', 'kendall', 'spearman']
# Columns passed as the drop list: the tweet features, the description features
# (this list also contains 'id') and the target itself.
features_list_to_drop = tweet_features + description_features + ['bot']
General_Lib.Print_Target_Features_Correlation(total_data, 
                                              methods, 
                                              'bot', 
                                              features_list_to_drop)

In [None]:
# User features selected for the models.
top_base_correlated_features = [
    'is_favourite', 'statuses_count_binning', 'bin_followers_count',
    'profile_background_color_unique_dist', 'is_utc_offset', 'is_top_month',
    'geo_enabled', 'time_zone_most_common', 'profile_banner_url_len',
]

In [None]:
total_features_to_use = top_base_correlated_features + tweet_features_for_models + description_features_for_model

# 5. Data Division

Dividing data by the binary columns 'test_set_1' (used as the validation set) and 'test_set_2' (used as the test set). An account marked with '1' in one of these columns belongs to that set; otherwise it belongs to the training set.

In [None]:
train_data, test_set_1, test_set_2 = General_Lib.train_test_division(total_data)

# 6. Model

**Dividing train data into X values and y values**

In [None]:
y_train = train_data['bot']
x_train = train_data[total_features_to_use]

## 6.1. Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn import tree
# Entropy-based decision tree; class_weight gives label 1 (bots) twice the
# weight of label 0 (genuine users).
# NOTE(review): no random_state is passed, unlike the random forest below, so
# the fitted tree may not be identical between runs -- confirm whether that matters.
DTclassifier = DecisionTreeClassifier(criterion='entropy', class_weight = {0:1,1:2})  
DTclassifier.fit(x_train, y_train)

## 6.2. Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# NOTE: despite the variable name this is a *regressor* fitted on the 0/1 labels.
# Its continuous predictions are used as scores and thresholded at 0.5 in the
# evaluation cells further down.
RFclassifier = RandomForestRegressor(n_estimators=20, random_state=0)  
RFclassifier.fit(x_train, y_train)   

In [None]:
# One row per training feature with the forest's importance score,
# ordered from most to least important.
feature_importances = pd.DataFrame({'importance': RFclassifier.feature_importances_},
                                   index=x_train.columns)
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

In [None]:
feature_importances

## 6.3. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
LRclassifier = LogisticRegression(solver='liblinear')
LRclassifier.fit(x_train, y_train)

In [None]:
LRclassifier.coef_ 

# 7. Model Evaluation

In [None]:
import Evaluation_Results_Lib

**Dividing validation and test data into X values and y values**

In [None]:
y_validation = test_set_1['bot']
x_validation = test_set_1[total_features_to_use]

In [None]:
y_test = test_set_2['bot']
x_test = test_set_2[total_features_to_use]

## 7.1. Decision Trees

### 7.1.1. Validation Set

In [None]:
y_pred_descisionTree = DTclassifier.predict(x_validation)  
y_pred_descisionTree_probs = DTclassifier.predict_proba(x_validation)

In [None]:
Evaluation_Results_Lib.print_results(y_validation, 
                                     y_pred_descisionTree,  
                                     classifier_name = 'Decision_Tree')

### 7.1.2. Test Set

In [None]:
y_pred_descisionTree_test = DTclassifier.predict(x_test)  
y_pred_descisionTree_probs_test = DTclassifier.predict_proba(x_test)

In [None]:
Evaluation_Results_Lib.print_results(y_test, 
                                     y_pred_descisionTree_test, 
                                     classifier_name = 'Decision_Tree')

## 7.2. Random forest

### 7.2.1. Validation Set

In [None]:
# Random forest
y_pred_forest = RFclassifier.predict(x_validation)
y_pred_forest_probs = y_pred_forest

In [None]:
# Turn the forest's continuous scores into hard 0/1 labels (cut-off 0.5).
y_pred_forest = (y_pred_forest >= 0.5).astype(int)

In [None]:
np.unique(y_pred_forest)

In [None]:
Evaluation_Results_Lib.print_results(y_validation, 
                                     y_pred_forest, 
                                     classifier_name = 'Random_Forest')

### 7.2.2. Test Set

In [None]:
# Random forest
y_pred_forest_test = RFclassifier.predict(x_test)
y_pred_forest_probs_test = y_pred_forest_test

In [None]:
# Turn the forest's continuous scores into hard 0/1 labels (cut-off 0.5).
y_pred_forest_test = (y_pred_forest_test >= 0.5).astype(int)

In [None]:
Evaluation_Results_Lib.print_results(y_test, 
                                     y_pred_forest_test,  
                                     classifier_name = 'Random_Forest')

## 7.3. Logistic regression

### 7.3.1. Validation Set

In [None]:
y_pred_lr = LRclassifier.predict(x_validation)
y_pred_lr_probs = LRclassifier.predict_proba(x_validation)

In [None]:
Evaluation_Results_Lib.print_results(y_validation, 
                                     y_pred_lr, 
                                     classifier_name = 'Logistic_Regression')

### 7.3.2. Test Set

In [None]:
y_pred_lr_test = LRclassifier.predict(x_test)
y_pred_lr_probs_test = LRclassifier.predict_proba(x_test)

In [None]:
Evaluation_Results_Lib.print_results(y_test, 
                                     y_pred_lr_test, 
                                     classifier_name = 'Logistic_Regression')

## 7.4. Collective Model

This model classifies by the majority in all models

### 7.4.1. Validation Set

In [None]:
# One column of hard 0/1 predictions per model on the validation set.
res_dict = {'Decision_Trees':y_pred_descisionTree, 
            'Random_Forest':y_pred_forest,
            'LG':y_pred_lr
           }

results = pd.DataFrame(data = res_dict)
# Majority vote: the row-wise mode of the three votes (first mode column).
results['Collective_Model'] = results.mode(axis = 1)[0]

In [None]:
Evaluation_Results_Lib.print_results(y_validation, 
                                     results['Collective_Model'],  
                                     classifier_name = 'Collective_Model')

### 7.4.2. Test Set

In [None]:
# One column of hard 0/1 predictions per model on the test set.
res_dict_test = {'Decision_Trees':y_pred_descisionTree_test, 
                 'Random_Forest':y_pred_forest_test,
                 'LG':y_pred_lr_test
                }

results_test = pd.DataFrame(data = res_dict_test)
# Majority vote: the row-wise mode of the three votes (first mode column).
results_test['Collective_Model'] = results_test.mode(axis = 1)[0]

In [None]:
Evaluation_Results_Lib.print_results(y_test, 
                                     results_test['Collective_Model'],  
                                     classifier_name = 'Collective_Model')

## 7.5. ROC\AUC

### 7.5.1 Validation Set 

In [None]:
Evaluation_Results_Lib.Plot_Metrics(y_validation,
                                    y_pred_descisionTree_probs[:,1], 
                                    'Descision_Tree')

In [None]:
Evaluation_Results_Lib.Plot_Metrics(y_validation,
                                    y_pred_forest_probs, 
                                    'Random_Forest')

In [None]:
Evaluation_Results_Lib.Plot_Metrics(y_validation,
                                    y_pred_lr_probs[:,1], 
                                    'Logistic_Regression')

### 7.5.2. Test Set 

In [None]:
Evaluation_Results_Lib.Plot_Metrics(y_test,
                                    y_pred_descisionTree_probs_test[:,1], 
                                    'Descision_Tree')

In [None]:
Evaluation_Results_Lib.Plot_Metrics(y_test,
                                    y_pred_forest_probs_test, 
                                    'Random_Forest')

In [None]:
Evaluation_Results_Lib.Plot_Metrics(y_test,
                                    y_pred_lr_probs_test[:,1], 
                                    'Logistic_Regression')

## 7.6. Robustness Test: K-Fold for the Best Model

Cross-validation with the model that yields the best results : **Collective Model**

In [None]:
# 10-fold cross-validation of the majority-vote ensemble on the training data.
# random_state pins the shuffled split so the robustness check is reproducible.
kfold = KFold(n_splits=10, shuffle = True, random_state=0)
rfc = RandomForestRegressor(n_estimators=20, random_state=0)
lrc = LogisticRegression(solver='liblinear')
dtc = DecisionTreeClassifier(criterion='entropy', class_weight = {0:1,1:2})
for train_indices, test_indices in kfold.split(x_train):
    x_fold_train, y_fold_train = x_train.iloc[train_indices], y_train.iloc[train_indices]
    x_fold_test, y_fold_test = x_train.iloc[test_indices], y_train.iloc[test_indices]

    # fit() re-trains each estimator from scratch on the current fold.
    rfc.fit(x_fold_train, y_fold_train)
    lrc.fit(x_fold_train, y_fold_train)
    dtc.fit(x_fold_train, y_fold_train)

    # Fold-local names: the validation-set predictions computed earlier
    # (e.g. y_pred_lr) must not be overwritten by per-fold results.
    fold_pred_rf = (rfc.predict(x_fold_test) >= 0.5).astype(int)  # regressor score -> 0/1 label
    fold_pred_lr = lrc.predict(x_fold_test)
    fold_pred_dt = dtc.predict(x_fold_test)

    r = pd.DataFrame(data = {'Decision_Trees': fold_pred_dt,
                             'Random_Forest': fold_pred_rf,
                             'LG': fold_pred_lr})
    # Majority vote: the row-wise mode of the three votes (first mode column).
    r['Collective_Model'] = r.mode(axis = 1)[0]
    Evaluation_Results_Lib.print_results(y_fold_test, r['Collective_Model'],
                                         classifier_name = 'Collective Model')
    print("------------------------------------------------------------------------")

As seen above, results are consistent for each fold, hence the best model is robust. 