# Modeling

In [134]:
# This file will train the models.
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle
from sklearn.feature_selection import SelectFromModel

from statistics import multimode
from cmath import nan
import numpy as np


import seaborn as sns
pd.set_option("display.max_rows", None, "display.max_columns", None)

from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import cross_val_score


# Functions

In [135]:
def models(X_train,Y_train):
      
  #Using Logistic Regression Algorithm to the Training Set
  from sklearn.linear_model import LogisticRegression
  log = LogisticRegression(random_state = 0)
  log.fit(X_train, Y_train)  
  
  #Using KNeighborsClassifier Method of neighbors class to use Nearest Neighbor algorithm
  from sklearn.tree import DecisionTreeClassifier
  tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
  tree.fit(X_train, Y_train)

  #Using RandomForestClassifier method of ensemble class to use Random Forest Classification algorithm
  from sklearn.ensemble import RandomForestClassifier
  forest = RandomForestClassifier()
  forest.fit(X_train, Y_train)
  
  # Niave
  from sklearn.naive_bayes import GaussianNB
  naive = GaussianNB()
  naive.fit(X_train, Y_train)

  #linearSVM
  from sklearn.svm import LinearSVC
  svm = LinearSVC()
  svm.fit(X_train,Y_train)



  #print model accuracy on the training data.
  print('[0]Logistic Regression Training Accuracy:', log.score(X_train, Y_train))
  print('[1]Decsion Tree Classifier Training Accuracy:', tree.score(X_train, Y_train))
  print('[2]Random Forest Classifier Training Accuracy:', forest.score(X_train, Y_train))
  print('[3]naive Classifier Training Accuracy:', naive.score(X_train, Y_train))
  print('[4]linearSVM Classifier Training Accuracy:', svm.score(X_train, Y_train))

  
  return log, tree, forest,naive,svm

In [136]:
def testSet(set):
    test = set.loc[set.Protest == -1]
    test.drop('Protest', axis=1, inplace = True)
    train = set.loc[set.Protest != -1]
    
    num_features = 14
    X = train.iloc[:, 1:num_features].values 
    Y = train.iloc[:, 0].values
    
    # Split the dataset into 80% Training set and 20% Testing set
    from sklearn.model_selection import train_test_split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

    #Get and train all of the models
    model = models(X_train,Y_train)

    #Print Confusion Matrices
    for i in range(len(model)):
        cm = confusion_matrix(Y_test, model[i].predict(X_test)) 
        #extracting TN, FP, FN, TP
        TN, FP, FN, TP = confusion_matrix(Y_test, model[i].predict(X_test)).ravel()
        print('Model[{}] Testing Accuracy = "{} !"'.format(i,  (TP + TN) / (TP + TN + FN + FP)))

In [137]:
def reccommended_features(X_train,Y_train):
    forest = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    select_forest = SelectFromModel(estimator=forest)
    new_X = select_forest.fit(X,Y) 
    selected_feat= X.columns[(select_forest.get_support())]
    len(selected_feat)

    print(selected_feat)

In [138]:
def get_grid(X_train, Y_train):
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV
    import numpy as np

    max_features_range = np.arange(1,5,1)
    n_estimators_range = np.arange(10,180,10)
    random_state_range = np.arange(0,5,1)

    param_grid = dict(max_features=max_features_range, 
                    n_estimators=n_estimators_range)

    # Inputting random state here gives better results than when inputting later. 
    rf = RandomForestClassifier(random_state = 0, criterion = 'entropy')
    grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
    grid.fit(X_train, Y_train)
    print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))
    return(grid)

In [139]:
def grid_plot(grid):
    import pandas as pd
    grid_results = pd.concat([pd.DataFrame(grid.cv_results_["params"]),pd.DataFrame(grid.cv_results_["mean_test_score"], columns=["Accuracy"])],axis=1)
    grid_contour = grid_results.groupby(['max_features','n_estimators']).mean()
    grid_reset = grid_contour.reset_index()
    grid_reset.columns = ['max_features', 'n_estimators',  'Accuracy']
    grid_pivot = grid_reset.pivot('max_features', 'n_estimators')
    x = grid_pivot.columns.levels[1].values
    y = grid_pivot.index.values
    z = grid_pivot.values
    import plotly.graph_objects as go
    # X and Y axes labels
    layout = go.Layout(
                xaxis=go.layout.XAxis(
                title=go.layout.xaxis.Title(
                text='n_estimators')
                ),
                yaxis=go.layout.YAxis(
                title=go.layout.yaxis.Title(
                text='max_features') 
                ) )

    fig = go.Figure(data= [go.Surface(z=z, y=y, x=x)], layout=layout )
    fig.update_layout(title='Hyperparameter tuning',
                    scene = dict(
                        xaxis_title='n_estimators',
                        yaxis_title='max_features',
                        zaxis_title='Accuracy'),
                    autosize=False,
                    width=800, height=800,
                    margin=dict(l=65, r=50, b=65, t=90))
    fig.show()

# Main

In [140]:
training = pd.read_csv('DATA/training_data.csv')
training = training.drop('Unnamed: 0', axis = 1)
training = training[['Protest', 'NuTweets', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8',
       'd9', 'd10', 'average', 'likes', 'followers', 'retweets', 'replies',
       'sub', 'pol']]

training = training.loc[((training['Protest']== 1) & (training['NuTweets'] > 20) )| (training['Protest']==0)]
training.head(1)

Unnamed: 0,Protest,NuTweets,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,average,likes,followers,retweets,replies,sub,pol
0,1,55,0.142857,0.071429,0.071429,0.642857,0.857143,1.0,0.428571,0.571429,0.142857,0.142857,5.7,0.563636,76095.436364,1.781818,0.236364,0.392344,0.136253


# Testing

In [141]:
testSet(training)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[0]Logistic Regression Training Accuracy: 0.7959183673469388
[1]Decsion Tree Classifier Training Accuracy: 1.0
[2]Random Forest Classifier Training Accuracy: 1.0
[3]naive Classifier Training Accuracy: 0.7551020408163265
[4]linearSVM Classifier Training Accuracy: 0.7687074829931972
Model[0] Testing Accuracy = "0.6486486486486487 !"
Model[1] Testing Accuracy = "0.8378378378378378 !"
Model[2] Testing Accuracy = "0.8108108108108109 !"
Model[3] Testing Accuracy = "0.7837837837837838 !"
Model[4] Testing Accuracy = "0.8108108108108109 !"



Liblinear failed to converge, increase the number of iterations.



# Suggestion maker

In [142]:
training = training.loc[((training['Protest']== 1) & (training['NuTweets'] > 20) )| (training['Protest']==0)]


In [143]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [144]:
X = training.drop('Protest',axis = 1)
Y = training['Protest']

In [145]:
reccommended_features(X,Y)

Index(['NuTweets', 'average', 'followers', 'retweets', 'sub'], dtype='object')


# Testing Different Features

In [146]:
subs = []

# 1
temp = training[['Protest', 'NuTweets', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8',
       'd9', 'd10', 'average', 'likes', 'followers', 'retweets', 'replies',
       'sub', 'pol']]
subs.append(temp)
# 2
temp = training[['Protest', 'NuTweets', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8',
       'd9', 'd10', 'average']]
subs.append(temp)
# 3
temp = training[['Protest', 'NuTweets',  'average', 'likes', 'followers', 'retweets', 'replies',
       'sub', 'pol']]
subs.append(temp)
# 4
temp = training[['Protest', 'NuTweets', 'd7', 'average']]
subs.append(temp)
# 5
temp = training[['Protest', 'NuTweets',  'd3', 'd4', 'd5', 'd6', 'd7', 'average']]
subs.append(temp)
# 6
temp = training[['Protest', 'NuTweets', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8',
       'd9', 'd10', 'average', 'likes', 'followers', 'retweets','sub', 'pol']]
subs.append(temp)


In [147]:
rf = RandomForestClassifier(max_features=4, n_estimators=100, random_state = 0)
count = 0
for sub in subs:
    X_train, X_test, Y_train, Y_test = train_test_split(sub.drop('Protest',axis=1), training['Protest'], test_size = 0.2, random_state = 13)
    rf.fit(X_train, Y_train)
    rf.score(X_test, Y_test)
    Y_pred = rf.predict(X_test)
    count += 1
    print('Feature secelction [%i] '%count, accuracy_score(Y_pred, Y_test))

Feature secelction [1]  0.8108108108108109
Feature secelction [2]  0.7837837837837838
Feature secelction [3]  0.8108108108108109
Feature secelction [4]  0.7297297297297297
Feature secelction [5]  0.7837837837837838
Feature secelction [6]  0.7837837837837838


### Feature Analysis
From the Above results we notice that selections [1] and [2] have the highest accuracy scores.</br>
There fore we will use feature selection [1] for the Hyperparam analysis

# Hyperparams

In [148]:
training.head(1)

Unnamed: 0,Protest,NuTweets,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,average,likes,followers,retweets,replies,sub,pol
0,1,55,0.142857,0.071429,0.071429,0.642857,0.857143,1.0,0.428571,0.571429,0.142857,0.142857,5.7,0.563636,76095.436364,1.781818,0.236364,0.392344,0.136253


In [149]:
# X and Y as defined above
# Split the dataset into 80% Training set and 20% Testing set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(training.drop('Protest',axis=1), training['Protest'], test_size = 0.2, random_state = 0)

In [150]:
rf = RandomForestClassifier(max_features=4, n_estimators=100, random_state = 0)
rf.fit(X_train, Y_train)
rf.score(X_test, Y_test)
Y_pred = rf.predict(X_test)
print(accuracy_score(Y_pred, Y_test))
print(rf.score(X_test,Y_test))

0.8108108108108109
0.8108108108108109


In [151]:
grid = get_grid(X_train,Y_train)

The best parameters are {'max_features': 2, 'n_estimators': 20} with a score of 0.90


In [152]:
grid_plot(grid)

# Model Analysis

In [None]:
cross 

# Creating the Fully trained model. 

In [155]:
X = training.drop('Protest',axis=1)
Y = training['Protest']

In [156]:
randomforest = RandomForestClassifier(n_estimators = 160, max_features = 3, random_state = 0, criterion = 'entropy')
randomforest = randomforest.fit(X,Y)

In [157]:
def save_model(model, filename):
    pickle.dump(model, open('DATA/'+filename, 'wb'))

def load_model(filename):
    loaded_model = pickle.load(open('DATA/'+filename, 'rb'))
    return(loaded_model)



In [158]:
save_model(randomforest,'RF')