In [62]:
#@title Everything in Part 1 copied over

!pip install -U -q PyDrive
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from collections import defaultdict

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam, RMSprop
from keras.utils import plot_model

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

stockData = drive.CreateFile({'id':"1Gtf_20aW4H81JRwjvdKozlF49sXpZzID"}) 
stockData.GetContentFile('stock_data.csv')
cleaned_data = pd.read_csv('stock_data.csv')
cleaned_data.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Text,Sentiment
0,kicker on my watchlist xide tit soq pnk cpw bp...,1
1,user aap movie return for the fea geed indicat...,1
2,user i d be afraid to short amzn they are look...,1
3,mnta over,1
4,oi over,1


In [63]:
X = cleaned_data['Text']
y = cleaned_data['Sentiment']

vec = CountVectorizer(max_features=4000)
X = vec.fit_transform(X).toarray()

pca = PCA(n_components=256)
X = pca.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Combining Everything Together

Now that we have optimized every model individually, let's see how they look together:

In [65]:
modelList = []
modelList.append(("KNN",KNeighborsClassifier(n_neighbors=30)))
modelList.append(("Linear Regression",RidgeClassifier(alpha=0)))
modelList.append(("Logistic Regression",LogisticRegression(C=0.45)))
modelList.append(("Neural Network", MLPClassifier(hidden_layer_sizes=(5, 5), activation='logistic', alpha=0.01, learning_rate_init=0.001)))
modelList.append(("Naive Bayes",BernoulliNB()))
modelList.append(("Gaussian Discriminant Analysis",GaussianNB()))
modelList.append(("SVM",SVC(C=1.25)))
modelList.append(("Decision Trees",DecisionTreeClassifier(criterion = "entropy", min_samples_split = 8)))

ls = []

for name, model in modelList:
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    ls.append(y_pred)
    print("{} Accuracy: {}".format(name,accuracy_score(y_test,y_pred)))

KNN Accuracy: 0.7152717860224331
Linear Regression Accuracy: 0.7540983606557377
Logistic Regression Accuracy: 0.7575496117342536
Neural Network Accuracy: 0.7661777394305436
Naive Bayes Accuracy: 0.6850733390854185
Gaussian Discriminant Analysis Accuracy: 0.6721311475409836
SVM Accuracy: 0.7730802415875755
Decision Trees Accuracy: 0.6479723899913719


Which answers our first question: Which model does best? We put them in a table so it looks good:



Model | Accuracy (varies)
--- | --- 
SVM | 0.7730 
Neural Network | 0.7662
Logistic Regression | 0.7575
Linear Regression | 0.7541 
KNN | 0.7153
Naive Bayes | 0.6851
Gaussian Discriminant Analysis | 0.6721
Decision Trees | 0.6480

Now to answer our second question: Is there some way to put them together to get an even better accuracy? As in, can we combine the output and obtain something better? Well, that's just another classification algorithm, so let's just do everything again!

In [66]:
'''
We put the outputs of all the models together in one dataframe
'''

df = pd.DataFrame(np.transpose(ls))
df['Sentiment'] = cleaned_data['Sentiment']
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,Sentiment
0,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1
2,-1,1,1,-1,1,1,-1,-1,1
3,1,1,1,1,-1,1,1,-1,1
4,1,-1,-1,-1,-1,-1,-1,-1,1


In [68]:
'''
We run the algorithms again with this new dataset
'''
X2 = df.iloc[:,:7]
y2 = df.iloc[:,7]

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=40)

modelList2 = []
modelList2.append(("KNN",KNeighborsClassifier()))
modelList2.append(("Linear Regression",RidgeClassifier()))
modelList2.append(("Logistic Regression",LogisticRegression()))
modelList2.append(("Neural Network",MLPClassifier()))
modelList2.append(("Naive Bayes",BernoulliNB()))
modelList2.append(("Gaussian Discriminant Analysis",GaussianNB()))
modelList2.append(("SVM",SVC()))
modelList2.append(("Decision Trees",DecisionTreeClassifier()))

ls = []

for name, model in modelList:
    model.fit(X2_train,y2_train)
    y2_pred = model.predict(X2_test)
    ls.append(y2_pred)
    print("{} Accuracy: {}".format(name,accuracy_score(y2_test,y2_pred)))

KNN Accuracy: 0.7068965517241379
Linear Regression Accuracy: 0.6681034482758621
Logistic Regression Accuracy: 0.6637931034482759
Neural Network Accuracy: 0.7068965517241379
Naive Bayes Accuracy: 0.6939655172413793
Gaussian Discriminant Analysis Accuracy: 0.6896551724137931
SVM Accuracy: 0.6896551724137931
Decision Trees Accuracy: 0.6982758620689655


Where we realize that the highest accuracy we have is still lower than the SVM above. However, we are not convinced this is the case, so we shall apply both models to another scenario to see which performs better.