In [5]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge
import matplotlib.pyplot as plt
import pickle
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import seaborn as sn
#from sklearn import datasets
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import cv2
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint
from scipy.stats import norm
import random
from sklearn.pipeline import Pipeline
import pandasql as psql
import tensorflow as tf
from tensorflow import keras

df = pd.read_csv("/kaggle/input/stockmarket-sentiment-dataset/stock_data.csv")
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [13]:
df.Sentiment.value_counts()

 1    3685
-1    2106
Name: Sentiment, dtype: int64

In [19]:
import re
document = []

for index, row in df.iterrows():
    document.append((re.sub(r"[^a-zA-Z]"," ",row["Text"]), row["Sentiment"]))

In [20]:
document[:2]

[('Kickers on my watchlist XIDE TIT SOQ PNK CPW BPZ AJ  trade method   or method    see prev posts',
  1),
 ('user  AAP MOVIE      return for the FEA GEED indicator just    trades for the year   AWESOME   ',
  1)]

In [22]:
from nltk.tokenize import word_tokenize

parsed_document = []

for text,sentiment in document:
    parsed_document.append((word_tokenize(text), sentiment))

parsed_document[:1]

[(['Kickers',
   'on',
   'my',
   'watchlist',
   'XIDE',
   'TIT',
   'SOQ',
   'PNK',
   'CPW',
   'BPZ',
   'AJ',
   'trade',
   'method',
   'or',
   'method',
   'see',
   'prev',
   'posts'],
  1)]

In [23]:
document = parsed_document

In [24]:
import nltk
from nltk.corpus import stopwords
stopWords = stopwords.words("english")

parsed_document = []

for words, sentiment in document:
    parsed_document.append(([word for word in words if word not in stopWords], sentiment))
parsed_document[:1]

[(['Kickers',
   'watchlist',
   'XIDE',
   'TIT',
   'SOQ',
   'PNK',
   'CPW',
   'BPZ',
   'AJ',
   'trade',
   'method',
   'method',
   'see',
   'prev',
   'posts'],
  1)]

In [26]:
document = parsed_document

In [27]:
parsed_document = []

for words, sentiment in document:
    parsed_document.append(([word.lower() for word in words if word], sentiment))
parsed_document[:1]

[(['kickers',
   'watchlist',
   'xide',
   'tit',
   'soq',
   'pnk',
   'cpw',
   'bpz',
   'aj',
   'trade',
   'method',
   'method',
   'see',
   'prev',
   'posts'],
  1)]

In [28]:
document = parsed_document

In [29]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

parsed_document = []

for words, sentiment in document:
    parsed_document.append(([ps.stem(word) for word in words if word], sentiment))
parsed_document[:1]

[(['kicker',
   'watchlist',
   'xide',
   'tit',
   'soq',
   'pnk',
   'cpw',
   'bpz',
   'aj',
   'trade',
   'method',
   'method',
   'see',
   'prev',
   'post'],
  1)]

In [32]:
document = parsed_document

In [33]:
all_words = []

for words, sentiment in document:
    for word in words:
        all_words.append(word)

all_words[:15]

['kicker',
 'watchlist',
 'xide',
 'tit',
 'soq',
 'pnk',
 'cpw',
 'bpz',
 'aj',
 'trade',
 'method',
 'method',
 'see',
 'prev',
 'post']

In [39]:
from nltk import FreqDist
freDistWords = FreqDist(all_words)
freDistWords.most_common(15)

[('aap', 929),
 ('co', 711),
 ('http', 696),
 ('user', 648),
 ('i', 559),
 ('short', 522),
 ('day', 385),
 ('stock', 372),
 ('today', 347),
 ('like', 334),
 ('look', 327),
 ('volum', 308),
 ('market', 291),
 ('buy', 290),
 ('long', 282)]

In [43]:
len(freDistWords)

7273

In [47]:
for i,j in freDistWords.most_common(15):
    print(i)

aap
co
http
user
i
short
day
stock
today
like
look
volum
market
buy
long


In [55]:
lmt = 5000
wordFeatures = []

for i,j in freDistWords.most_common(lmt):
    wordFeatures.append(i)

In [56]:
wordFeatures[:15]

['aap',
 'co',
 'http',
 'user',
 'i',
 'short',
 'day',
 'stock',
 'today',
 'like',
 'look',
 'volum',
 'market',
 'buy',
 'long']

In [66]:
def getFeatures(document):
    document = set(document)
    features = {}

    for word in wordFeatures:
        features[word] = (word in document)

    return features


In [70]:
feature_set = [(getFeatures(document),sentiment) for document, sentiment in document]

In [71]:
len(feature_set)

5791

In [73]:
train_set, test_set = feature_set[:4500], feature_set[4500:]

In [75]:
from nltk.classify.scikitlearn import SklearnClassifier

modelLogisticRegression = SklearnClassifier(LogisticRegression())
modelLogisticRegression.train(test_set)
nltk.classify.accuracy(modelLogisticRegression, test_set)

0.9868319132455461

In [76]:
modelDecisionTreeClassifier = SklearnClassifier(DecisionTreeClassifier())
modelDecisionTreeClassifier.train(test_set)
nltk.classify.accuracy(modelDecisionTreeClassifier, test_set)

1.0

In [77]:
modelRandomForestClassifier = SklearnClassifier(RandomForestClassifier())
modelRandomForestClassifier.train(test_set)
nltk.classify.accuracy(modelRandomForestClassifier, test_set)

1.0

In [78]:
modelSVC = SklearnClassifier(svm.SVC())
modelSVC.train(test_set)
nltk.classify.accuracy(modelSVC, test_set)

0.9821843532145623

In [80]:
modelMultinomialNB = SklearnClassifier(MultinomialNB())
modelMultinomialNB.train(test_set)
nltk.classify.accuracy(modelMultinomialNB, test_set)

0.919442292796282

In [81]:
modelBernoulliNB = SklearnClassifier(BernoulliNB())
modelBernoulliNB.train(test_set)
nltk.classify.accuracy(modelBernoulliNB, test_set)

0.9085979860573199

In [82]:
from nltk import ClassifierI
from scipy.stats import mode

class VoteClassifier(ClassifierI):
    def __init__(self, classifiers):
        self.classifiers = classifiers

    def classify(self,features):
        votes = []

        for c in self.classifiers:
            votes.append(c.classify(features))

        return mode(votes)[0]

    def confidence(self,features):
        votes = []

        for c in self.classifiers:
            votes.append(c.classify(features))

        cnt = votes.count(mode(votes)[0])
        return (cnt/len(votes))*100

In [84]:
voteClassifier = VoteClassifier([modelRandomForestClassifier, modelDecisionTreeClassifier])

In [87]:
nltk.classify.accuracy(voteClassifier, test_set)

array([1.])

In [88]:
joblib.dump(voteClassifier, "model.pkl")

['model.pkl']

In [93]:
import json

with open("allWords.json", "w") as f:
    json.dump({"allWords": wordFeatures}, f)