<a href="https://colab.research.google.com/github/Natebruh1/SentimentAnalysis/blob/main/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import pandas as pd
import nltk
import seaborn as sns
import sklearn
import pickle
import re
import csv
from enum import Enum
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics as metrics
df=pd.read_csv("/content/drive/MyDrive/Content/IMDB Dataset.csv",on_bad_lines='skip') #Movie Review Dataframe

In [None]:


#---NLTK DOWNLOADS---
nltk.download('stopwords')
#print(stopwords.words('english'))
nltk.download('wordnet')



#DATAFRAME WE ARE USING
df=pd.read_csv("/content/drive/MyDrive/Content/IMDB Dataset.csv",on_bad_lines='skip') #Movie Review Dataframe
#print(df.head())
#print(df.isnull().sum())



#---ENUMS---
class WordType(Enum):
    verb=1
    noun=2
class DependencyParseTag(Enum):
    verb=1
    subject=2
    obj=3



#---Classes---

#Sentences
class SentenceObject:
    sentenceData=""
    sentenceTokens=[] #Array of WordToken Objects
    sentenceBigrams=[] #See how natural a sentence is
    def __init__(self, sentence): #Contructor
        self.sentenceData=sentence
    def tokeniseSentence(self): #Convert our sentence Data to Tokens
        for i in self.sentenceData.split():
            self.sentenceTokens.append(WordToken(i))


    def __getitem__(self, key): #Get the specified element from the key of the sentence
        return self.sentenceTokens[key]


    def removeStopWords(self):
        for each in self.sentenceTokens:
            tempStr=str(each) #Convert the word token to a string
            if tempStr in stopwords.words('english'): #If the word is in our list of stop words

                self.sentenceTokens.remove(each) #Remove it from the list
    def __len__(self):
        return len(self.sentenceTokens)
    def __contains__(self,key):
        return key in self.sentenceTokens


#Word Tokens

class WordToken:
    tokenData=""
    def __init__(self,newToken): #Contructor
        self.tokenData=newToken
    def __str__(self): #Allows us to represent the object as a string (For printing)
        return self.tokenData

    def StemToken(self):
        self.tokenData=nltk.stem.PorterStemmer().stem(self.tokenData) #Stem this token using PorterStemmer algorithm
        #Algorithm imported from nltk library
    def LemmatizeToken(self):
        self.tokenData=nltk.stem.WordNetLemmatizer().lemmatize(self.tokenData) #Lemmatize this using the nltk library lemmatizer




#Dependency Parse Tree
# -RootToken
# Subject[]
# Object[]














def main():
    #Encode the sentiment of the dataframe
    label=sklearn.preprocessing.LabelEncoder()
    df["sentiment"] = label.fit_transform(df["sentiment"])
    print(df.head())
    print(df.shape)
    independentData=df["review"]
    ##OOP APPROACH - used for testing but useful for determining more sentiments
    """sentenceObjArray=[]

    for i in range(len(independentData)):
        sentenceObjArray.append(SentenceObject(independentData[i]))
        sentenceObjArray[i].tokeniseSentence()
        for j in range(len(sentenceObjArray[i])):

            sentenceObjArray[i][j].StemToken()
            sentenceObjArray[i][j].LemmatizeToken()
        sentenceObjArray[i].removeStopWords()"""

    sentimentTable=df['sentiment']


    ps=nltk.stem.PorterStemmer()
    finalCorpus=[]
    for i in range(len(independentData)):
        print(i)
        review=re.sub("[^a-zA-Z]", " ", independentData[i])
        review=review.lower()
        review=[ps.stem(word) for word in review if word not in set(stopwords.words("english"))] #Stem the word if its not in the stopwords
        review ="".join(review)
        finalCorpus.append(review) # Add review to Final Corpus


    pickle.dump(finalCorpus, open("/content/drive/MyDrive/Content/corpusData.pkl","wb"))

    """testSentence="The Quick Brown Fox Jumped Over The Corpora Lazy Dog"

    testSentObj=SentenceObject(testSentence)
    #Test Stemming
    testSentObj.tokeniseSentence()
    testSentObj[4].StemToken()
    print(testSentObj[4])

    #Test Lemmatizing
    testSentObj[8].LemmatizeToken()
    print(testSentObj[8])

    #Remove sentence stop words
    testSentObj.removeStopWords()
    print(testSentObj[0])"""
main()


In [15]:
from sklearn.model_selection import train_test_split
#Now we need to turn the text into vectors
finalCorpus=pickle.load(open("/content/drive/MyDrive/Content/corpusData.pkl","rb"))
cv=TfidfVectorizer(max_features=5000)
#for i in range(500):
#  print(finalCorpus[i])
vectorizedMatrix=cv.fit_transform(finalCorpus).toarray()

print(vectorizedMatrix.shape)
label=sklearn.preprocessing.LabelEncoder()
df["sentiment"] = label.fit_transform(df["sentiment"])
sentimentTable=df['sentiment']
vectorizedMatrix_trainingData, vectorizedMatrix_testingData, sentimentTable_trainingData, sentimentTable_testingData = train_test_split(vectorizedMatrix,sentimentTable, test_size=0.1, random_state=101)
print(vectorizedMatrix_trainingData.shape, vectorizedMatrix_testingData.shape, sentimentTable_trainingData.shape, sentimentTable_testingData.shape)


#Use Multinomial naive-bayes model
mnb=MultinomialNB()
#Train the model
mnb.fit(vectorizedMatrix_trainingData,sentimentTable_trainingData)

#Create Prediction
pred=mnb.predict(vectorizedMatrix_testingData)
print(metrics.accuracy_score(sentimentTable_testingData,pred))
print(metrics.confusion_matrix(sentimentTable_testingData,pred))
print(metrics.classification_report(sentimentTable_testingData,pred))
pd.DataFrame(np.c_[sentimentTable_testingData,pred],columns=["Actual","Predicted"])
#Save Prediction
pickle.dump(cv, open("/content/drive/MyDrive/Content/count-vectorizer.pkl","wb"))
pickle.dump(mnb, open("/content/drive/MyDrive/Content/multinomial-naive-bayes.pkl","wb"))

(50000, 5000)
(45000, 5000) (5000, 5000) (45000,) (5000,)
0.8418
[[2061  325]
 [ 466 2148]]
              precision    recall  f1-score   support

           0       0.82      0.86      0.84      2386
           1       0.87      0.82      0.84      2614

    accuracy                           0.84      5000
   macro avg       0.84      0.84      0.84      5000
weighted avg       0.84      0.84      0.84      5000



In [None]:
loaded_cv=pickle.load(open("/content/drive/MyDrive/Content/count-vectorizer.pkl","rb"))
loaded_model=pickle.load(open("/content/drive/MyDrive/Content/multinomial-naive-bayes.pkl","rb"))

def test_sentence(inp):
  sen=loaded_cv.transform([inp]).toarray()
  res = loaded_model.predict(sen)[0]
  if res==1:
    return True
  else:
    return False

print(test_sentence("Good movie great job"))





False
