# Import Libraries

In [60]:
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV,train_test_split,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

import itertools

# Silence warnings for the rest of the session: no category or module filter
# is given, so every warning is suppressed.
# NOTE(review): this presumably also hides solver convergence warnings from the
# models fitted further down — consider narrowing the filter; confirm first.
import warnings
warnings.filterwarnings('ignore')

# Fit

In [61]:
# Two toy sentences used to demonstrate the bag-of-words representation.
phrases = [
    " The quick brow fox jumped over the lazy dog",
    "education is what you have left over after forgetting everything",
]

In [62]:
# Build a default bag-of-words vectorizer and learn its vocabulary from the
# toy phrases. The fit call stays the last expression so the cell still
# displays the fitted estimator.
vect = CountVectorizer()
vect.fit(raw_documents=phrases)

In [63]:
# Report how many distinct tokens were learned and the token -> column mapping.
print(
    f"Vocabulary size : {len(vect.vocabulary_)}",
    f"Vocabulary content : {vect.vocabulary_}",
    sep="\n",
)

Vocabulary size : 17
Vocabulary content : {'the': 14, 'quick': 13, 'brow': 1, 'fox': 6, 'jumped': 9, 'over': 12, 'lazy': 10, 'dog': 2, 'education': 3, 'is': 8, 'what': 15, 'you': 16, 'have': 7, 'left': 11, 'after': 0, 'forgetting': 5, 'everything': 4}


In [64]:
# Encode the toy phrases as token counts using the vocabulary learned above.
bag_of_words = vect.transform(raw_documents=phrases)

In [65]:
# Each printed entry reads "(document index, vocabulary index)  count", i.e.
# which phrase, which token column, and how many times that token occurs.
print(bag_of_words)

  (0, 1)	1
  (0, 2)	1
  (0, 6)	1
  (0, 9)	1
  (0, 10)	1
  (0, 12)	1
  (0, 13)	1
  (0, 14)	2
  (1, 0)	1
  (1, 3)	1
  (1, 4)	1
  (1, 5)	1
  (1, 7)	1
  (1, 8)	1
  (1, 11)	1
  (1, 12)	1
  (1, 15)	1
  (1, 16)	1


In [66]:
# Tokens ordered by column index, so position i names column i of bag_of_words.
vect.get_feature_names_out()

array(['after', 'brow', 'dog', 'education', 'everything', 'forgetting',
       'fox', 'have', 'is', 'jumped', 'lazy', 'left', 'over', 'quick',
       'the', 'what', 'you'], dtype=object)

# Applying it to IMDB Dataset

In [67]:
# Load the labelled reviews; the file is tab-separated despite its .csv name.
data = pd.read_csv("labeledTrainData.csv", sep="\t")

In [68]:
# Preview the first rows to check the columns were parsed as expected.
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [69]:
# Count reviews per sentiment label to check the class balance.
data['sentiment'].value_counts()

sentiment
1    12500
0    12500
Name: count, dtype: int64

# Splitting

## Vectorize

In [70]:
# Target labels: the sentiment column of the loaded frame.
y = data['sentiment']

# Re-fit the same vectorizer on the review text (replacing the toy vocabulary)
# and encode every review as a sparse row of token counts.
# NOTE(review): the vocabulary is learned from all reviews, including those
# that later land in the test split; fitting on the training reviews only
# would keep the held-out set fully unseen.
X = vect.fit_transform(raw_documents=data['review'])

In [71]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

In [72]:
# Show the held-out matrix summary (shape, dtype, number of stored entries).
test_x

<5000x74849 sparse matrix of type '<class 'numpy.int64'>'
	with 694658 stored elements in Compressed Sparse Row format>

# Inspect the Vocabulary

In [73]:
# Column-index -> token lookup taken from the vectorizer's current vocabulary.
feature_names = vect.get_feature_names_out()

In [74]:
# Peek at the first 20 (token -> column index) pairs without copying the
# whole vocabulary.
{token: column for token, column in itertools.islice(vect.vocabulary_.items(), 20)}

{'with': 73342,
 'all': 2662,
 'this': 66562,
 'stuff': 63783,
 'going': 27963,
 'down': 19854,
 'at': 4753,
 'the': 66339,
 'moment': 43526,
 'mj': 43300,
 've': 70920,
 'started': 62903,
 'listening': 38991,
 'to': 67125,
 'his': 31095,
 'music': 44529,
 'watching': 72259,
 'odd': 46634,
 'documentary': 19380,
 'here': 30670}

## Logistic Regression

In [75]:
# 5-fold cross-validation of a default logistic regression on the training
# split only, so the test split stays untouched for the final check.
# NOTE(review): warnings are suppressed in this notebook, so a solver that
# stops at its default iteration limit would go unnoticed here — confirm.
scores = cross_val_score(
    estimator=LogisticRegression(),
    X=train_x,
    y=train_y,
    cv=5,
)
print(f"Mean Cross-Validation accuracy {round(np.mean(scores),5)}")

Mean Cross-Validation accuracy 0.8751


In [76]:
# Fit a default logistic regression on the training split, then compare
# accuracy on seen vs. held-out rows to gauge overfitting.
logreg = LogisticRegression().fit(train_x, train_y)
print(
    f"Training accuracy score : {logreg.score(train_x,train_y)}",
    f"Testing accuracy score : {logreg.score(test_x,test_y)}",
    sep="\n",
)

Training accuracy score : 0.9777
Testing accuracy score : 0.8806


# Confusion Matrix

In [78]:
# Predict the held-out reviews and tabulate the errors.
# Rows are the true labels and columns the predicted labels, so with labels
# 0/1 the layout is [[TN, FP], [FN, TP]].
pred_logreg = logreg.predict(test_x)
confusion = confusion_matrix(y_true=test_y, y_pred=pred_logreg)
print(confusion)

[[2199  333]
 [ 264 2204]]


# Multinomial Naive Bayes

In [79]:
# Fit a default multinomial naive Bayes model on the same count features and
# report accuracy on the training and held-out splits.
nb = MultinomialNB().fit(train_x, train_y)

print(
    f"Training Score : {nb.score(train_x,train_y)}",
    f"Testing Score : {nb.score(test_x,test_y)}",
    sep="\n",
)


Training Score : 0.90255
Testing Score : 0.8598
