# Dependecies

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# collecting data
df=pd.read_csv('movie_review.csv')

In [3]:
df.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos


In [4]:
# checking for null values
df.isna().sum()

fold_id    0
cv_tag     0
html_id    0
sent_id    0
text       0
tag        0
dtype: int64

In [5]:
df_req=df.drop(['fold_id','cv_tag','html_id','sent_id'],axis=1)

In [6]:
df_req.head()

Unnamed: 0,text,tag
0,films adapted from comic books have had plenty...,pos
1,"for starters , it was created by alan moore ( ...",pos
2,to say moore and campbell thoroughly researche...,pos
3,"the book ( or "" graphic novel , "" if you will ...",pos
4,"in other words , don't dismiss this film becau...",pos


In [7]:
# converting tag into 0/1
le=LabelEncoder()

In [8]:
df_req['tag']=pd.DataFrame(le.fit_transform(df_req['tag']))

In [9]:
df_req.head()

Unnamed: 0,text,tag
0,films adapted from comic books have had plenty...,1
1,"for starters , it was created by alan moore ( ...",1
2,to say moore and campbell thoroughly researche...,1
3,"the book ( or "" graphic novel , "" if you will ...",1
4,"in other words , don't dismiss this film becau...",1


In [10]:
# data is balanced
df_req['tag'].value_counts()

1    32937
0    31783
Name: tag, dtype: int64

## Dependecies for preprocessing

In [14]:
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
stopwords_list=stopwords.words('english')
print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [16]:
# not so important symbols
wl=list(string.punctuation)

In [17]:
wl.append(stopwords_list)
print(wl)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when

# Preprocessing

In [18]:
def preprocess(texts):
    result_texts = []
    for text in texts:
        words = text.split()
        filtered_words = [word for word in words if word not in wl]
        result_texts.append(' '.join(filtered_words))
    return result_texts


In [19]:
df_req['text']=pd.DataFrame(preprocess(df_req['text']))

# Stemming

In [20]:
from nltk.stem import PorterStemmer

In [21]:
ps=PorterStemmer()

In [22]:
def to_root_word(text):
    result=[]
    for line in text:
        result.append(" ".join(ps.stem(word) for word in line.split()))
    return result

In [23]:
df_req['text']=pd.DataFrame(to_root_word(df_req['text']))

In [24]:
X=df_req['text']
y=df_req['tag']

### Splitting Data into train test split

In [25]:
X_train,X_test2,y_train,y_test2=train_test_split(X,y,test_size=0.3,random_state=42)
X_cv,X_test,y_cv,y_test=train_test_split(X_test2,y_test2,test_size=0.1,random_state=42)

In [26]:
print(X_train.shape)
print(X_cv.shape)
print(X_test.shape)

(45304,)
(17474,)
(1942,)


## Bag of words

In [27]:
cv=CountVectorizer()

In [28]:
X_train_cv=cv.fit_transform(X_train)

In [29]:
X_train_cv

<45304x25074 sparse matrix of type '<class 'numpy.int64'>'
	with 801059 stored elements in Compressed Sparse Row format>

In [30]:
X_cv_cv=cv.transform(X_cv)
X_test_cv=cv.transform(X_test)

In [31]:
dir(cv)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_validate_data',
 '_validate_ngram_range',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',
 '_white_spaces',
 '_word_ngrams',
 'analyzer',
 'binary',
 'build_analyzer',
 'build_preprocessor',
 'bui

In [32]:
cv.vocabulary_

{'even': 7501,
 'the': 22132,
 'intellectu': 11329,
 'know': 12293,
 'don': 6471,
 'bother': 2911,
 'with': 24613,
 'thi': 22190,
 'kind': 12193,
 'of': 15588,
 'pointless': 16895,
 'and': 1202,
 'pretenti': 17212,
 'yammer': 24846,
 'togeth': 22439,
 'other': 15825,
 'young': 24930,
 'director': 6183,
 'hi': 10299,
 'new': 15135,
 'hollywood': 10468,
 'gener': 8953,
 'like': 12902,
 'kauffman': 12031,
 'carpent': 3712,
 'hill': 10340,
 'milliu': 14276,
 'he': 10070,
 'exploit': 7651,
 'great': 9486,
 'creativ': 5189,
 'freedom': 8588,
 '1970': 151,
 'when': 24399,
 'mainstream': 13462,
 'produc': 17311,
 'dare': 5564,
 'to': 22423,
 'experi': 7634,
 'their': 22143,
 'song': 20576,
 'are': 1470,
 'alright': 1059,
 'but': 3417,
 'they': 22189,
 'play': 16803,
 'lifeless': 12875,
 'adapt': 726,
 'music': 14861,
 'video': 23871,
 'thing': 22202,
 'chang': 3944,
 'devis': 6027,
 'cun': 5382,
 'plan': 16777,
 'while': 24417,
 'do': 6404,
 'it': 11530,
 'stumbl': 21318,
 'on': 15663,
 'sympa

## Building model

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [34]:
model1=LogisticRegression(max_iter=1000)

In [35]:
model1.fit(X_train_cv,y_train)

In [36]:
pred_cv=model1.predict(X_cv_cv)

### Cross-Validation Score for model1

In [37]:
print(classification_report(pred_cv,y_cv))

              precision    recall  f1-score   support

           0       0.70      0.68      0.69      8733
           1       0.69      0.70      0.70      8741

    accuracy                           0.69     17474
   macro avg       0.69      0.69      0.69     17474
weighted avg       0.69      0.69      0.69     17474



In [38]:
#  Example test case
ex=["films adapted from comic batman superman spawn or geared toward kids casper or the arthouse crowd ghost world but there's never really been a comic book like from hell before","The Movie was great","The movie called Twilight hurted some emotions of people and the ending was not liked by many"]
ex=preprocess(ex)
ex_cv=cv.transform(ex)
model1.predict(ex_cv)

array([0, 0, 1])

### Test Score for model1

In [39]:
pred_test=model1.predict(X_test_cv)
print(classification_report(pred_test,y_test))

              precision    recall  f1-score   support

           0       0.69      0.72      0.70       968
           1       0.71      0.67      0.69       974

    accuracy                           0.70      1942
   macro avg       0.70      0.70      0.70      1942
weighted avg       0.70      0.70      0.70      1942



In [40]:
from sklearn.naive_bayes import MultinomialNB

In [41]:
model2=MultinomialNB()

In [42]:
model2.fit(X_train_cv,y_train)

### Cross-Validation Score for model2

In [43]:
p_cv=model2.predict(X_cv_cv)
print(classification_report(p_cv,y_cv))

              precision    recall  f1-score   support

           0       0.71      0.69      0.70      8814
           1       0.70      0.72      0.71      8660

    accuracy                           0.71     17474
   macro avg       0.71      0.71      0.71     17474
weighted avg       0.71      0.71      0.71     17474



### Test Score for model2

In [44]:
p_test=model2.predict(X_test_cv)
print(classification_report(p_test,y_test))

              precision    recall  f1-score   support

           0       0.69      0.72      0.70       975
           1       0.70      0.67      0.69       967

    accuracy                           0.70      1942
   macro avg       0.70      0.70      0.70      1942
weighted avg       0.70      0.70      0.70      1942



#### Example Testing

In [45]:
ex=["Incredible movie! The plot was engaging, and the performances were outstanding. I highly recommend it.",
"A masterpiece! The cinematography and soundtrack were phenomenal. I can't wait to watch it again.",
"Absolutely loved every minute of this film. The characters were relatable, and the story was heartwarming.",
"This movie is a gem! The humor, drama, and action were perfectly balanced. A definite must-see!",
"An emotional rollercoaster! The storytelling was powerful, and the ending left me with a sense of fulfillment.",
   "Disappointing movie. The plot was weak, and the acting felt forced. I expected more from such a hyped film.",
"Not worth the hype. The story lacked coherence, and the characters were forgettable. I left the theater unsatisfied.",
"Boring from start to finish. The film failed to engage me, and I found myself checking the time constantly.",
"Poorly executed. The dialogue was cringe-worthy, and the plot twists felt forced. I wouldn't recommend it.",
"Regret watching this one. The storytelling was confusing, and the ending was unsatisfying. Save your time and skip it."]
ex=pd.DataFrame(ex,columns=['text'])
ex['text']=pd.DataFrame(preprocess(ex['text']))
ex['text']=pd.DataFrame(to_root_word(ex['text']))
ex_cv=cv.transform(ex['text'])

In [46]:
model2.predict(ex_cv)

array([1, 1, 0, 1, 1, 0, 0, 0, 0, 0])

In [47]:
model1.predict(ex_cv)

array([1, 1, 0, 1, 1, 0, 0, 0, 0, 0])