### Extract movie reviews for any movie from IMDB and perform sentiment analysis

#### Importing Libraries

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import spacy
import pickle
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin 
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import csv
import string

#### Importing Dataset

In [None]:
# Load the review CSV into a DataFrame, decoding the file as ISO-8859-1.
# NOTE(review): the absolute /content path looks environment-specific —
# adjust it when running the notebook elsewhere.
dataset = pd.read_csv(
    '/content/IMDB Dataset.csv',
    encoding='ISO-8859-1',
)

In [None]:
dataset.head() 

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# DataFrame.shape is (number of rows, number of columns), so index 0 is the
# row count and index 1 is the column count.
print(f'Rows: {dataset.shape[0]}\nColumns: {dataset.shape[1]}')

Rows: 2
Columns: 50000


In [None]:
print(f'Columns Names: {list(dataset.columns)}')

Columns Names: ['review', 'sentiment']


#### Text Operation

In [None]:
# Characters treated as punctuation, kept as a single string.
punctuations = string.punctuation
# spaCy's English stop words, materialised as a list.
stopwords = [*STOP_WORDS]
# English language object created directly from the class (not via spacy.load).
nlp = English()

In [None]:
def tokenizer(sentence):
    """Split a sentence into normalised tokens for the vectorizers.

    Each token produced by ``nlp`` is reduced to its lowercased, stripped
    lemma. Tokens whose lemma equals "-PRON-" keep their lowercase surface
    form instead. Tokens found in ``stopwords`` or ``punctuations`` are then
    removed.

    Args:
        sentence: Text to tokenize; it is passed directly to ``nlp``.

    Returns:
        list: The remaining token strings, in their original order.
    """
    normalised = []
    for token in nlp(sentence):
        # A pipeline without a lemmatizer can leave ``lemma_`` empty; fall back
        # to the raw token text so the word is not silently discarded.
        lemma = token.lemma_ or token.text
        if lemma != "-PRON-":
            normalised.append(lemma.lower().strip())
        else:
            normalised.append(token.lower_)
    # ``punctuations`` is a string, so ``in`` is a substring test: besides
    # single punctuation marks it also drops empty strings (for example
    # whitespace tokens that became "" after ``strip``).
    return [
        word for word in normalised
        if word not in stopwords and word not in punctuations
    ]

#### Transformation and Vectorization

In [None]:
class predictors(TransformerMixin):
    """Pipeline step that normalises raw text with ``clean_text``.

    The transformer keeps no state: ``fit`` learns nothing and ``transform``
    applies ``clean_text`` to each input document.
    """

    def transform(self, X, **transform_params):
        """Return a list with ``clean_text`` applied to every item of ``X``."""
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``.

        ``y`` is optional so that ``fit(X)`` and ``fit_transform(X)`` also
        work when no labels are supplied.
        """
        return self

    def get_params(self, deep=True):
        """Return an empty dict: this step exposes no parameters."""
        return {}

#### Basic function to clean the text 

In [None]:
def clean_text(text):
    """Trim leading/trailing whitespace from ``text`` and lowercase the result."""
    trimmed = text.strip()
    return trimmed.lower()

In [None]:
# Unigram count features, tokenized with the custom `tokenizer` function.
vectorizer = CountVectorizer(
    tokenizer=tokenizer,
    ngram_range=(1, 1),
)
# TF-IDF vectorizer built on the same tokenizer.
tfvectorizer = TfidfVectorizer(tokenizer=tokenizer)

#### Split the Dataset

In [None]:
# Inputs are the review texts; targets are the sentiment labels.
X, y = dataset['review'], dataset['sentiment']
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=77,
)

#### Logistic Regression

In [None]:
# With the default iteration budget the solver stops at its limit before
# converging on this data, so allow it more iterations.
classifier = LogisticRegression(max_iter=1000)
# Steps: clean the text -> count-vectorize it -> logistic regression.
# NOTE(review): `vectorizer` is the shared module-level instance, so fitting
# this pipeline (re)fits that same object.
LRmodel = Pipeline([
    ('cleaner', predictors()),
    ('vectorizer', vectorizer),
    ('classifier', classifier),
])

#### Train the Model

In [None]:
LRmodel.fit(X_train,y_train)   


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


#### Predict the test set

In [None]:
LRpred = LRmodel.predict(X_test)

#### Confusion Matrix

In [None]:
cm = confusion_matrix(y_test,LRpred)

In [None]:
cm

array([[4458,  593],
       [ 501, 4448]])

#### Accuracy Score 

In [None]:
ac = accuracy_score(y_test,LRpred) 

In [None]:
ac 

0.8906

#### Classification Report

In [None]:
cr = classification_report(y_test,LRpred) 

In [None]:
cr

'              precision    recall  f1-score   support\n\n    negative       0.90      0.88      0.89      5051\n    positive       0.88      0.90      0.89      4949\n\n    accuracy                           0.89     10000\n   macro avg       0.89      0.89      0.89     10000\nweighted avg       0.89      0.89      0.89     10000\n'

#### random review Test

In [None]:
pre = LRmodel.predict(["Production has an incredibly important place to shoot a series or film. Sometimes even a very minimalist story can reach an incredibly successful point after the right production stages. The Witcher series is far from minimalist. The Witcher is one of the best Middle-earth works in the world. Production quality is essential if you want to handle such a topic successfully."])
print(f'Prediction: {pre[0]}') 

Prediction: positive


### Random Forest

In [None]:
# Seed the forest so its randomised training, and therefore the reported
# metrics, can be reproduced from run to run.
RFclassifier = RandomForestClassifier(n_estimators=200, random_state=77)
# Steps: clean the text -> count-vectorize it -> random forest.
# NOTE(review): `vectorizer` is the shared module-level instance, so fitting
# this pipeline (re)fits that same object.
RFmodel = Pipeline([
    ('cleaner', predictors()),
    ('vectorizer', vectorizer),
    ('classifier', RFclassifier),
])

#### Train the Model

In [None]:
RFmodel.fit(X_train,y_train)   

Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x7f2489bac9e8>),
                ('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 t...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_im

#### Predict the test set

In [None]:
RFpred = RFmodel.predict(X_test)

#### Confusion Matrix 

In [None]:
cm = confusion_matrix(y_test,RFpred)

In [None]:
cm

array([[4360,  691],
       [ 607, 4342]])

#### Accuracy Score

In [None]:
ac = accuracy_score(y_test,RFpred) 

In [None]:
ac

0.8702

#### Classification Report

In [None]:
cr = classification_report(y_test,RFpred) 

In [None]:
cr

'              precision    recall  f1-score   support\n\n    negative       0.88      0.86      0.87      5051\n    positive       0.86      0.88      0.87      4949\n\n    accuracy                           0.87     10000\n   macro avg       0.87      0.87      0.87     10000\nweighted avg       0.87      0.87      0.87     10000\n'

#### random review test

In [None]:
pre = RFmodel.predict(["I think this is my first review. This series is so bad I had to write one. I don't understand the good score. I have tried on 2 separate occasions to watch this show. Haven't even gotten past the 2nd episode because it is SO BORING."])
print(f'Prediction: {pre[0]}')

Prediction: negative


### LinearSVC

In [None]:
# Seed the classifier: with the default dual formulation the solver uses
# random numbers, so a fixed seed makes the fitted model repeatable.
SVCclassifier = LinearSVC(random_state=77)
# Steps: clean the text -> count-vectorize it -> linear SVM.
# NOTE(review): `vectorizer` is the shared module-level instance, so fitting
# this pipeline (re)fits that same object.
SVCmodel = Pipeline([
    ('cleaner', predictors()),
    ('vectorizer', vectorizer),
    ('classifier', SVCclassifier),
])

#### Train the Model

In [None]:
SVCmodel.fit(X_train,y_train)



Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x7f2489067cc0>),
                ('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenizer at 0x7f248c005510>,
                                 vocabulary=None)),
                ('classifier',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                      

#### Predict the test set Result

In [None]:
SVCpred = SVCmodel.predict(X_test)

#### Confusion Matrix

In [None]:
cm = confusion_matrix(y_test,SVCpred)

In [None]:
cm

array([[4364,  687],
       [ 591, 4358]])

#### Accuracy Score

In [None]:
ac = accuracy_score(y_test,SVCpred)

In [None]:
ac

0.8722

#### Classification Report

In [None]:
cr = classification_report(y_test,SVCpred) 

In [None]:
cr

'              precision    recall  f1-score   support\n\n    negative       0.88      0.86      0.87      5051\n    positive       0.86      0.88      0.87      4949\n\n    accuracy                           0.87     10000\n   macro avg       0.87      0.87      0.87     10000\nweighted avg       0.87      0.87      0.87     10000\n'

#### Random Review Test

In [None]:
pre = SVCmodel.predict(["Henry cavill nailed the role perfectly. The fight scenes, the music, the cinematography, the whole atmosphere is beyond amazing. Netflix did it again"])
print(f'Prediction: {pre[0]}')

Prediction: positive


#### Conclusion

The accuracy of all three algorithms is nearly the same; Logistic Regression performs best among them, with a test accuracy of 89.06%.