<a href="https://colab.research.google.com/github/HjayejMohamed/NLP/blob/main/imdb_movie_reviews_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [188]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from string import punctuation

In [232]:
df_imdb=pd.read_table('/content/imdb_labelled.txt')

In [233]:
df_imdb.head()

Unnamed: 0,"A very, very, very slow-moving, aimless movie about a distressed, drifting young man.",0
0,Not sure who was more lost - the flat characte...,0
1,Attempting artiness with black & white and cle...,0
2,Very little music or anything to speak of.,0
3,The best scene in the movie was when Gerardo i...,1
4,"The rest of the movie lacks art, charm, meanin...",0


In [261]:
df_imdb.shape

(747, 2)

In [236]:
df_imdb.columns=['message','target']

In [237]:
df_imdb.head()

Unnamed: 0,message,target
0,Not sure who was more lost - the flat characte...,0
1,Attempting artiness with black & white and cle...,0
2,Very little music or anything to speak of.,0
3,The best scene in the movie was when Gerardo i...,1
4,"The rest of the movie lacks art, charm, meanin...",0


In [238]:
df_imdb.isna().sum()

message    0
target     0
dtype: int64

# I.Data Cleaning:

I.1 Removing Stopwords
I.2 Removing Punctuations:

In [239]:
parser=English()
nlp=spacy.load('en_core_web_sm')
stopwords=list(STOP_WORDS)
punctuations=punctuation

In [240]:
def cleaner(text):
  words=parser(nlp(text))
  words=[word.lemma_.lower().strip() for word in words if word.is_stop==False and word.is_punct==False and word.is_space==False]
  return words

In [241]:
words=clean(df_imdb.message[0])
words

['sure', 'lose', 'flat', 'character', 'audience', 'nearly', 'half', 'walk']

# Machine learning with SKlearn

In [262]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [263]:
vectorizer = CountVectorizer(tokenizer = cleaner, ngram_range=(1,1))
vectorizer2 = TfidfVectorizer(tokenizer = cleaner) 
classifier = LinearSVC()

In [264]:
corpus = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]
x=vectorizer2.fit_transform(list('This is the first document.'))
x.toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0.,

In [272]:
pipe=Pipeline([
  ('vectorizer',vectorizer2),
  ('classifier', classifier)
])

## Splitting

In [246]:
# Features and Labels
X = df_imdb['message']
ylabels = df_imdb['target']

In [247]:
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=42)

In [280]:
X_train.shape

(597,)

## Training

In [273]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function cleaner at 0x7f5ed2e71a70>)),
                ('classifier', LinearSVC())])

## Evaluation

In [274]:
# Predicting with a test dataset
sample_prediction = pipe.predict(X_test)

In [275]:
# Accuracy
print("Accuracy Train: ",pipe.score(X_train,y_train))
print("Accuracy Test: ",pipe.score(X_test,y_test))

Accuracy Train:  0.9899497487437185
Accuracy Test:  0.72


In [276]:
# Another random review
pipe.predict(["This was a great movie"])

array([1])

In [277]:
pipe.predict(["A very, very, very slow-moving, aimless movie about a distressed, drifting young man."])

array([0])

In [278]:
#remove "word=slow"
pipe.predict(["A very, very, very -moving, aimless movie about a distressed, drifting young man."])

array([1])

In [279]:
print(classification_report(y_test,sample_prediction))

              precision    recall  f1-score   support

           0       0.81      0.60      0.69        78
           1       0.66      0.85      0.74        72

    accuracy                           0.72       150
   macro avg       0.74      0.72      0.72       150
weighted avg       0.74      0.72      0.72       150



# Training 2

In [266]:
BC=BaggingClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy'))

In [267]:
pipe2=Pipeline([
  ('vectorizer',vectorizer2),
  ('classifier', BC)
])

In [268]:
pipe2.fit(X_train,y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function cleaner at 0x7f5ed2e71a70>)),
                ('classifier',
                 BaggingClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy')))])

In [269]:
# Predicting with a test dataset
sample_prediction2 = pipe2.predict(X_test)

In [270]:
# Accuracy
print("Accuracy Train: ",pipe2.score(X_train,y_train))
print("Accuracy Test: ",pipe2.score(X_test,y_test))

Accuracy Train:  0.9865996649916248
Accuracy Test:  0.6933333333333334


In [271]:
print(classification_report(y_test,sample_prediction2))

              precision    recall  f1-score   support

           0       0.77      0.59      0.67        78
           1       0.64      0.81      0.72        72

    accuracy                           0.69       150
   macro avg       0.71      0.70      0.69       150
weighted avg       0.71      0.69      0.69       150

