In [19]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report,accuracy_score

In [20]:
imbd_movie_review = pd.read_csv("/Volumes/DriveB/NLP_Learning/Movie Review Prediction/IMDB Dataset.csv")
imbd_movie_review

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [21]:
imbd_movie_review['sentiment'] = imbd_movie_review['sentiment'].map({'negative':0,'positive': 1})
imbd_movie_review['sentiment'] 

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64

In [22]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#Downloading necessary nltk data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')


#Initializing thre lemitizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mahjabeenmohiuddin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mahjabeenmohiuddin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mahjabeenmohiuddin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
def clean_text(text): 
    if isinstance(text, str): 
      text = text.lower()
      text = re.sub(pattern = '[^a-zA-Z]', repl = ' ', string =text)
      text = re.sub(pattern= '\s+', repl = ' ', string= text).strip()
      text = re.sub(r'[  \t]+$', '', text)  
      cleaned_tokens = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

      cleaned_text = ' '.join(cleaned_tokens)
      return cleaned_text
    else:
       return "" 

In [24]:
imbd_movie_review['cleaned_review'] = imbd_movie_review['review'].apply(clean_text)


print(imbd_movie_review[['review', 'cleaned_review']].head())

                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  
0  one reviewer mentioned watching oz episode hoo...  
1  wonderful little production br br filming tech...  
2  thought wonderful way spend time hot summer we...  
3  basically family little boy jake think zombie ...  
4  petter mattei love time money visually stunnin...  


In [26]:
imbd_movie_review.isna().sum()

review            0
sentiment         0
cleaned_review    0
dtype: int64

In [27]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(imbd_movie_review['cleaned_review'])
X

<50000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 3986304 stored elements in Compressed Sparse Row format>

In [28]:
y = imbd_movie_review['sentiment']
y

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64

#### Splitting the data into train and test sets

In [29]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)

In [30]:
print("The dimension of X_train is:", X_train.shape)
print("The dimension of X_test is:", X_test.shape)
print("The dimension of y_train is:", y_train.shape)
print("The dimension of y_test is:", y_test.shape)

The dimension of X_train is: (40000, 5000)
The dimension of X_test is: (10000, 5000)
The dimension of y_train is: (40000,)
The dimension of y_test is: (10000,)


### Instantiation of class and Training model

In [31]:
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

### Evaluating Model

In [32]:
train_predict = rfc.predict(X_train)
print("The accuracy score, precision, recall,f1-score,  and support scores of training set are:\n")
print(f"The accuracy score is:, {accuracy_score(y_train_, train_predict_):.4f}")
print(f"{classification_report(y_train_, train_predict_, target_names=['positive', 'negative'])}")

The accuracy score, precision, recall,f1-score,  and support scores of training set are:

The accuracy score is:, 1.0000
              precision    recall  f1-score   support

    positive       1.00      1.00      1.00     20039
    negative       1.00      1.00      1.00     19961

    accuracy                           1.00     40000
   macro avg       1.00      1.00      1.00     40000
weighted avg       1.00      1.00      1.00     40000



In [33]:
test_predict = rfc.predict(X_test)
print("The accuracy score, precision, recall, f1-score, and support scores of test set are:\n")
print(f"The accuracy score is:, {accuracy_score(y_test_,test_predict_):.4f}")
print(f"{classification_report(y_test_, test_predict_, target_names = ['Positive', 'Negative'])}")

The accuracy score, precision, recall, f1-score, and support scores of test set are:

The accuracy score is:, 0.8524
              precision    recall  f1-score   support

    Positive       0.84      0.86      0.85      4961
    Negative       0.86      0.84      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



# Result of Trained model is that model is overfitting

### Testing the model on prediction of sample revies

In [34]:
sample_indices = np.random.choice(X_test.shape[0], 5, replace=False)


sample_reviews = X_test[sample_indices]
sample_true_labels = y_test.iloc[sample_indices]

sample_reviews_cleaned = [" ".join(vectorizer.inverse_transform(sample_reviews[i])[0]) for i in range(sample_reviews.shape[0])]
sample_reviews_vectorized = vectorizer.transform(sample_reviews_cleaned)

sample_predictions = rfc.predict(sample_reviews_vectorized)

for idx, (index, review, true_label, predicted_label) in enumerate(zip(sample_indices, sample_reviews_cleaned, sample_true_labels, sample_predictions)):
    print(f"Sample {idx+1}:")
    print(f"Review: {review}")
    print(f"True Label: {'Positive' if true_label == 1 else 'Negative'}")
    print(f"Predicted Label: {'Positive' if predicted_label == 1 else 'Negative'}")
    print(f"Match: {true_label == predicted_label}")
    print()

Sample 1:
Review: gifted succeeds demon imagined cruel ben miserable theatre necessarily joan ambitious overly sleazy released producer often obnoxious flaw supporting husband figure obviously throughout edge without opening alone unlike directed drunk tough fascinating want part actress night although john especially someone except could quite take self secret long early try least also almost let sure role eye performance last good stage director seems stunning playing real film make fighting woman love fully character dialogue way life see truly actor give time production well developed around would never go word
True Label: Positive
Predicted Label: Positive
Match: True

Sample 2:
Review: needle remotely contemporary passionate interested martin american moment general century period genre masterpiece hour summary dull anyone cinema running perspective huge series eye seen best director film even rather time say never many br
True Label: Positive
Predicted Label: Negative
Match: Fal