In [1]:
# pnadas to read data frames
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np 

# for plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


# for NLP
import re #regular expressions
import nltk 
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stop_words = stopwords.words('english')
import string

# for train test split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Supervised learning algorithms  
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# For algorithm evalution
from sklearn import metrics
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mehbo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# read data file
movie_df = pd.read_csv('IMDBDataset.csv')

In [3]:
# remove duplicates
movie_df['dup'] = movie_df.duplicated(subset=None, keep='first')
del movie_df['dup']

In [4]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Initialize stopwords and lemmatizer
sw = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# Define the preprocessing function
def preprocess(text):
    
    text = text.lower() # Convert to lowercase
    
    # Replace everything except a-z, A-Z, ".", "?", "!", "," with space
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)

    # Remove URLs
    text = re.sub(r"http\S+", "", text)

    # Remove HTML tags
    html = re.compile(r'<.*?>')
    text = html.sub(r'', text)

    # Remove punctuations
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^,' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p, '')

    # Remove stopwords
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    
    # Lemmatize the words and remove those in the custom stopwords list (if any)
    text = [lemmatizer.lemmatize(word) for word in text]
    text = " ".join(text) # Joining back into a string

    # Remove emojis
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) # Remove emojis
    
    return text


In [5]:
# Apply preprocessing to review column
movie_df['prep'] = movie_df['review'].apply(lambda x: preprocess(x)) 
del movie_df['review'] # remove review column


In [6]:
# Tokenize the preprocessed text
movie_df['tokens'] = movie_df['prep'].apply(lambda x: re.findall(r"[\w']+", x))

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Assuming movie_df['prep'] contains preprocessed text and movie_df['sentiment'] the labels
X_train, X_test, y_train, y_test = train_test_split(
    movie_df['prep'].values, 
    movie_df['sentiment'].values, 
    test_size=0.2, 
    random_state=42, 
    stratify=movie_df['sentiment'].values
)


In [8]:
count_vectorizer = CountVectorizer()
bow_train_vectors = count_vectorizer.fit_transform(X_train)
bow_test_vectors = count_vectorizer.transform(X_test)


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# BoW
classifier_bow = LogisticRegression()
classifier_bow.fit(bow_train_vectors, y_train)
y_pred_bow = classifier_bow.predict(bow_test_vectors)
print(classification_report(y_test, y_pred_bow))
print("Accuracy with BoW:", accuracy_score(y_test, y_pred_bow))

              precision    recall  f1-score   support

    negative       0.89      0.88      0.89      5000
    positive       0.88      0.89      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Accuracy with BoW: 0.8877


In [10]:
from sklearn.naive_bayes import MultinomialNB

classifier_bow = MultinomialNB()
classifier_bow.fit(bow_train_vectors, y_train)
y_pred_bow = classifier_bow.predict(bow_test_vectors)
print(classification_report(y_test, y_pred_bow))
print("Accuracy with BoW:", accuracy_score(y_test, y_pred_bow))


              precision    recall  f1-score   support

    negative       0.85      0.88      0.86      5000
    positive       0.88      0.84      0.86      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Accuracy with BoW: 0.8617


In [11]:
from sklearn.neighbors import KNeighborsClassifier

classifier_bow = KNeighborsClassifier()
classifier_bow.fit(bow_train_vectors, y_train)
y_pred_bow = classifier_bow.predict(bow_test_vectors)
print(classification_report(y_test, y_pred_bow))
print("Accuracy with BoW:", accuracy_score(y_test, y_pred_bow))

              precision    recall  f1-score   support

    negative       0.59      0.68      0.63      5000
    positive       0.63      0.54      0.58      5000

    accuracy                           0.61     10000
   macro avg       0.61      0.61      0.61     10000
weighted avg       0.61      0.61      0.61     10000

Accuracy with BoW: 0.6086


In [12]:
from sklearn.tree import DecisionTreeClassifier

classifier_bow = DecisionTreeClassifier()
classifier_bow.fit(bow_train_vectors, y_train)
y_pred_bow = classifier_bow.predict(bow_test_vectors)
print(classification_report(y_test, y_pred_bow))
print("Accuracy with BoW:", accuracy_score(y_test, y_pred_bow))

              precision    recall  f1-score   support

    negative       0.73      0.73      0.73      5000
    positive       0.73      0.73      0.73      5000

    accuracy                           0.73     10000
   macro avg       0.73      0.73      0.73     10000
weighted avg       0.73      0.73      0.73     10000

Accuracy with BoW: 0.7305


In [13]:
from sklearn.ensemble import RandomForestClassifier

classifier_bow = RandomForestClassifier()
classifier_bow.fit(bow_train_vectors, y_train)
y_pred_bow = classifier_bow.predict(bow_test_vectors)
print(classification_report(y_test, y_pred_bow))
print("Accuracy with BoW:", accuracy_score(y_test, y_pred_bow))


              precision    recall  f1-score   support

    negative       0.86      0.86      0.86      5000
    positive       0.86      0.86      0.86      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Accuracy with BoW: 0.8613
