## Compare the DistilBERT model performance with other models such as TF-IDF and Word2Vec

### Load the dataset and rename columns

In [39]:
from datasets import load_dataset
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re


dataset = load_dataset("imdb")

df_train = pd.DataFrame(dataset['train']) # Use the train split for training
df_test = pd.DataFrame(dataset['test']) # Use the test split for evaluation

df_train = df_train[['text', 'label']] # Select only the text and label columns
df_train.columns = ['review', 'sentiment'] # Rename columns to match the required format

df_test = df_test[['text', 'label']] # Select only the text and label columns
df_test.columns = ['review', 'sentiment'] # Rename columns to match the required format

### Tokenize using regular expression and nltk stopwords

In [40]:
nltk.download("punkt")
nltk.download("stopwords")

stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text) # Remove punctuation
    tokens = word_tokenize(text) # Tokenize the text
    tokens = [word for word in tokens if word not in stop_words] # Remove stop words
    return tokens, ' '.join(tokens)

df_train['tokens'], df_train['cleaned_text'] = zip(*df_train['review'].apply(preprocess_text))
df_test['tokens'], df_test['cleaned_text'] = zip(*df_test['review'].apply(preprocess_text))


[nltk_data] Downloading package punkt to /Users/ashish/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ashish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:

df_train.head()

Unnamed: 0,review,sentiment,tokens,cleaned_text
0,I rented I AM CURIOUS-YELLOW from my video sto...,0,"[rented, curious, yellow, video, store, contro...",rented curious yellow video store controversy ...
1,"""I Am Curious: Yellow"" is a risible and preten...",0,"[curious, yellow, risible, pretentious, steami...",curious yellow risible pretentious steaming pi...
2,If only to avoid making this type of film in t...,0,"[avoid, making, type, film, future, film, inte...",avoid making type film future film interesting...
3,This film was probably inspired by Godard's Ma...,0,"[film, probably, inspired, godard, masculin, f...",film probably inspired godard masculin féminin...
4,"Oh, brother...after hearing about this ridicul...",0,"[oh, brother, hearing, ridiculous, film, umpte...",oh brother hearing ridiculous film umpteen yea...


In [42]:

df_test.head()

Unnamed: 0,review,sentiment,tokens,cleaned_text
0,I love sci-fi and am willing to put up with a ...,0,"[love, sci, fi, willing, put, lot, sci, fi, mo...",love sci fi willing put lot sci fi movies tv u...
1,"Worth the entertainment value of a rental, esp...",0,"[worth, entertainment, value, rental, especial...",worth entertainment value rental especially li...
2,its a totally average film with a few semi-alr...,0,"[totally, average, film, semi, alright, action...",totally average film semi alright action seque...
3,STAR RATING: ***** Saturday Night **** Friday ...,0,"[star, rating, saturday, night, friday, night,...",star rating saturday night friday night friday...
4,"First off let me say, If you haven't enjoyed a...",0,"[first, let, say, enjoyed, van, damme, movie, ...",first let say enjoyed van damme movie since bl...


### Setup TF-IDF Vectorizer for the dataset

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=10000)

X_tfidf_train = tfidf_vectorizer.fit_transform(df_train['cleaned_text'])
X_tfidf_test = tfidf_vectorizer.transform(df_test['cleaned_text'])

y_train = df_train['sentiment']
y_test = df_test['sentiment']

### Setup Word2Vec Embedding Model for the dataset

In [44]:
from gensim.models import Word2Vec
import numpy as np

w2v_model = Word2Vec(sentences=df_train['tokens'], vector_size=100, window=5, min_count=1, workers=4)

def get_w2v_embedding(tokens, model, vector_size=100):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

# Create embeddings for train and test
X_w2v_train = np.array([get_w2v_embedding(tokens, w2v_model, 100) for tokens in df_train['tokens']])
X_w2v_test = np.array([get_w2v_embedding(tokens, w2v_model, 100) for tokens in df_test['tokens']])

### Train the TF-IDF and Word2Vec Model and print their accuracy on the test dataset

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# TF-IDF Model
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_tfidf_train, y_train)
y_pred_tfidf = clf_tfidf.predict(X_tfidf_test)
print("TF-IDF Model Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print("Classification Report (TF-IDF):\n", classification_report(y_test, y_pred_tfidf))

# Word2Vec Model
clf_w2v = LogisticRegression(max_iter=1000)
clf_w2v.fit(X_w2v_train, y_train)
y_pred_w2v = clf_w2v.predict(X_w2v_test)
print("Word2Vec Model Accuracy:", accuracy_score(y_test, y_pred_w2v))
print("Classification Report (Word2Vec):\n", classification_report(y_test, y_pred_w2v))

TF-IDF Model Accuracy: 0.88296
Classification Report (TF-IDF):
               precision    recall  f1-score   support

           0       0.88      0.88      0.88     12500
           1       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

Word2Vec Model Accuracy: 0.81048
Classification Report (Word2Vec):
               precision    recall  f1-score   support

           0       0.81      0.81      0.81     12500
           1       0.81      0.81      0.81     12500

    accuracy                           0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000

