In [51]:
import os
import pickle
import re
import string

import gensim
import gensim.downloader as api
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [7]:
# Load the pretrained "word2vec-google-news-300" vectors through gensim's downloader API.
# The resulting object is reused below for analogy queries and as a feature source.
wv_pretrained = api.load("word2vec-google-news-300")



In [8]:
# Analogy query: rank vocabulary words by similarity to "king" + "woman" - "man"
# (words in `positive` pull the query toward them, words in `negative` push it away).
wv_pretrained.most_similar(positive=["king", "woman"], negative=["man"])


[('queen', 0.7118193507194519),
 ('monarch', 0.6189674735069275),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377322435379028),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134939193726),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411403656006)]

In [10]:
# Query: words close to "fruit" and "banana" while far from "red".
wv_pretrained.most_similar(positive=["fruit", "banana"], negative=["red"])

[('mango', 0.6317152380943298),
 ('bananas', 0.6275610327720642),
 ('mangoes', 0.6114541888237),
 ('citrus_fruit', 0.59001624584198),
 ('fruits', 0.5772441625595093),
 ('avocados', 0.5595457553863525),
 ('Fruit', 0.5569567084312439),
 ('pineapples', 0.5541747808456421),
 ('Cavendish_bananas', 0.55269855260849),
 ('melon', 0.5473435521125793)]

In [19]:
# Query: words close to "Paris" and "France" while far from "Germany".
# NOTE(review): the usual capital-city analogy is positive=["Paris", "Germany"],
# negative=["France"] (Paris - France + Germany); as written this query is not that
# analogy — confirm which one was intended.
wv_pretrained.most_similar(positive=["Paris", "France"], negative=["Germany"])

[('Parisian', 0.6453418731689453),
 ('French', 0.6274688839912415),
 ('Hopital_Europeen_Georges_Pompidou', 0.5950257778167725),
 ('Avignon', 0.5637456774711609),
 ('Grigny_south', 0.5540268421173096),
 ('Spyker_D##_Peking', 0.5531150698661804),
 ('de_France', 0.5530701875686646),
 ('Saint_Honoré', 0.5517325401306152),
 ('Picardie', 0.5504419207572937),
 ('Nanterre_west', 0.548758864402771)]

In [22]:
# Query: words close to "man" and "woman" while far from "boy".
wv_pretrained.most_similar(positive=["man", "woman"], negative=["boy"])

[('lady', 0.5354641675949097),
 ('person', 0.5296355485916138),
 ('Woman', 0.513024628162384),
 ('men', 0.4956325590610504),
 ('policewoman', 0.4909152388572693),
 ('WOMAN', 0.4802447259426117),
 ('motorist', 0.47880926728248596),
 ('women', 0.47408223152160645),
 ('teenage_girl', 0.4722822904586792),
 ('businesswoman', 0.469870001077652)]

In [24]:
# Query: words close to "happy" and "happiness" while far from "sad".
wv_pretrained.most_similar(positive=["happy", "happiness"], negative=["sad"])

[('contentment', 0.5909623503684998),
 ('satisfaction', 0.5197392106056213),
 ('happier', 0.47888270020484924),
 ('contented', 0.4626934230327606),
 ('joy', 0.45936357975006104),
 ('Happiness', 0.45890146493911743),
 ('Adobe_CS2_Premium', 0.45625054836273193),
 ('satisfied', 0.44240081310272217),
 ('prosocial_spending', 0.43790724873542786),
 ('marital_bliss', 0.4179442822933197)]

In [23]:
# Query: words close to "walking" and "walk" while far from "swimming".
wv_pretrained.most_similar(positive=["walking", "walk"], negative=["swimming"])

[('walked', 0.6889522671699524),
 ('walks', 0.6313304901123047),
 ('Walking', 0.5557611584663391),
 ('stroll', 0.5340489149093628),
 ('strolls', 0.5130729675292969),
 ('wander', 0.4973868429660797),
 ('strolling', 0.48373162746429443),
 ('saunter', 0.48220333456993103),
 ('trudging', 0.46994322538375854),
 ('striding', 0.4681999683380127)]

In [25]:
# Load the dataset
# The CSV location can be overridden with the MOVIE_REVIEWS_CSV environment variable so the
# notebook is runnable on other machines; the original local path remains the fallback.
MOVIE_REVIEWS_CSV = os.environ.get('MOVIE_REVIEWS_CSV', r'D:\school stuff\SEM 7\movie_reviews.csv')
df = pd.read_csv(MOVIE_REVIEWS_CSV)





In [26]:
# Show the loaded DataFrame as this cell's output.
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [29]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() emits an extra "None" line after the summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None


In [28]:
# Count how many reviews carry each sentiment label (class-balance check).
print(df['sentiment'].value_counts())

positive    25000
negative    25000
Name: sentiment, dtype: int64


In [33]:
# Compiled once. re.escape keeps ']', '\', '^' and '-' literal inside the character
# class; without it the "\]" pair is read as an escaped bracket and backslashes are
# never stripped.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")

# English stop-word set, filled in on the first preprocess_text() call.
_ENGLISH_STOPWORDS = None


def preprocess_text(text):
    """Lower-case a review, strip ASCII punctuation, tokenize, and drop stop words.

    Args:
        text: The raw review as a string.

    Returns:
        A list of lower-cased tokens with every character of
        ``string.punctuation`` removed and English stop words filtered out.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Build the stop-word set once instead of re-reading the word list for
        # every token; set membership is also O(1) versus a list scan.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    text = _PUNCTUATION_RE.sub("", text.lower())
    return [word for word in word_tokenize(text) if word not in _ENGLISH_STOPWORDS]


In [34]:
# Apply text cleaning
# Runs preprocess_text on every review and stores the resulting token lists
# in a new 'cleaned_text' column.
df['cleaned_text'] = df['review'].apply(preprocess_text)

In [35]:
# Split dataset
# Hold out 20% of the tokenized reviews for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['sentiment'],
    test_size=0.2,
    random_state=1,
)

In [36]:
# Encode labels
# Fit the encoder on the training labels only, then map both splits to integer codes.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [37]:
# Create a list of sentences for Word2Vec training
# Only the training split is used, one token list per review.
sentences = list(X_train)

In [40]:
# Train Skip-gram model
# sg=1 selects the skip-gram objective; words seen fewer than 2 times are dropped.
skipgram_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=2,
    sg=1,
)

# Persist the model twice: in gensim's native format and as a plain pickle.
skipgram_model.save("skipgram_model.model")
with open('skipgram_model.pkl', 'wb') as pickle_file:
    pickle.dump(skipgram_model, pickle_file)

In [41]:
# Train CBOW model
# Same hyperparameters as the Skip-gram model, but sg=0 selects the CBOW objective.
cbow_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=2,
    sg=0,
)

# Persist the model twice: in gensim's native format and as a plain pickle.
cbow_model.save("cbow_model.model")
with open('cbow_model.pkl', 'wb') as pickle_file:
    pickle.dump(cbow_model, pickle_file)

In [42]:
# Alias the pretrained vectors loaded earlier (nothing is loaded here) and
# pickle them to 'pretrained_word2vec_model.pkl'.
pretrained_model = wv_pretrained
with open('pretrained_word2vec_model.pkl', 'wb') as f:
    pickle.dump(pretrained_model, f)

In [46]:
# Vectorize using custom Word2Vec models
def get_word2vec_vectors(model, text, is_pretrained=False):
    """Average the embeddings of the in-vocabulary tokens of one document.

    Args:
        model: Embedding source. When ``is_pretrained`` is true it is indexed
            directly (``model[word]``); otherwise its ``.wv`` attribute is used.
        text: Iterable of tokens.
        is_pretrained: Selects which of the two lookup styles above is used.

    Returns:
        The element-wise mean of the found vectors, or a zero vector of length
        ``model.vector_size`` when no token is in the vocabulary.
    """
    found = []
    for token in text:
        # Resolve the lookup table per token so `.wv` is only touched when
        # there is at least one token to look up.
        keyed = model if is_pretrained else model.wv
        if token in keyed:
            found.append(keyed[token])

    if not found:
        return np.zeros(model.vector_size)
    return np.mean(found, axis=0)

In [48]:
# Prepare train and test sets for each model
# Every document becomes one averaged embedding; train is built before test for each model.
X_train_skipgram, X_test_skipgram = (
    np.array([get_word2vec_vectors(skipgram_model, tokens) for tokens in split])
    for split in (X_train, X_test)
)

X_train_cbow, X_test_cbow = (
    np.array([get_word2vec_vectors(cbow_model, tokens) for tokens in split])
    for split in (X_train, X_test)
)

X_train_pretrained, X_test_pretrained = (
    np.array([get_word2vec_vectors(wv_pretrained, tokens, is_pretrained=True) for tokens in split])
    for split in (X_train, X_test)
)

In [49]:
# Train classifiers
def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Fit a 50-tree random forest and score it on the held-out split.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A ``(accuracy, report)`` tuple: the accuracy score on the test split and
        the text produced by ``classification_report``. The fitted classifier
        itself is not returned.
    """
    forest = RandomForestClassifier(n_estimators=50, random_state=1).fit(X_train, y_train)
    predictions = forest.predict(X_test)
    return accuracy_score(y_test, predictions), classification_report(y_test, predictions)


In [52]:
# Skip-gram model performance
# Trains and scores a classifier on the Skip-gram features; yields (accuracy, report).
skipgram_accuracy, skipgram_report = train_and_evaluate(X_train_skipgram, X_test_skipgram, y_train, y_test)



In [53]:
# CBOW model performance
# Trains and scores a classifier on the CBOW features; yields (accuracy, report).
cbow_accuracy, cbow_report = train_and_evaluate(X_train_cbow, X_test_cbow, y_train, y_test)



In [54]:
# Pretrained Word2Vec model performance
# Trains and scores a classifier on the pretrained-vector features; yields (accuracy, report).
pretrained_accuracy, pretrained_report = train_and_evaluate(X_train_pretrained, X_test_pretrained, y_train, y_test)

In [55]:
# Create a table of results
# One row per embedding variant, in the order the classifiers were evaluated.
model_names = ['Skip-gram', 'CBOW', 'Pretrained Word2Vec']
accuracies = [skipgram_accuracy, cbow_accuracy, pretrained_accuracy]
results = pd.DataFrame({'Model': model_names, 'Accuracy': accuracies})

print(results)

                 Model  Accuracy
0            Skip-gram    0.8510
1                 CBOW    0.8265
2  Pretrained Word2Vec    0.8044


In [56]:
# Save the trained classifiers
# The *_classifier.pkl files must contain random-forest classifiers, not the Word2Vec
# embedding models. train_and_evaluate() does not return its fitted estimator, so an
# identically configured forest (same n_estimators and random_state) is refit on each
# training feature matrix and that estimator is pickled.
classifier_features = {
    'skipgram_classifier.pkl': X_train_skipgram,
    'cbow_classifier.pkl': X_train_cbow,
    'pretrained_classifier.pkl': X_train_pretrained,
}

for classifier_path, train_features in classifier_features.items():
    classifier = RandomForestClassifier(n_estimators=50, random_state=1)
    classifier.fit(train_features, y_train)
    with open(classifier_path, 'wb') as f:
        pickle.dump(classifier, f)