## Part 1: Pretrained Word2Vec — similar words and analogies

In [1]:
import gensim.downloader as api
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np

In [2]:
# Load the pretrained "word2vec-google-news-300" embeddings through gensim's downloader.
model = api.load("word2vec-google-news-300")

In [3]:
# Probe words whose nearest neighbours we want to inspect.
words = ["tree", "car", "bike", "sea", "sound"]

# Map every probe word to its five most similar vocabulary entries.
similar_words = {}
for probe in words:
    similar_words[probe] = model.most_similar(probe, topn=5)

# Show one line per probe word: the word followed by its (neighbour, score) pairs.
for word, similar in similar_words.items():
    print(f"{word}: {similar}")

tree: [('trees', 0.8293122053146362), ('pine_tree', 0.7622087001800537), ('oak_tree', 0.731893002986908), ('evergreen_tree', 0.6926872730255127), ('fir_tree', 0.6917218565940857)]
car: [('vehicle', 0.7821096181869507), ('cars', 0.7423831224441528), ('SUV', 0.7160962224006653), ('minivan', 0.6907036900520325), ('truck', 0.6735789775848389)]
bike: [('bicycle', 0.8521308898925781), ('bikes', 0.8127913475036621), ('mountain_bike', 0.7518467307090759), ('scooter', 0.749575674533844), ('motorcycle', 0.7016158699989319)]
sea: [('ocean', 0.7643541693687439), ('seas', 0.6712585687637329), ('oceans', 0.6193016767501831), ('waters', 0.5993286371231079), ('seawaters', 0.5960041284561157)]
sound: [('sounds', 0.7409015893936157), ('sounding', 0.6120067834854126), ('boomy', 0.585667073726654), ('sonics', 0.5831336379051208), ('synthesized_strings', 0.5773767828941345)]


In [4]:
# Analogy triples (a, b, c); each one is evaluated as the vector arithmetic a - b + c.
analogies = [
    ("Germany", "Hitler", "Modi"),
    ("India", "Gandhi", "Hitler"),
    ("India", "Freedom", "Bollywood"),
]

for a, b, c in analogies:
    # Request exactly the five neighbours that are printed, rather than fetching
    # the default result list and slicing it down afterwards.
    result = model.most_similar(positive=[a, c], negative=[b], topn=5)
    print(f"{a} - {b} + {c} = {result}")

Germany - Hitler + Modi = [('India', 0.5639522075653076), ('Maharashtra', 0.5544657111167908), ('Karnataka', 0.5436393618583679), ('Pawar', 0.5428828597068787), ('Bangalore', 0.53938889503479)]
India - Gandhi + Hitler = [('Germany', 0.5910069346427917), ('Nazi_Germany', 0.5207065343856812), ('United_States', 0.49803826212882996), ('Nazis', 0.4878699779510498), ('Poland', 0.47855788469314575)]
India - Freedom + Bollywood = [('desi', 0.6297768950462341), ('bollywood', 0.6296355724334717), ('Mumbai', 0.6205345988273621), ('Hindi_cinema', 0.6103505492210388), ('Akshay_Kumar', 0.6102105975151062)]


In [5]:
# Nearest neighbours of the offset vector "India" - "Freedom".
# Keys are passed as lists, the documented form of `positive` / `negative`,
# instead of relying on gensim to coerce bare strings into one-element lists.
model.most_similar(positive=["India"], negative=["Freedom"])

[('Indias', 0.5212128162384033),
 ('sub_continent', 0.505436360836029),
 ('subcontinent', 0.5039855241775513),
 ('Bangalore', 0.49730417132377625),
 ('Mumbai', 0.4909266233444214),
 ('Indiaâ_€_™', 0.4874730408191681),
 ('India.The', 0.4776294231414795),
 ('Chennai', 0.4670253098011017),
 ('Delhi', 0.4649043381214142),
 ('Gujarat', 0.46227097511291504)]

## Part 2: IMDB sentiment classification with custom Word2Vec embeddings

In [6]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [7]:
# Read the IMDB reviews CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('IMDB Dataset.csv')

In [8]:
# Inspect column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [9]:
# Preview the first few rows.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [10]:
# Count reviews per sentiment label to check class balance.
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [11]:
# Make sure NLTK's stopword corpus is available locally.
nltk.download('stopwords')
# Keep the English stopwords in a set; clean_text tests membership against it.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\naiti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
def clean_text(text, stopword_set=None):
    """Normalise one raw review into a space-separated string of kept words.

    Steps: replace HTML line-break tags with a space, delete every character
    that is neither an ASCII letter nor whitespace, lowercase, split on
    whitespace, and drop stopwords.

    Parameters
    ----------
    text : str
        Raw review text.
    stopword_set : collection of str, optional
        Words to drop. When omitted, the notebook-level ``stop_words`` is used.

    Returns
    -------
    str
        The remaining lowercase words joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words
    # Match every spelling of the break tag (<br>, <br/>, <br />, <BR>), not
    # only the exact "<br />", so no variant survives as a stray "br" token.
    text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)
    # Characters are deleted rather than replaced by a space, so "isn't" -> "isnt".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = text.lower().split()
    return ' '.join(word for word in tokens if word not in stopword_set)

# Store a cleaned copy of every review in a new column next to the raw text.
df['cleaned_review'] = df['review'].apply(clean_text)

In [13]:
# Tokenise each cleaned review on whitespace: one list of words per review.
sentences = list(map(str.split, df['cleaned_review']))

In [14]:
# Features are the tokenised reviews.
# NOTE(review): the later split cells pass `sentences` directly, so `X` is not
# read again in the visible part of the notebook.
X = sentences
# Encode the sentiment strings as integer class labels.
y = LabelEncoder().fit_transform(df['sentiment'])

In [15]:
# Hold out 30% of the reviews for testing. A fixed random_state makes the split
# (and every score derived from it) reproducible across runs, and stratify=y
# keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    sentences, y, test_size=0.3, random_state=42, stratify=y
)

In [16]:
# Train skip-gram (sg=1) embeddings on the training reviews only:
# 70-dimensional vectors, context window of 5, min_count=5 vocabulary cut-off.
# NOTE(review): no seed is passed, so the learned vectors may differ between runs.
skipgram_model = Word2Vec(X_train, vector_size=70, window=5, min_count=5, sg=1)

In [17]:
def transform_text_to_vector(model, text):
    """Embed one document as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    model : object
        Must expose ``wv`` (supporting ``word in wv`` and ``wv[word]``) and
        ``vector_size``, as the Word2Vec models trained in this notebook do.
    text : iterable of str, or str
        The document's tokens. A plain string is split on whitespace first.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in ``model.wv``,
        or a zero vector of length ``model.vector_size`` when none is found.
    """
    # A bare string would otherwise be iterated character by character and
    # produce a meaningless average (or all zeros).
    tokens = text.split() if isinstance(text, str) else text
    embeddings = model.wv
    word_vectors = [embeddings[token] for token in tokens if token in embeddings]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

In [18]:
# Replace each tokenised review with its averaged skip-gram embedding.
# NOTE: this overwrites the token lists held in X_train / X_test, so re-run the
# train/test split cell before executing this cell a second time.
X_train = np.array([transform_text_to_vector(skipgram_model, text) for text in X_train])
X_test = np.array([transform_text_to_vector(skipgram_model, text) for text in X_test])
# y_train / y_test are used as they are: the labels were integer-encoded before
# the split, and fitting separate encoders on each partition could assign
# different ids if one partition ever lacked a class.

In [19]:
# Check that the feature matrix and the label vector have the same number of rows.
X_train.shape, y_train.shape

((35000, 70), (35000,))

In [20]:
# Fit a logistic-regression sentiment classifier on the averaged embeddings,
# with the solver's iteration cap set to 1000.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

In [21]:
# Score the classifier on the held-out reviews and print its accuracy.
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8745333333333334


In [23]:
# Split the tokenised reviews again (X_train / X_test were overwritten with
# embedding vectors above). A fixed random_state makes the split reproducible
# across runs, and stratify=y keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    sentences, y, test_size=0.3, random_state=42, stratify=y
)

In [24]:
# Train CBoW (sg=0) embeddings on the training reviews only:
# 100-dimensional vectors, context window of 5, min_count=5 vocabulary cut-off.
# NOTE(review): vector_size is 100 here but 70 for the skip-gram model, so the two
# accuracy scores are not a like-for-like comparison; confirm this is intended.
cbow_model = Word2Vec(X_train, vector_size=100, window=5, min_count=5, sg=0)

In [25]:
# Replace each tokenised review with its averaged CBoW embedding.
# NOTE: this overwrites the token lists held in X_train / X_test, so re-run the
# train/test split cell before executing this cell a second time.
X_train = np.array([transform_text_to_vector(cbow_model, text) for text in X_train])
X_test = np.array([transform_text_to_vector(cbow_model, text) for text in X_test])
# y_train / y_test are used as they are: the labels were integer-encoded before
# the split, and fitting separate encoders on each partition could assign
# different ids if one partition ever lacked a class.

In [26]:
# Train logistic regression on the CBoW document vectors, then report the
# held-out accuracy under a heading line.
classifier = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print("Custom CBoW Vectors", accuracy_score(y_test, y_pred), sep="\n")

Custom CBoW Vectors
0.8547333333333333
