# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from gensim.models import Word2Vec

In [2]:
df = pd.read_csv('../data/processed.csv')

In [3]:
df.head()

Unnamed: 0,id,title,weighted_rating,description
0,19995,Avatar,7.05058,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,6.665668,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,6.23955,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,7.346536,Following the death of District Attorney Harve...
4,49529,John Carter,6.096711,"John Carter is a war-weary, former military ca..."


# Lowercasing

In [4]:
df['description'] = df['description'].str.lower()

In [5]:
df.head()

Unnamed: 0,id,title,weighted_rating,description
0,19995,Avatar,7.05058,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,6.665668,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,6.23955,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,7.346536,following the death of district attorney harve...
4,49529,John Carter,6.096711,"john carter is a war-weary, former military ca..."


# Remove Punctuation

In [6]:
def remove_punc(text):
    return text.translate(str.maketrans('','',string.punctuation))
df['description'] = df['description'].apply(remove_punc)

In [7]:
df.head()

Unnamed: 0,id,title,weighted_rating,description
0,19995,Avatar,7.05058,in the 22nd century a paraplegic marine is dis...
1,285,Pirates of the Caribbean: At World's End,6.665668,captain barbossa long believed to be dead has ...
2,206647,Spectre,6.23955,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,7.346536,following the death of district attorney harve...
4,49529,John Carter,6.096711,john carter is a warweary former military capt...


# Removing Stop Words

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ANIKET\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
stopword_list = stopwords.words('english')

In [10]:
def remove_stopwords(text):
    filtered_word = [i for i in text.split() if i not in stopword_list]
    return ' '.join(filtered_word)

In [11]:
df['description'] = df['description'].apply(remove_stopwords)

In [12]:
df.head()

Unnamed: 0,id,title,weighted_rating,description
0,19995,Avatar,7.05058,22nd century paraplegic marine dispatched moon...
1,285,Pirates of the Caribbean: At World's End,6.665668,captain barbossa long believed dead come back ...
2,206647,Spectre,6.23955,cryptic message bond’s past sends trail uncove...
3,49026,The Dark Knight Rises,7.346536,following death district attorney harvey dent ...
4,49529,John Carter,6.096711,john carter warweary former military captain w...


# Tokenization & Lemmatization

In [13]:
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ANIKET\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ANIKET\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
wnl = WordNetLemmatizer()
def process(text):
    tokens = word_tokenize(text)
    filtered_tokens = [wnl.lemmatize(word) for word in tokens]
    return filtered_tokens
df['tokens'] = df['description'].apply(process)

In [15]:
df.head()

Unnamed: 0,id,title,weighted_rating,description,tokens
0,19995,Avatar,7.05058,22nd century paraplegic marine dispatched moon...,"[22nd, century, paraplegic, marine, dispatched..."
1,285,Pirates of the Caribbean: At World's End,6.665668,captain barbossa long believed dead come back ...,"[captain, barbossa, long, believed, dead, come..."
2,206647,Spectre,6.23955,cryptic message bond’s past sends trail uncove...,"[cryptic, message, bond, ’, s, past, sends, tr..."
3,49026,The Dark Knight Rises,7.346536,following death district attorney harvey dent ...,"[following, death, district, attorney, harvey,..."
4,49529,John Carter,6.096711,john carter warweary former military captain w...,"[john, carter, warweary, former, military, cap..."


# Train Word2Vec Model

In [16]:
cbow_model = Word2Vec(sentences=df['tokens'],vector_size=100,window=5,min_count=1,sg=0,alpha=0.02,min_alpha=0.0007,epochs=80)

In [17]:
print(len(cbow_model.wv))

38672


In [18]:
cbow_model.wv.most_similar('batman')

[('annihilation', 0.589390218257904),
 ('avenger', 0.585176944732666),
 ('joker', 0.570216178894043),
 ('izmir', 0.5553566813468933),
 ('catburglar', 0.5467385053634644),
 ('ultron', 0.5466200113296509),
 ('enacting', 0.5465300679206848),
 ('magna', 0.5450161099433899),
 ('symbol', 0.5445861220359802),
 ('pharaoh', 0.5439357757568359)]

# Average Vector for Movies

In [19]:
def generate_avg(tokens):
    vectors = [cbow_model.wv[word] for word in tokens if word in cbow_model.wv]
    if not vectors:
        return np.zeros(cbow_mpdel.vector_size)
    return np.mean(vectors,axis=0)
df['vectors'] = df['tokens'].apply(generate_avg)

In [20]:
df.head()

Unnamed: 0,id,title,weighted_rating,description,tokens,vectors
0,19995,Avatar,7.05058,22nd century paraplegic marine dispatched moon...,"[22nd, century, paraplegic, marine, dispatched...","[-0.16202632, 0.12982473, -0.35071015, -0.1620..."
1,285,Pirates of the Caribbean: At World's End,6.665668,captain barbossa long believed dead come back ...,"[captain, barbossa, long, believed, dead, come...","[-0.373258, 0.038260866, -0.26240003, -0.26230..."
2,206647,Spectre,6.23955,cryptic message bond’s past sends trail uncove...,"[cryptic, message, bond, ’, s, past, sends, tr...","[-0.2768787, -0.5192937, -0.8795872, -0.392900..."
3,49026,The Dark Knight Rises,7.346536,following death district attorney harvey dent ...,"[following, death, district, attorney, harvey,...","[0.15796813, 0.19392867, -0.11643412, -0.15233..."
4,49529,John Carter,6.096711,john carter warweary former military captain w...,"[john, carter, warweary, former, military, cap...","[-0.07674223, 0.0494658, -0.48689413, -0.46356..."


In [21]:
def combine_vector_with_rating(row):
    vector = row['vectors']
    rating = row['weighted_rating']
    return np.append(vector,rating)
df['combined_vector'] = df.apply(combine_vector_with_rating,axis=1)

In [22]:
df.head()

Unnamed: 0,id,title,weighted_rating,description,tokens,vectors,combined_vector
0,19995,Avatar,7.05058,22nd century paraplegic marine dispatched moon...,"[22nd, century, paraplegic, marine, dispatched...","[-0.16202632, 0.12982473, -0.35071015, -0.1620...","[-0.1620263159275055, 0.12982472777366638, -0...."
1,285,Pirates of the Caribbean: At World's End,6.665668,captain barbossa long believed dead come back ...,"[captain, barbossa, long, believed, dead, come...","[-0.373258, 0.038260866, -0.26240003, -0.26230...","[-0.37325799465179443, 0.038260865956544876, -..."
2,206647,Spectre,6.23955,cryptic message bond’s past sends trail uncove...,"[cryptic, message, bond, ’, s, past, sends, tr...","[-0.2768787, -0.5192937, -0.8795872, -0.392900...","[-0.2768787145614624, -0.5192937254905701, -0...."
3,49026,The Dark Knight Rises,7.346536,following death district attorney harvey dent ...,"[following, death, district, attorney, harvey,...","[0.15796813, 0.19392867, -0.11643412, -0.15233...","[0.15796813368797302, 0.19392867386341095, -0...."
4,49529,John Carter,6.096711,john carter warweary former military captain w...,"[john, carter, warweary, former, military, cap...","[-0.07674223, 0.0494658, -0.48689413, -0.46356...","[-0.07674223184585571, 0.04946580156683922, -0..."


In [23]:
df['vectors'][0].shape

(100,)

In [24]:
df['combined_vector'][0].shape

(101,)

# Saving Model

In [25]:
df[['id','title','combined_vector']].to_pickle('../models/movies.pkl')