### Part 1: NLP

Solve spam sms detection problem using Gensim word2vec. You can use algorithm of your choice to train and evaluate the model.

In [2]:
pip install scikit-plot

Collecting scikit-plot
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Installing collected packages: scikit-plot
Successfully installed scikit-plot-0.3.7


In [3]:
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import scikitplot as skplt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

stop_words = stopwords.words('english')
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
from google.colab import files
uploaded = files.upload()

Saving spam.csv to spam.csv


In [5]:
df = pd.read_csv('spam.csv', encoding='latin-1')

In [6]:
df=df.iloc[:,[0,1]]
df.columns=['label','sms_text']
df.head(10)

Unnamed: 0,label,sms_text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [7]:
# Preprocessing the dataset
df['patterns'] = df['sms_text'].apply(lambda x:' '.join(x.lower() for x in x.split()))
df['patterns']= df['patterns'].apply(lambda x: ' '.join(x for x in x.split() if x not in string.punctuation))
df['patterns']= df['patterns'].str.replace('[^\w\s]','')
df['patterns']= df['patterns'].apply(lambda x: ' '.join(x for x in x.split() if  not x.isdigit()))
df['patterns'] = df['patterns'].apply(lambda x:' '.join(x for x in x.split() if not x in stop_words))
df['patterns'] = df['patterns'].apply(lambda x: " ".join([wordnet_lemmatizer.lemmatize(word) for word in x.split()]))
df['patterns'] = df.apply(lambda row: nltk.word_tokenize(row['patterns']), axis=1)

display(df)

Unnamed: 0,label,sms_text,patterns
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, go, usf, life, around, though]"
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,"[2nd, time, tried, contact, u, u, å750, pound,..."
5568,ham,Will Ì_ b going to esplanade fr home?,"[ì_, b, going, esplanade, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other s...","[pity, mood, soany, suggestion]"
5570,ham,The guy did some bitching but I acted like i'd...,"[guy, bitching, acted, like, id, interested, b..."


In [8]:
# Skip-gram model (sg = 1)
import time
import gensim
from gensim.models import Word2Vec

size = 1000
window = 3
min_count = 1
workers = 3
sg = 1 # The training algorithm, either Continuous Bag-of-Words (0) or Skip-gram (1).

start_time = time.time()
tokens = pd.Series(df['patterns']).values
# Train the Word2Vec Model
w2v_model = Word2Vec(tokens, min_count = min_count, size = size, workers = workers, window = window, sg = sg)
print("Time taken to train word2vec model: " + str(time.time() - start_time))

word2vec_model_file = 'word2vec_' + str(size) + '.model'
w2v_model.save(word2vec_model_file)

Time taken to train word2vec model: 4.604990005493164


In [9]:
w2v_model.most_similar("answer")

[('start', 0.9999500513076782),
 ('play', 0.9999498724937439),
 ('enjoy', 0.9999489188194275),
 ('girl', 0.9999482035636902),
 ('use', 0.9999478459358215),
 ('dis', 0.999947726726532),
 ('two', 0.9999474883079529),
 ('question', 0.9999470710754395),
 ('plan', 0.9999469518661499),
 ('next', 0.9999468326568604)]

In [10]:
# Load the model from the model file
sg_w2v_model = Word2Vec.load(word2vec_model_file)

# Total number of the words 
print("Total number of words")
print(len(sg_w2v_model.wv.vocab))

Total number of words
8355


In [11]:
# Store the vectors for train data in following file
word2vec_filename = 'all_review_word2vec.csv'

with open(word2vec_filename, 'w+') as word2vec_file:
    for index, row in df.iterrows():
        model_vector = (np.mean([sg_w2v_model[token] for token in row['patterns']], axis=0)).tolist()
        
        if index == 0:
            header = ",".join(str(ele) for ele in range(1000))
            word2vec_file.write(header)
            word2vec_file.write("\n")
        # Check if the line exists else it is vector of zeros
        if type(model_vector) is list:  
            line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
        else:
            line1 = ",".join([str(0) for i in range(size)])
        word2vec_file.write(line1)
        word2vec_file.write('\n')

In [12]:
word2vec_df = pd.read_csv(word2vec_filename)
word2vec_df['label'] = df['label']

# Encode labels
word2vec_df = pd.get_dummies(word2vec_df, columns=['label'])

print(word2vec_df.head(10))

          0         1         2  ...       999  label_ham  label_spam
0 -0.004731 -0.052729  0.000149  ... -0.017479          1           0
1 -0.004711 -0.055825 -0.000336  ... -0.017922          1           0
2 -0.003791 -0.048261  0.000364  ... -0.016381          0           1
3 -0.006609 -0.075886 -0.000475  ... -0.024078          1           0
4 -0.006131 -0.068348  0.000341  ... -0.021893          1           0
5 -0.005194 -0.058095  0.000279  ... -0.019133          0           1
6 -0.005205 -0.054824  0.000313  ... -0.017762          1           0
7 -0.003303 -0.037369  0.000269  ... -0.012398          1           0
8 -0.004304 -0.060290 -0.000974  ... -0.021639          0           1
9 -0.005494 -0.079930 -0.000186  ... -0.027577          0           1

[10 rows x 1002 columns]


Model Trainig and Classification

In [13]:
headers = list(word2vec_df)
headers.remove('label_ham')
headers.remove('label_spam')

X = np.array(word2vec_df[headers].values.tolist())
y = np.array(word2vec_df[['label_ham', 'label_spam']].values.tolist())

In [14]:
# Split data to training (70%) and testing (30%)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print("Value counts for training \n")
print(y_train[:, 0].size)
print("\n")
print("Value counts for testing \n")
print(y_test[:, 0].size)

Value counts for training 

3900


Value counts for testing 

1672


In [15]:
from sklearn.tree import  DecisionTreeClassifier

# Initialise Decision Tree
clf = DecisionTreeClassifier()
# Fit model
model = clf.fit(X_train, y_train)
# Predict testing target labels
prediction = model.predict(X_test)

print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      1454
           1       0.76      0.78      0.77       218

   micro avg       0.94      0.94      0.94      1672
   macro avg       0.86      0.87      0.87      1672
weighted avg       0.94      0.94      0.94      1672
 samples avg       0.94      0.94      0.94      1672



In [16]:
#5 fold cross validation)
scores = cross_val_score(model, X_train, y_train, cv=5)

In [17]:
#Accuracy
accuracy = accuracy_score(y_test, prediction)

In [18]:
print("acurracy " + str(accuracy))
print ('\n')
print('validation scores: ', scores)

acurracy 0.9389952153110048


validation scores:  [0.92948718 0.93589744 0.92307692 0.93717949 0.95      ]


Experimented with 
- wordnetlemmentizer instead of portstemmer, 
- skip-gram instead of CBOW,
- used decision tree classification to train the model,
- Evaluated using classification report.

### Part 2: Content Based Recommender

Build a system that recommends movies that are similar to a particular movie

In [None]:
# Load 'overview' feature in metadata dataset

In [19]:
from google.colab import files
uploaded = files.upload()

Saving movies_metadata.csv to movies_metadata.csv


In [21]:
movie_dataset = pd.read_csv('movies_metadata.csv')
movie_dataset.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.38752,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [22]:
# I got memory error while computing cosine similarity between 45k samples.So, i have limited the samples to 30k
df = movie_dataset[['id','title','overview']][0:30000]
df.head()

Unnamed: 0,id,title,overview
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [23]:
df.shape

(30000, 3)

In [None]:
# Use your judgement to preprocess data 

In [24]:
df.isnull().sum()

id            0
title         4
overview    327
dtype: int64

In [25]:
# drop missing rows.
df = df.dropna()

In [26]:
df.isnull().sum()

id          0
title       0
overview    0
dtype: int64

In [28]:
df.shape

(29669, 3)

In [None]:
# Construct TF-IDF matrix

In [29]:
tfidf = TfidfVectorizer(stop_words='english')

In [30]:
tfidf_matrix = tfidf.fit_transform(df['overview'])
tfidf_matrix.shape 

(29669, 58562)

In [None]:
# Compute cosine similarity score between movies

In [31]:
cosine_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix) 
cosine_similarity.shape

(29669, 29669)

In [32]:
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='last')]

In [33]:
indices

title
Toy Story                                0
Jumanji                                  1
Grumpier Old Men                         2
Waiting to Exhale                        3
Father of the Bride Part II              4
                                     ...  
Seventeen Again                      29995
Sweet Sixteen                        29996
The Disappearance of Garcia Lorca    29997
The Dramatics: A Comedy              29998
Up the Creek                         29999
Length: 27862, dtype: int64

In [34]:
target_movie_index = indices['Toy Story']
target_movie_index

0

In [36]:
cosine_similarity[target_movie_index]

array([1.        , 0.01560379, 0.        , ..., 0.        , 0.        ,
       0.        ])

In [38]:
# We created a dataframe by using the target_movie_index's similarities
similarity_scores = pd.DataFrame(cosine_similarity[target_movie_index], columns=["score"]) 
similarity_scores

Unnamed: 0,score
0,1.000000
1,0.015604
2,0.000000
3,0.000000
4,0.000000
...,...
29664,0.013362
29665,0.023931
29666,0.000000
29667,0.000000


In [39]:
movie_indices = similarity_scores.sort_values("score", ascending=False)[0:11].index 

In [40]:
df['title'].iloc[movie_indices]

0                          Toy Story
15348                    Toy Story 3
2997                     Toy Story 2
10301         The 40 Year Old Virgin
24523                      Small Fry
23843    Andy Hardy's Blonde Trouble
29202                     Hot Splash
8327                       The Champ
27206     Life Begins for Andy Hardy
1071           Rebel Without a Cause
26304         You're Only Young Once
Name: title, dtype: object

In [None]:
# Take movie title as input and output 10 most similar movies

In [41]:
def get_films_by_name(movie_name, movie_indices):
    return movie_indices[movie_indices.index.str.contains(movie_name, na=False)]

In [42]:
get_films_by_name('Lord',indices)

title
Lord of Illusions                                      174
The Lord of the Rings                                 2007
The Lords of Flatbush                                 3468
Phantasm III: Lord of the Dead                        3715
Lord of the Flies                                     4821
The Lord of the Rings: The Fellowship of the Ring     4863
The Lord of the Rings: The Two Towers                 5814
Dragon Lord                                           5957
Greystoke: The Legend of Tarzan, Lord of the Apes     6839
The Lord of the Rings: The Return of the King         7000
Lord Love a Duck                                      7029
At Play in the Fields of the Lord                     7653
Lord Jim                                              8125
Something the Lord Made                               9493
Edges of the Lord                                    10044
Lords of Dogtown                                     10101
Lord of War                                       

In [43]:
def get_recommended_films(target_movie_index, movie_similarities,movies_df):
    similarity_scores = pd.DataFrame(movie_similarities[target_movie_index], columns=["score"]) 
    movie_indices = similarity_scores.sort_values("score", ascending=False)[0:11].index 
    return df['title'].iloc[movie_indices]

In [45]:
get_recommended_films(2007,cosine_similarity,df)

2023     Who's Afraid of Virginia Woolf?
23382                  Burton and Taylor
21149                         Liz & Dick
2956                       The Sandpiper
27022                  Hansel and Gretel
2425                      The Dancemaker
18337                          Newlyweds
10146              Satan's Little Helper
12591                            Redbelt
24086                          Honeymoon
5415                        Flaming Star
Name: title, dtype: object