In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Read imdb_top_1000.csv; the path has no directory component, so it is resolved
# against the notebook's current working directory.
df = pd.read_csv("imdb_top_1000.csv")
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,https://m.media-amazon.com/images/M/MV5BNGEwMT...,Breakfast at Tiffany's,1961,A,115 min,"Comedy, Drama, Romance",7.6,A young New York socialite becomes interested ...,76.0,Blake Edwards,Audrey Hepburn,George Peppard,Patricia Neal,Buddy Ebsen,166544,
996,https://m.media-amazon.com/images/M/MV5BODk3Yj...,Giant,1956,G,201 min,"Drama, Western",7.6,Sprawling epic covering the life of a Texas ca...,84.0,George Stevens,Elizabeth Taylor,Rock Hudson,James Dean,Carroll Baker,34075,
997,https://m.media-amazon.com/images/M/MV5BM2U3Yz...,From Here to Eternity,1953,Passed,118 min,"Drama, Romance, War",7.6,"In Hawaii in 1941, a private is cruelly punish...",85.0,Fred Zinnemann,Burt Lancaster,Montgomery Clift,Deborah Kerr,Donna Reed,43374,30500000
998,https://m.media-amazon.com/images/M/MV5BZTBmMj...,Lifeboat,1944,,97 min,"Drama, War",7.6,Several survivors of a torpedoed merchant ship...,78.0,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471,


In [18]:
# Show the column labels of the loaded frame.
df.columns

Index(['Poster_Link', 'Series_Title', 'Released_Year', 'Certificate',
       'Runtime', 'Genre', 'IMDB_Rating', 'Overview', 'Meta_score', 'Director',
       'Star1', 'Star2', 'Star3', 'Star4', 'No_of_Votes', 'Gross'],
      dtype='object')

In [5]:
# Count missing values per column.
df.isna().sum()

Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64

We use the Certificate column as a feature, so its missing values must be handled before concatenating it with the other columns.

In [6]:
# Impute missing certificates with the most frequent certificate value
# (the first entry returned by mode()).
most_common_certificate = df["Certificate"].mode()[0]
df["Certificate"] = df["Certificate"].fillna(most_common_certificate)

The Overview column contains many words, which could reduce the relative importance of the values from the other columns, so it is not included.

In [7]:
# Build one text document per movie by joining the selected metadata columns
# with single spaces, in the order listed.
feature_columns = ["Series_Title", "Certificate", "Genre", "Director", "Star1", "Star2"]
x = df[feature_columns[0]]
for column_name in feature_columns[1:]:
    x = x + " " + df[column_name]

In [8]:
stop = stopwords.words("english")
# Frozen copy used for O(1) membership tests inside preprocess(); `stop` itself
# is left untouched as returned by NLTK.
_stop_lookup = frozenset(stop)
# Translation table that deletes every character in string.punctuation.
_punctuation_table = str.maketrans("", "", string.punctuation)
stemmer = PorterStemmer()


def preprocess(text):
    """Normalise one document for vectorisation.

    Steps, in order: lowercase the text, delete every ``string.punctuation``
    character (deleted, not replaced by a space, so "spider-man" becomes
    "spiderman"), split on whitespace, drop tokens found in the English
    stopword list, and Porter-stem what remains.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving stems joined by single spaces.
    """
    # NOTE(review): punctuation is stripped before the stopword filter, so any
    # stopword entry that itself contains an apostrophe can never match here —
    # confirm whether that is intended.
    cleaned = text.lower().translate(_punctuation_table)
    stems = [stemmer.stem(token) for token in cleaned.split() if token not in _stop_lookup]
    return " ".join(stems)

In [9]:
# Run every combined metadata string through preprocess(), element by element.
x_cleaned = x.map(preprocess)

In [10]:
# Fit a TF-IDF vocabulary on the cleaned documents and transform them in one step.
# NOTE(review): scikit-learn documents the default token_pattern as keeping only
# tokens of two or more word characters — if so, single-letter certificate codes
# (e.g. "U", "G") never become features; confirm whether that is intended.
vectorizer = TfidfVectorizer()
x_vector = vectorizer.fit_transform(x_cleaned)

Convert the resulting sparse matrix above to a dense array.

In [11]:
# Densify the TF-IDF matrix. The hasattr guard keeps this cell re-runnable:
# after the first run x_vector is a dense array with no .toarray() method, so
# an unguarded second run would raise AttributeError.
if hasattr(x_vector, "toarray"):
    x_vector = x_vector.toarray()

In [12]:
# Map each title to its row of x_vector. Rows are paired with titles by
# position; if a title occurs more than once, only its last row is kept.
movie_dictionary = dict(zip(df["Series_Title"], x_vector))


In [13]:
# Display the title -> vector mapping.
# NOTE(review): this renders every entry of the dictionary; consider showing a
# small sample instead to keep the notebook output short.
movie_dictionary

{'The Shawshank Redemption': array([0., 0., 0., ..., 0., 0., 0.]),
 'The Godfather': array([0., 0., 0., ..., 0., 0., 0.]),
 'The Dark Knight': array([0., 0., 0., ..., 0., 0., 0.]),
 'The Godfather: Part II': array([0., 0., 0., ..., 0., 0., 0.]),
 '12 Angry Men': array([0.3496874, 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]),
 'The Lord of the Rings: The Return of the King': array([0., 0., 0., ..., 0., 0., 0.]),
 'Pulp Fiction': array([0., 0., 0., ..., 0., 0., 0.]),
 "Schindler's List": array([0., 0., 0., ..., 0., 0., 0.]),
 'Inception': array([0., 0., 0., ..., 0., 0., 0.]),
 'Fight Club': array([0., 0., 0., ..., 0., 0., 0.]),
 'The Lord of the Rings: The Fellowship of the Ring': array([0., 0., 0., ..., 0., 0., 0.]),
 'Forrest Gump': array([0., 0., 0., ..., 0., 0., 0.]),
 'Il buono, il brutto, il cattivo': array([0., 0., 0., ..., 0., 0., 0.]),
 'The Lord of the Rings: The Two Towers': array([0., 0., 0., ..., 0., 0., 0.]),
 'The Matrix': array([0., 0., 0., ..., 0.

In [14]:
# Cosine similarity of two vectors a and b:
# cosine similarity = a.b / (|a| * |b|)
# a = 2i+2j+4k
# b = i+2j+k
# a.b = 2*1+2*2+4*1 = 10
# import math
# math.sqrt(2**2+2**2+4**2) --> |a|

In [15]:
def recommend_movies(movie_name, n=5):
    """Return the titles most similar to ``movie_name``.

    Similarity is the cosine similarity between the vector stored for
    ``movie_name`` in ``movie_dictionary`` and the vector of every other title.

    Parameters
    ----------
    movie_name : str
        Title to look up; must match a key of ``movie_dictionary`` exactly.
    n : int, default 5
        Upper bound on the number of titles returned (applied as ``[:n]``).

    Returns
    -------
    list of str or None
        Titles ordered from most to least similar, never including
        ``movie_name`` itself. ``None`` when ``movie_name`` is not a key of
        ``movie_dictionary``.
    """
    if movie_name not in movie_dictionary:
        return None

    candidates = [title for title in movie_dictionary if title != movie_name]
    if not candidates:
        # Nothing to compare against (the dictionary holds only the query).
        return []

    # Score all candidates with a single cosine_similarity call instead of one
    # call per movie; the result has shape (len(candidates), 1).
    scores = cosine_similarity(
        [movie_dictionary[title] for title in candidates],
        [movie_dictionary[movie_name]],
    ).ravel()

    # sorted() is stable, so equally scored titles keep dictionary order.
    ranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
    return [title for title, _ in ranked[:n]]

        

In [16]:
# Example query; n is left at its default of 5.
recommend_movies("Interstellar")

['Inception',
 'Dark Waters',
 'The Prestige',
 'Batman Begins',
 'The Dark Knight Rises']

In [17]:
# Serialize the title -> vector mapping to movie_dict.pkl in the working directory.
with open("movie_dict.pkl", "wb") as pickle_file:
    pickle.dump(movie_dictionary, pickle_file)