In [1]:
import pandas as pd # type: ignore

In [2]:
df = pd.read_csv("movies.csv")
df.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [3]:
df.shape

(10000, 9)

In [4]:
df.describe()

Unnamed: 0,id,popularity,vote_average,vote_count
count,10000.0,10000.0,10000.0,10000.0
mean,161243.505,34.697267,6.62115,1547.3094
std,211422.046043,211.684175,0.766231,2648.295789
min,5.0,0.6,4.6,200.0
25%,10127.75,9.15475,6.1,315.0
50%,30002.5,13.6375,6.6,583.5
75%,310133.5,25.65125,7.2,1460.0
max,934761.0,10436.917,8.7,31917.0


In [5]:
df.isnull().sum()

id                    0
title                 0
genre                 3
original_language     0
overview             13
popularity            0
release_date          0
vote_average          0
vote_count            0
dtype: int64

## Selecting the features

In [6]:
df.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [7]:
df1 = df[['id', 'title', 'genre', 'overview', 'popularity', 'vote_average']]
df1.head()

Unnamed: 0,id,title,genre,overview,popularity,vote_average
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...,94.075,8.7
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second...",25.408,8.7
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o...",90.585,8.7
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...,44.761,8.6
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...,57.749,8.6


In [8]:
df1['tags'] =  df1['overview'] + df1['genre']
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['tags'] =  df1['overview'] + df1['genre']


Unnamed: 0,id,title,genre,overview,popularity,vote_average,tags
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...,94.075,8.7,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second...",25.408,8.7,"Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o...",90.585,8.7,"Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...,44.761,8.6,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...,57.749,8.6,In the continuing saga of the Corleone crime f...


In [9]:
df1 = df1.drop(columns = ['overview', 'genre'])

In [10]:
df1.head()

Unnamed: 0,id,title,popularity,vote_average,tags
0,278,The Shawshank Redemption,94.075,8.7,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,25.408,8.7,"Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,90.585,8.7,"Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,44.761,8.6,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,57.749,8.6,In the continuing saga of the Corleone crime f...


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer(max_features = 10000, stop_words = 'english')

In [12]:
vector = v.fit_transform(df1['tags'].values.astype('U')).toarray()
vector.shape

(10000, 10000)

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)
similarity

array([[1.        , 0.05634362, 0.12888482, ..., 0.07559289, 0.11065667,
        0.06388766],
       [0.05634362, 1.        , 0.07624929, ..., 0.        , 0.03636965,
        0.        ],
       [0.12888482, 0.07624929, 1.        , ..., 0.02273314, 0.06655583,
        0.08645856],
       ...,
       [0.07559289, 0.        , 0.02273314, ..., 1.        , 0.03253   ,
        0.02817181],
       [0.11065667, 0.03636965, 0.06655583, ..., 0.03253   , 1.        ,
        0.0412393 ],
       [0.06388766, 0.        , 0.08645856, ..., 0.02817181, 0.0412393 ,
        1.        ]])

In [14]:
df1[df1['title'] == 'The Godfather']

Unnamed: 0,id,title,popularity,vote_average,tags
2,238,The Godfather,90.585,8.7,"Spanning the years 1945 to 1955, a chronicle o..."


In [15]:
distance = sorted(list(enumerate(similarity[2])), reverse=True, key = lambda vector: vector[1])
for i in distance[1:6]:
    print(df1.iloc[i[0]].title)


The Godfather: Part II
Blood Ties
Joker
Bomb City
Gotti


In [16]:
def recommend_movies(title):
    index = df1[df1['title'] == title].index[0]
    distance = sorted(list(enumerate(similarity[index])), reverse=True, key = lambda vector: vector[1])
    for i in distance[1:6]:
        print(df1.iloc[i[0]].title)


In [17]:
recommend_movies('Batman')

Batman Returns
The Batman vs. Dracula
Batman Beyond: Return of the Joker
The Dark Knight
Batman Begins


In [18]:
import pickle


In [19]:
pickle.dump(df1, open('movies_list.pkl', 'wb'))

In [20]:
pickle.dump(similarity, open('similarity', 'wb'))

In [21]:
pickle.load(open('movies_list.pkl', 'rb'))

Unnamed: 0,id,title,popularity,vote_average,tags
0,278,The Shawshank Redemption,94.075,8.7,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,25.408,8.7,"Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,90.585,8.7,"Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,44.761,8.6,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,57.749,8.6,In the continuing saga of the Corleone crime f...
...,...,...,...,...,...
9995,10196,The Last Airbender,98.322,4.7,"The story follows the adventures of Aang, a yo..."
9996,331446,Sharknado 3: Oh Hell No!,12.490,4.7,The sharks take bite out of the East Coast whe...
9997,13995,Captain America,18.333,4.6,"During World War II, a brave, patriotic Americ..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,15.159,4.7,A man named Farmer sets out to rescue his kidn...
