In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
! unzip "/content/archive.zip"

Archive:  /content/archive.zip
  inflating: top10K-TMDB-movies.csv  


In [3]:
df = pd.read_csv("/content/top10K-TMDB-movies.csv")

In [4]:
df.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.2+ KB


In [7]:
df.shape

(10000, 9)

In [8]:
df.describe()

Unnamed: 0,id,popularity,vote_average,vote_count
count,10000.0,10000.0,10000.0,10000.0
mean,161243.505,34.697267,6.62115,1547.3094
std,211422.046043,211.684175,0.766231,2648.295789
min,5.0,0.6,4.6,200.0
25%,10127.75,9.15475,6.1,315.0
50%,30002.5,13.6375,6.6,583.5
75%,310133.5,25.65125,7.2,1460.0
max,934761.0,10436.917,8.7,31917.0


In [9]:
df.isnull().sum()

id                    0
title                 0
genre                 3
original_language     0
overview             13
popularity            0
release_date          0
vote_average          0
vote_count            0
dtype: int64

In [11]:
df.duplicated().sum()

0

In [12]:
df.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [13]:
df = df[['id','title','overview','genre']]

In [15]:
df.head()

Unnamed: 0,id,title,overview,genre
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"Drama,Crime"
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...","Comedy,Drama,Romance"
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama,Crime"
3,424,Schindler's List,The true story of how businessman Oskar Schind...,"Drama,History,War"
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...,"Drama,Crime"


In [16]:
df['tags'] = df['overview'] + df['genre']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['overview'] + df['genre']


In [20]:
df.head()

Unnamed: 0,id,title,overview,genre,tags
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"Drama,Crime",Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...","Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama,Crime","Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...,"Drama,History,War",The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...,"Drama,Crime",In the continuing saga of the Corleone crime f...


In [21]:
df = df.drop(columns=['overview','genre'])

In [23]:
df.isnull().sum()

id        0
title     0
tags     15
dtype: int64

In [26]:
df = df.dropna()

In [27]:
df.isnull().sum()

id       0
title    0
tags     0
dtype: int64

In [58]:
df.shape

(9985, 3)

In [30]:
cv = CountVectorizer(max_features=10_000, stop_words='english')
cv

In [31]:
vector = cv.fit_transform(df['tags'].values.astype('U')).toarray()

In [32]:
vector.shape

(9985, 10000)

In [33]:
similarity = cosine_similarity(vector)

In [34]:
similarity

array([[1.        , 0.05634362, 0.13041013, ..., 0.07559289, 0.11065667,
        0.06900656],
       [0.05634362, 1.        , 0.07715167, ..., 0.        , 0.03636965,
        0.        ],
       [0.13041013, 0.07715167, 1.        , ..., 0.02300219, 0.0673435 ,
        0.09449112],
       ...,
       [0.07559289, 0.        , 0.02300219, ..., 1.        , 0.03253   ,
        0.03042903],
       [0.11065667, 0.03636965, 0.0673435 , ..., 0.03253   , 1.        ,
        0.04454354],
       [0.06900656, 0.        , 0.09449112, ..., 0.03042903, 0.04454354,
        1.        ]])

In [49]:
def recommand(movies):
    index=df[df['title']==movies].index[0]
    distance = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda vector:vector[1])
    for i in distance[0:5]:
        print(df.iloc[i[0]].title)

In [47]:
recommand("Superman")

Superman
Superman: Man of Tomorrow
All Star Superman
Superman II
X-Men: Apocalypse


In [48]:
import pickle

In [59]:
pickle.dump(df, open('data.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))
s = pickle.load(open('data.pkl', 'rb'))

In [54]:
model = pickle.load(open('similarity.pkl', 'rb'), errors='ignore')

In [62]:
def rec(movies):
    index=s[s['title']==movies].index[0]
    distance = sorted(list(enumerate(model[index])), reverse=True, key=lambda vector:vector[1])
    for i in distance[0:5]:
        print(s.iloc[i[0]].title)

In [69]:
rec("The New Adventures of Aladdin")

Hellraiser: Deader
The Invisible Witness
No One Gets Out Alive
Black Christmas
Devil


In [75]:
# Agar indexda xatolik chiqsa
s = s.reset_index(drop=True)

In [76]:
rec('Captain America')

Captain America
Captain America: The First Avenger
Team Thor
Captain America: The Winter Soldier
Letters from Iwo Jima
