# movie recommendation system

Aim to create:
1. content based recommendation system

In [1]:
# import necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 500)

In [2]:
# import Dataset
moviesList = pd.read_csv('movies_list')
moviesDetail = pd.read_csv('movies_detail')

### preprocessing 

##### preprocessing moviesList dataset

In [3]:
moviesList.sample(50)

Unnamed: 0.1,Unnamed: 0,title,rating,url
1138,1138,The Ghost and the Tout Too,6.2,https://soap2dayhd.ru/movie/the-ghost-and-the-...
3168,3168,Zack Snyder's Justice League,8.0,https://soap2dayhd.ru/movie/zack-snyders-justi...
29107,29107,Marathon Man,7.4,https://soap2dayhd.ru/movie/marathon-man-nro7k
24278,24278,Swordfish,6.5,https://soap2dayhd.ru/movie/swordfish-ov9z
7648,7648,Art of Deception,3.3,https://soap2dayhd.ru/movie/art-of-deception-k...
14118,14118,600 Miles,5.4,https://soap2dayhd.ru/movie/600-miles-nqkm
24481,24481,Sunset Strip,5.6,https://soap2dayhd.ru/movie/sunset-strip-npzol
25180,25180,Hilary and Jackie,7.3,https://soap2dayhd.ru/movie/hilary-and-jackie-...
24399,24399,Murder on a Sunday Morning,8.0,https://soap2dayhd.ru/movie/murder-on-a-sunday...
24448,24448,Intimacy,6.1,https://soap2dayhd.ru/movie/intimacy-z0yw


In [4]:
# drop unnamed:0 column
moviesList.drop('Unnamed: 0',axis=1, inplace=True)

In [5]:
moviesList.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31520 entries, 0 to 31519
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   31520 non-null  object
 1   rating  31520 non-null  object
 2   url     31520 non-null  object
dtypes: object(3)
memory usage: 738.9+ KB


In [6]:
# lets convert ' ?' to np.nan values
moviesList.rating = moviesList.rating.replace(' ?', np.nan)

In [7]:
# Convert string into float
moviesList.rating[2]

' 5.70'

In [8]:
# lets convert rating column from string to float
moviesList['rating'] = moviesList['rating'].astype(float)

In [9]:
# check num of null values
moviesList.isnull().sum()

title       0
rating    289
url         0
dtype: int64

In [10]:
moviesList.describe()

Unnamed: 0,rating
count,31231.0
mean,5.853857
std,1.32981
min,1.0
25%,5.0
50%,6.1
75%,6.8
max,9.5


In [11]:
# fill null values with mean values
moviesList["rating"] = moviesList["rating"].fillna(value=moviesList["rating"].mean())

In [12]:
moviesList.shape

(31520, 3)

In [13]:
# check duplicate values
moviesList.duplicated().sum()

32

In [14]:
moviesList.duplicated().sum()

32

In [15]:
#Let's drop duplicated values
print('shape of dataset before removing duplicate values:',moviesList.shape)
print('number of duplicate rows:',moviesList.duplicated().sum())
moviesList = moviesList.drop_duplicates()
print('shape of dataset after removing duplicate values:',moviesList.shape)

shape of dataset before removing duplicate values: (31520, 3)
number of duplicate rows: 32
shape of dataset after removing duplicate values: (31488, 3)


In [16]:
# check num of null values
moviesList.isnull().sum()

title     0
rating    0
url       0
dtype: int64

In [17]:
moviesList.head()

Unnamed: 0,title,rating,url
0,The Hoot Owl,5.853857,https://soap2dayhd.ru/movie/the-hoot-owl-5wmm0
1,Game Changer,6.3,https://soap2dayhd.ru/movie/game-changer-60n49
2,White Hot: The Rise & Fall of Abercrombie & Fitch,5.7,https://soap2dayhd.ru/movie/white-hot-the-rise...
3,Alienoid,6.4,https://soap2dayhd.ru/movie/alienoid-ro00q
4,The Anthrax Attacks,6.0,https://soap2dayhd.ru/movie/the-anthrax-attack...


In [18]:
moviesList = moviesList.round(2)

In [19]:
moviesList

Unnamed: 0,title,rating,url
0,The Hoot Owl,5.85,https://soap2dayhd.ru/movie/the-hoot-owl-5wmm0
1,Game Changer,6.30,https://soap2dayhd.ru/movie/game-changer-60n49
2,White Hot: The Rise & Fall of Abercrombie & Fitch,5.70,https://soap2dayhd.ru/movie/white-hot-the-rise...
3,Alienoid,6.40,https://soap2dayhd.ru/movie/alienoid-ro00q
4,The Anthrax Attacks,6.00,https://soap2dayhd.ru/movie/the-anthrax-attack...
...,...,...,...
31515,The Public Enemy,7.60,https://soap2dayhd.ru/movie/the-public-enemy-5...
31516,Inspiration,6.30,https://soap2dayhd.ru/movie/inspiration-9opvx
31517,The Divorcee,6.70,https://soap2dayhd.ru/movie/the-divorcee-jpo23
31518,The Big House,7.10,https://soap2dayhd.ru/movie/the-big-house-908jm


##### moviesDetail

In [20]:
moviesDetail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31520 entries, 0 to 31519
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    31520 non-null  int64 
 1   title         31504 non-null  object
 2   release_year  31504 non-null  object
 3   description   31504 non-null  object
 4   director      30802 non-null  object
 5   production    31504 non-null  object
 6   cast          30959 non-null  object
 7   genre         31504 non-null  object
 8   URL           31520 non-null  object
dtypes: int64(1), object(8)
memory usage: 2.2+ MB


In [21]:
# delete Unamed: 0 column
moviesDetail.drop('Unnamed: 0', axis=1, inplace=True)

In [22]:
# convert release_year into datetime format
moviesDetail["release_year"] = pd.to_datetime(moviesDetail["release_year"])

In [23]:
# we require only year 
moviesDetail["release_year"] = moviesDetail["release_year"].dt.year

In [24]:
moviesDetail.sample(50)

Unnamed: 0,title,release_year,description,director,production,cast,genre,URL
14093,Black Tar Road,2016.0,"BLACK TAR ROAD is a gritty, dark, love story ...","Amber Dawn Lee,Rob Brownstein,4 more credits",Abovo Films,"Maria Olsen, James Black, Leif Gantvoort, Dari...","Drama, Thriller, Romance, Crime, Biography",https://soap2dayhd.ru/movie/black-tar-road-m2qzz
8734,RSC Live: Romeo and Juliet,2018.0,What if your first true love was someone you'...,"Erica Whyman,Bridget Caldwell",Royal Shakespeare Company,"Raphael Sowole, Andrew French, Afolabi Alli, M...",Drama,https://soap2dayhd.ru/movie/rsc-live-romeo-and...
7044,Hollywould,2019.0,"Award winning writer, celebrity Mark Travis s...",Joshua Coates,"2020 Visions Entertainment Group, Casal Media...","Eric Roberts, Gabrielle Miller, Torrei Hart, A...","Thriller, Crime, Mystery",https://soap2dayhd.ru/movie/hollywould-xvo53
12319,UFC 219: Cyborg vs. Holm,2017.0,It looks like we don't have any Plot Summarie...,,"Demetrious Johnson, Dana White, Joe Rogan, Br...","Demetrious Johnson, Dana White, Joe Rogan, Bru...",Sport,https://soap2dayhd.ru/movie/ufc-219-cyborg-vs-...
8787,The Zen Diaries of Garry Shandling,2018.0,"Garry Shandling passed away in 2016, leaving ...",Judd Apatow,"Jeffrey Tambor, Sacha Baron Cohen, Jim Carrey...","Jeffrey Tambor, Sacha Baron Cohen, Jim Carrey,...","Drama, Comedy, Documentary, Biography",https://soap2dayhd.ru/movie/the-zen-diaries-of...
21448,Joy Ride 2: Dead Ahead,2008.0,While driving to Las Vegas for the bachelor p...,Louis Morneau,"Twentieth Century Fox, 20th Century Fox Home ...","Nicki Aycox, Nick Zano, Laura Jordan, Kyle Sch...","Thriller, Romance, Crime, Horror",https://soap2dayhd.ru/movie/joy-ride-2-dead-ah...
2529,The Exchange,2021.0,A socially awkward but highly enterprising te...,Dan Mazer,"Los Angeles Media Fund LAMF, Whos On First Fi...","Ed Oxenbould, Justin Hartley, Avan Jogia","Drama, Comedy",https://soap2dayhd.ru/movie/the-exchange-rolwq
398,Harmony,2022.0,"In the very near future Sophia, a wounded sol...",Zachary Gross,Hand Me Down Films,"Elliott Bales, Marili Kateri, Mark Frazier","Sci-Fi, Action, Horror",https://soap2dayhd.ru/movie/harmony-60y60
414,Crimes of the Future,2022.0,"It sounds just as ambitious, taking a deep di...",David Cronenberg,"Canadian Broadcasting Corporation CBC, Bell M...","Viggo Mortensen, Léa Seydoux, Kristen Stewart","Drama, Sci-Fi, Horror",https://soap2dayhd.ru/movie/crimes-of-the-futu...
5568,The Longest War,2020.0,The Emmy® and Peabody winning creators and ex...,Greg Barker,"Lisa Maddox, Breshna Musaza","Lisa Maddox, Breshna Musaza",Documentary,https://soap2dayhd.ru/movie/the-longest-war-k3174


In [25]:
# lets check null values
moviesDetail.isnull().sum()

title            16
release_year     16
description      16
director        718
production       16
cast            561
genre            16
URL               0
dtype: int64

In [26]:
# lets rename url columns for merging two dataframe
moviesDetail.rename(columns = {'URL':'url'}, inplace = True)

In [27]:
#Let's drop duplicated values
print('shape of dataset before removing duplicate values:',moviesDetail.shape)
print('number of duplicate rows:',moviesDetail.duplicated().sum())
moviesDetail = moviesDetail.drop_duplicates()
print('shape of dataset after removing duplicate values:',moviesDetail.shape)

shape of dataset before removing duplicate values: (31520, 8)
number of duplicate rows: 32
shape of dataset after removing duplicate values: (31488, 8)


In [28]:
print(moviesList.shape)
print(moviesDetail.shape)

(31488, 3)
(31488, 8)


In [29]:
# let's merge moviesList and moviesDetail
movies = pd.merge(moviesList, moviesDetail, on='url')

In [30]:
movies.shape

(31488, 10)

In [31]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31488 entries, 0 to 31487
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title_x       31488 non-null  object 
 1   rating        31488 non-null  float64
 2   url           31488 non-null  object 
 3   title_y       31472 non-null  object 
 4   release_year  31472 non-null  float64
 5   description   31472 non-null  object 
 6   director      30771 non-null  object 
 7   production    31472 non-null  object 
 8   cast          30927 non-null  object 
 9   genre         31472 non-null  object 
dtypes: float64(2), object(8)
memory usage: 2.6+ MB


In [32]:
# Lets drop title_y and url duplicate columns
movies.drop(columns={'title_y','url'}, axis=1, inplace = True)

In [33]:
# lets  rename column title_x to title
movies.rename(columns={'title_x':'title'}, inplace=True)

In [34]:
# lets make sample data for computing
print('metaData:',movies.shape)
movies = movies[movies.iloc[:,2] >2015]
print('Sample Data:',movies.shape)

metaData: (31488, 8)
Sample Data: (15661, 8)


In [36]:
movies.sample(100)

Unnamed: 0,title,rating,release_year,description,director,production,cast,genre
5019,Unknown Origins,6.1,2020.0,"Heroes do not exist. David, a young policeman...",David Galán Galindo,"Nadie es Perfecto, In Post We Trust, La Chica...","Leonardo Sbaraglia, Carlos Areces, Álex García...","Drama, Thriller, Comedy, Adventure, Action, C..."
3914,The Dry,6.8,2020.0,Before turning his high-powered shotgun on hi...,Robert Connolly,"Made Up Stories, Cornerstone Films, Arenamedia","Bruce Spence, Eric Bana, Genevieve OReilly, Mi...","Drama, Thriller, Crime, Mystery"
7503,Hoaxed,7.2,2019.0,HOAXED is an insider's look at the Fake News ...,"Scooter Downey,Jon du Toit","Donald Trump, Alex Jones, Hillary Clinton, Ga...","Donald Trump, Alex Jones, Hillary Clinton, Gav...",Documentary
2059,The Ghosts of Borley Rectory,4.1,2021.0,"1937, Essex, England. Given an opportunity to...",Steven M. Smith,"Greenway Entertainment, GlobalWatch Films, Co...","Julian Sands, Toby Wynn Davies, Colin Baker","Drama, Mystery, Horror"
12293,Phantom Thread,7.4,2017.0,"In 1950s London, confirmed bachelor Reynolds ...",Paul Thomas Anderson,"Focus Features, Perfect World Pictures, Annap...","Brian Gleeson, Daniel Day Lewis, Lesley Manvil...","Drama, Romance, Crime"
...,...,...,...,...,...,...,...,...
873,Blue Moon Ball,5.8,2022.0,"Grace Montague, a writer, returns to her home...",Tara Cowell-Plain,"Ashley Newbrough, Eric Lutz, William Knight","Ashley Newbrough, Eric Lutz, William Knight",Romance
410,Terror Trips,3.5,2022.0,"Six friends start a business, providing guide...",Jeff Seemann,"Hannah Fierman, Damian Maffei, Chaney Morrow","Hannah Fierman, Damian Maffei, Chaney Morrow",Horror
13869,Arbor Demon,4.1,2016.0,An adventurous woman with a secret from her h...,Patrick Rea,"Black Bear Studios, Producer Capital Fund","Jake Busey, Fiona Dourif, Kevin Ryan, Michelle...","Drama, Thriller, Horror"
13573,The Death of Louis XIV,6.7,2016.0,"Upon returning from a hunting expedition, Kin...",Albert Serra,"Capricci Films, Rosa Filmes, Andergraun Films","Jean Pierre Léaud, Patrick dAssumçao, Olivier ...","Drama, History, Biography"


In [37]:
#Lets remove gap between names 
movies['director'] = movies['director'].str.replace(" ","")
movies['production'] = movies['production'].str.replace(" ","")
movies['cast'] = movies['cast'].str.replace(" ","")
movies['genre'] = movies['genre'].str.replace(" ","")

In [38]:
movies[movies.production==movies.cast].shape[0]

2440

In [39]:
# Let's drop rows which has same values for production and cast
movies = movies.loc[movies.production!=movies.cast]
movies.shape

(13221, 8)

In [40]:
# lets convert string into list
movies['director'] = movies['director'].str.split(',').tolist()
movies['production'] = movies['production'].str.split(',').tolist()
movies['cast'] = movies['cast'].str.split(',').tolist()
movies['genre'] = movies['genre'].str.split(',').tolist()

In [41]:
# lets check for null values
movies.isnull().sum()

title             0
rating            0
release_year      0
description       0
director        311
production        0
cast            468
genre             0
dtype: int64

In [42]:
# lets drop null values
print(movies.shape)
movies.dropna(inplace=True)
movies.shape

(13221, 8)


(12621, 8)

In [43]:
movies['tags'] = movies['director'] + movies['production'] + movies['cast'] + movies['genre']

In [44]:
# Lets keep required columns
# movies.drop(columns={'rating', 'release_year', 'director', 'production', 'cast', 'genre'}, axis=1, inplace=True)

In [45]:
movies.tags = [','.join(map(str, l)) for l in movies['tags']]
movies.head()

Unnamed: 0,title,rating,release_year,description,director,production,cast,genre,tags
0,The Hoot Owl,5.85,2022.0,A group of friends fight for survival when th...,"[JasonRader, JasonVonGodi]",[VanishingTwinPictures],"[CarlBailey, AugustineFrizzell, JDBrown]",[Horror],"JasonRader,JasonVonGodi,VanishingTwinPictures,..."
2,White Hot: The Rise & Fall of Abercrombie & Fitch,5.7,2022.0,"In the late '90s and early '00s, Abercrombie ...",[AlisonKlayman],[SecondNature],"[Keefe, BenjaminOapos, BobbyBlanski, RyanDaharsh]",[Documentary],"AlisonKlayman,SecondNature,Keefe,BenjaminOapos..."
3,Alienoid,6.4,2022.0,The door of time opens between the swordsman ...,[Dong-hoonChoi],[CaperFilm],"[LeeHanee, YooJaeMyung, KimEuisung]","[Adventure, Action, Fantasy]","Dong-hoonChoi,CaperFilm,LeeHanee,YooJaeMyung,K..."
4,The Anthrax Attacks,6.0,2022.0,The 2001 anthrax attacks on the United States...,[DanKrauss],"[NetflixStudios, BBCStudios]","[ClarkGregg, BrigitteKaliCanales, ReganBurns]","[Crime, Documentary]","DanKrauss,NetflixStudios,BBCStudios,ClarkGregg..."
5,Out of Office,5.0,2022.0,A young woman discovers that keeping her job ...,[PaulLieberstein],"[CBSTelevisionStudios, CBSStudios, BigBreakfast]","[KenJeong, MilanaVayntrub, JayPharoah]",[Comedy],"PaulLieberstein,CBSTelevisionStudios,CBSStudio..."


In [46]:
# lets seperate words
movies['tags'] = movies['tags'].str.replace(','," ")

In [47]:
movies['description'][900]

' Four teens are swept into the adventure of a lifetime involving a legendary creature and a mystery from the past that will change their lives forever. '

In [49]:
# lets add description
movies['description'] = movies.description.str.replace('[#,@,&,.,'',\,",:,-,]', '')

  movies['description'] = movies.description.str.replace('[#,@,&,.,'',\,",:,-,]', '')


In [50]:
movies['tags'] = movies['tags'] + movies['description']

In [51]:
# Convert the value of 'tags' column to lowercase
movies['tags'] = movies['tags'].str.lower()

In [52]:
# Let's reset index
movies.reset_index(inplace = True, drop = True)

In [53]:
import nltk

In [54]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [55]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [56]:
movies['tags'] = movies.tags.apply(stem)

In [57]:
movies['tags'][1]

"alisonklayman secondnatur keef benjaminoapo bobbyblanski ryandaharsh documentari in the late '90 and earli '00 abercrombi and fitch wa the first stop for mani shopper on their trip to the mall shirtless jock stood guard at store entranc sell a potent mix of sex and wholesom puls danc beat and the brand' fierc scent drew in hord of young peopl hope to buy themselv a seat at the cool kids' tabl led by outspoken ceo mike jeffri af cash in on an all-american imag and enshrin it cloth as must-hav for the new millennium but over time revel of exclusionari market and discriminatori hire practic began to engulf the white hot brand in scandal featur interview with dozen of former employe execut and model white hot the rise and fall of abercrombi and fitch unravel the complex histori of the icon brand that influenc an entir gener"

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=15000,stop_words='english')

In [59]:
vector =  cv.fit_transform(movies.tags).toarray()

In [60]:
vector.shape

(12621, 15000)

In [61]:
np.set_printoptions(threshold=np.inf)

cv.get_feature_names()



['10',
 '100',
 '10000',
 '100000',
 '10th',
 '11',
 '11th',
 '12',
 '127',
 '12th',
 '12western',
 '13',
 '13film',
 '13th',
 '14',
 '15',
 '16',
 '17',
 '17th',
 '18',
 '1800',
 '1895film',
 '18th',
 '19',
 '1919',
 '1920',
 '1926',
 '1930',
 '1936',
 '1939',
 '1940',
 '1941',
 '1942',
 '1944',
 '1945',
 '1950',
 '1951',
 '1953',
 '1955',
 '1957',
 '1958',
 '1959',
 '1960',
 '1961',
 '1962',
 '1963',
 '1964',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1971',
 '1972',
 '1973',
 '1974',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '19th',
 '1morecredit',
 '1st',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2021',
 '2022',
 '2050',
 '20th',
 '20th

In [62]:
from sklearn.metrics.pairwise import cosine_similarity

In [63]:
similarity = cosine_similarity(vector)

In [64]:
def recommend(movie):
    index = movies[movies['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:11]:
        print(movies.iloc[i[0]].title)

In [71]:
recommend('Thor: Love and Thunder')

Thor: Ragnarok
Valhalla - The Legend of Thor
Mystery of the Kingdom of God
Thor: God of Thunder
Ancestral World
Chickenhare and the Hamster of Darkness
Walking with Herb
Dino King 3D: Journey to Fire Mountain
Lupin III: The First
The Monkey King


In [66]:
# 'Thor: Love and Thunder'
# 'Avengers: Endgame'
# 'Avengers: Infinity War'
# 'Spider-Man™: Far From Home'

In [67]:
import pickle
pickle.dump(similarity, open('similarity.pkl', 'wb'))
pickle.dump(movies, open('movies.pkl', 'wb'))