# Load Dataset

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import plotly.express as px
import re

In [3]:
# Read the IMDB top-1000 table and discard the poster URL column right away.
df = pd.read_csv('imdb_top_1000.csv').drop(columns='Poster_Link')
df.head(5)

Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [4]:
# Summarize each column's dtype and non-null count for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Series_Title   1000 non-null   object 
 1   Released_Year  1000 non-null   object 
 2   Certificate    899 non-null    object 
 3   Runtime        1000 non-null   object 
 4   Genre          1000 non-null   object 
 5   IMDB_Rating    1000 non-null   float64
 6   Overview       1000 non-null   object 
 7   Meta_score     843 non-null    float64
 8   Director       1000 non-null   object 
 9   Star1          1000 non-null   object 
 10  Star2          1000 non-null   object 
 11  Star3          1000 non-null   object 
 12  Star4          1000 non-null   object 
 13  No_of_Votes    1000 non-null   int64  
 14  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(12)
memory usage: 117.3+ KB


# Data Cleansing

In [5]:
# Replace each comma (,) in Genre with a space.
df['Genre'] = df['Genre'].apply(lambda x: str(x).replace(',', ' '))
# Strip the unit text from Runtime (e.g. "142 min" -> "142") and convert it to a number.
# Casting to str first keeps this cell re-runnable: after one run Runtime is already
# numeric, and a plain re.sub over the values would raise TypeError on non-strings.
df['Runtime'] = pd.to_numeric(
    df['Runtime'].astype(str).str.replace('[a-zA-Z ]', '', regex=True)
)


# Exploratory Data Analysis

In [6]:
# Bar chart of how many titles fall in each release year.
fig = (
    px.bar(df, x='Released_Year', title='Release Year')
    .update_xaxes(title_text='Release Year')
    .update_yaxes(title_text='Release Year Count')
)
fig

In [7]:
# Histogram of movie runtimes; the y-axis title is intentionally left blank.
fig = (
    px.histogram(df, x='Runtime', title='Runtime Distribution')
    .update_yaxes(title_text='')
    .update_xaxes(title_text='Runtime')
)
fig

In [8]:
# Histogram of IMDB ratings; the y-axis title is intentionally left blank.
fig = (
    px.histogram(df, x='IMDB_Rating', title='IMDB Rating Distribution')
    .update_yaxes(title_text='')
    .update_xaxes(title_text='Rating')
)
fig

# Text Analysis

In [9]:
# Show the column names currently present in df.
df.columns

Index(['Series_Title', 'Released_Year', 'Certificate', 'Runtime', 'Genre',
       'IMDB_Rating', 'Overview', 'Meta_score', 'Director', 'Star1', 'Star2',
       'Star3', 'Star4', 'No_of_Votes', 'Gross'],
      dtype='object')

In [10]:
# Join title, genres, director and the four billed stars into one space-separated
# string per movie.
df['text_data'] = df['Series_Title'].str.cat(
    df[['Genre', 'Director', 'Star1', 'Star2', 'Star3', 'Star4']],
    sep=' ',
)
df['text_data']

0      The Shawshank Redemption Drama Frank Darabont ...
1      The Godfather Crime  Drama Francis Ford Coppol...
2      The Dark Knight Action  Crime  Drama Christoph...
3      The Godfather: Part II Crime  Drama Francis Fo...
4      12 Angry Men Crime  Drama Sidney Lumet Henry F...
                             ...                        
995    Breakfast at Tiffany's Comedy  Drama  Romance ...
996    Giant Drama  Western George Stevens Elizabeth ...
997    From Here to Eternity Drama  Romance  War Fred...
998    Lifeboat Drama  War Alfred Hitchcock Tallulah ...
999    The 39 Steps Crime  Mystery  Thriller Alfred H...
Name: text_data, Length: 1000, dtype: object

In [11]:
# Turn every movie's text_data string into a TF-IDF weighted term vector.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['text_data'])

# Pairwise dot products between all rows; when the second argument is omitted,
# linear_kernel compares the matrix against itself.
similarity = linear_kernel(tfidf_matrix)

In [12]:
# Display the pairwise similarity matrix (NumPy abbreviates large arrays with "...").
similarity

array([[1.        , 0.01750723, 0.01668373, ..., 0.00330899, 0.05268652,
        0.01142746],
       [0.01750723, 1.        , 0.03398495, ..., 0.00359207, 0.00421345,
        0.02659268],
       [0.01668373, 0.03398495, 1.        , ..., 0.00342311, 0.00401526,
        0.02534182],
       ...,
       [0.00330899, 0.00359207, 0.00342311, ..., 1.        , 0.03279488,
        0.        ],
       [0.05268652, 0.00421345, 0.00401526, ..., 0.03279488, 1.        ,
        0.10961173],
       [0.01142746, 0.02659268, 0.02534182, ..., 0.        , 0.10961173,
        1.        ]])

In [17]:
def recommendation(movie_title, top=10, similarity=similarity):
    """Return the rows of ``df`` most similar to the movie named ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact value to look up in ``df['Series_Title']``. If several rows share
        the title, the first one is used as the query.
    top : int, default 10
        Maximum number of rows to return; values below 1 give an empty frame.
    similarity : 2-D array-like
        Pairwise score matrix whose row ``i`` is read for the ``i``-th row of
        ``df``. The default value is bound when this cell is executed, so
        re-run the cell after recomputing the matrix.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` ordered from most to least similar. The query row itself
        is never included.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested title.
    """
    # Look the title up by row *position* rather than index label: both the
    # similarity matrix and df.iloc below are positional, so this stays correct
    # even if df no longer has a default 0..n-1 index.
    movie_pos = next(
        (pos for pos, title in enumerate(df['Series_Title']) if title == movie_title),
        None,
    )
    if movie_pos is None:
        raise IndexError(f"No movie titled {movie_title!r} in df['Series_Title']")

    # Drop the query row explicitly instead of assuming it sorts first; a row
    # tying at the top score with a lower position would otherwise take its place.
    candidates = [
        (pos, score)
        for pos, score in enumerate(similarity[movie_pos])
        if pos != movie_pos
    ]
    # sorted() is stable, so equally scored movies keep their original row order.
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

    best_positions = [pos for pos, _ in ranked[:max(top, 0)]]
    return df.iloc[best_positions]

# Testing

In [18]:
# Smoke test: request the ten closest matches for one known title.
movie_title, top = 'The Dark Knight', 10
recommendation(movie_title=movie_title, top=top)

Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross,text_data
63,The Dark Knight Rises,2012,UA,164,Action Adventure,8.4,Eight years after the Joker's reign of anarchy...,78.0,Christopher Nolan,Christian Bale,Tom Hardy,Anne Hathaway,Gary Oldman,1516346,448139099.0,The Dark Knight Rises Action Adventure Christ...
36,The Prestige,2006,U,130,Drama Mystery Sci-Fi,8.5,"After a tragic accident, two stage magicians e...",66.0,Christopher Nolan,Christian Bale,Hugh Jackman,Scarlett Johansson,Michael Caine,1190259,53089891.0,The Prestige Drama Mystery Sci-Fi Christophe...
155,Batman Begins,2005,UA,140,Action Adventure,8.2,"After training with his mentor, Batman begins ...",70.0,Christopher Nolan,Christian Bale,Michael Caine,Ken Watanabe,Liam Neeson,1308302,206852432.0,Batman Begins Action Adventure Christopher No...
773,Brokeback Mountain,2005,A,134,Drama Romance,7.7,The story of a forbidden and secretive relatio...,87.0,Ang Lee,Jake Gyllenhaal,Heath Ledger,Michelle Williams,Randy Quaid,323103,83043761.0,Brokeback Mountain Drama Romance Ang Lee Jake...
774,3:10 to Yuma,2007,A,122,Action Crime Drama,7.7,A small-time rancher agrees to hold a captured...,76.0,James Mangold,Russell Crowe,Christian Bale,Ben Foster,Logan Lerman,288797,53606916.0,3:10 to Yuma Action Crime Drama James Mangol...
832,Empire of the Sun,1987,U,153,Action Drama History,7.7,A young English boy struggles to survive under...,62.0,Steven Spielberg,Christian Bale,John Malkovich,Miranda Richardson,Nigel Havers,115677,22238696.0,Empire of the Sun Action Drama History Steve...
614,The Fighter,2010,UA,116,Biography Drama Sport,7.8,"Based on the story of Micky Ward, a fledgling ...",79.0,David O. Russell,Mark Wahlberg,Christian Bale,Amy Adams,Melissa Leo,340584,93617009.0,The Fighter Biography Drama Sport David O. R...
217,Ford v Ferrari,2019,UA,152,Action Biography Drama,8.1,American car designer Carroll Shelby and drive...,81.0,James Mangold,Matt Damon,Christian Bale,Jon Bernthal,Caitriona Balfe,291289,117624028.0,Ford v Ferrari Action Biography Drama James ...
692,The Man Who Would Be King,1975,PG,129,Adventure History War,7.8,Two British former soldiers decide to set them...,91.0,John Huston,Sean Connery,Michael Caine,Christopher Plummer,Saeed Jaffrey,44917,,The Man Who Would Be King Adventure History ...
600,The Big Short,2015,A,130,Biography Comedy Drama,7.8,In 2006-2007 a group of investors bet against ...,81.0,Adam McKay,Christian Bale,Steve Carell,Ryan Gosling,Brad Pitt,362942,70259870.0,The Big Short Biography Comedy Drama Adam Mc...
