In [165]:
import numpy as np
import pandas as pd

In [166]:
# dataset path
path = "Movie_Dataset.csv"

# reading the dataset using pandas and printing out first 5 rows
df = pd.read_csv(path)
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [167]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB


In [168]:
# dropping columns we dont need for recommendation system
df.drop(['Poster_Link', 'Released_Year', 'Gross', 'Runtime', 'Certificate', 'No_of_Votes'], axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Series_Title  1000 non-null   object 
 1   Genre         1000 non-null   object 
 2   IMDB_Rating   1000 non-null   float64
 3   Overview      1000 non-null   object 
 4   Meta_score    843 non-null    float64
 5   Director      1000 non-null   object 
 6   Star1         1000 non-null   object 
 7   Star2         1000 non-null   object 
 8   Star3         1000 non-null   object 
 9   Star4         1000 non-null   object 
dtypes: float64(2), object(8)
memory usage: 78.3+ KB


In [169]:
# handling missing 'Meta_Score' values by filling them with median
meta_median = df['Meta_score'].median()
df['Meta_score'] = df['Meta_score'].fillna(meta_median)
df.isnull().sum()

Series_Title    0
Genre           0
IMDB_Rating     0
Overview        0
Meta_score      0
Director        0
Star1           0
Star2           0
Star3           0
Star4           0
dtype: int64

In [170]:
# combine all stars into one column seprated by comma
# dropping nan values and converting to string for later use
df['Stars'] = df[['Star1', 'Star2', 'Star3', 'Star4']].apply(
    lambda x: ', '.join(x.dropna().astype(str)), axis=1
)

In [171]:
# each row is a movie
# each column is a possible feature, director or actors
# creates a matrix where 1 means it has that feature and 0 means it doesn't
# pd.get_dummies creates a one hot encoded rows for each feature
# concat is used to combine all features side by side (axis=1)
features = pd.concat([
    # seprate different genres into multiple columns witih explode()
    # and combines back to one row per movie with groupby()
    pd.get_dummies(df['Genre'].explode()).groupby(level=0).max(),
    pd.get_dummies(df['Director'], prefix='Director'),
    pd.get_dummies(df[['Star1', 'Star2', 'Star3', 'Star4']].stack()).groupby(level=0).max()
], axis=1)

In [172]:
def find_similar(movie_title=None, 
                 genre=None,
                 director=None, 
                 star=None, 
                 min_rating=None, 
                 min_metascore=None, 
                 n=5):
    
    # Search by movie title
    # how it works? it movie title is included it cleans the input compare that input to out dataframe
    # in column 'Series_Title' finds the exact match and stores it in movie match
    # now it gets the index of that movie (row number) and fetches its features
    if movie_title:
        search_title = movie_title.strip().lower()
        movie_match = df[df['Series_Title'].str.strip().str.lower() == search_title]
        movie_index = movie_match.index[0]
        comparison_vec = features.iloc[movie_index]


    # if no movie title is included:
    # instead of searching for an existing movie in out dataframe it builds comparison vec based on user input
    else:
        # creates a pandas series with all 0s
        comparison_vec = pd.Series(0, index=features.columns)
        
        # finds all columns in features that contain the input genre ans set them to 1
        if genre:
            matching_genres = [col for col in features.columns if genre in col]
            comparison_vec[matching_genres] = 1
        
        # same thing with genre
        if director:
            director_col = f"Director_{director}"
            if director_col in features.columns:
                comparison_vec[director_col] = 1
        
        # same thing with genre
        if star:
            matching_actors = [col for col in features.columns if star in col]
            comparison_vec[matching_actors] = 1

    # this measures how aligned two vectors are
    dot_product = features.dot(comparison_vec)
    norm_features = np.linalg.norm(features)
    norm_comparison = np.linalg.norm(comparison_vec)
    similarity_score = dot_product / (norm_features * norm_comparison)


    # add a new column to out dataframe for similarity score
    df['Match Score'] = similarity_score
    
    # create a mask of all movies with all included as 'True'
    # if rating and meta score are provided it filters out those lower than the threshold
    quality_mask = pd.Series(True, index=df.index)
    if min_rating:
        quality_mask &= df['IMDB_Rating'] >= min_rating
    if min_metascore:
        quality_mask &= df['Meta_score'] >= min_metascore
    
    # keep rows that pass the filters in our originla dataframe
    filtered_results = df[quality_mask]
    
    # exclude the searched movie 
    # compare each Series_Title to search_title and drop the exact matches
    if movie_title:
        filtered_results = filtered_results[
            filtered_results['Series_Title'].str.strip().str.lower() != search_title
        ]
    
    # Return the best matches
    return (
        filtered_results
        .sort_values('Match Score', ascending=False)    # sort movies by similarity score (highest first)
        .head(n)    # show top 5 matches (can be changed)

        # keep these columns to display
        [['Series_Title', 'Genre', 'Director', 'Stars', 'IMDB_Rating', 'Meta_score', 'Match Score']]

        # reset index for cleaner look
        .reset_index(drop=True)
    )

In [173]:
# Example for when a movie title is provided:
find_similar(movie_title="Il buono, il brutto, il cattivo", min_rating=8.0)

Unnamed: 0,Series_Title,Genre,Director,Stars,IMDB_Rating,Meta_score,Match Score
0,Million Dollar Baby,"Drama, Sport",Clint Eastwood,"Hilary Swank, Clint Eastwood, Morgan Freeman, ...",8.1,86.0,0.005272
1,Gran Torino,Drama,Clint Eastwood,"Clint Eastwood, Bee Vang, Christopher Carley, ...",8.1,72.0,0.005272
2,Once Upon a Time in the West,Western,Sergio Leone,"Henry Fonda, Charles Bronson, Claudia Cardinal...",8.5,80.0,0.005272
3,Per un pugno di dollari,"Action, Drama, Western",Sergio Leone,"Clint Eastwood, Gian Maria Volontè, Marianne K...",8.0,65.0,0.005272
4,Per qualche dollaro in più,Western,Sergio Leone,"Clint Eastwood, Lee Van Cleef, Gian Maria Volo...",8.3,74.0,0.005272


In [174]:
# Example for a movie title is not provided:
find_similar(genre="Action", director="Sergio Leone")

Unnamed: 0,Series_Title,Genre,Director,Stars,IMDB_Rating,Meta_score,Match Score
0,Per un pugno di dollari,"Action, Drama, Western",Sergio Leone,"Clint Eastwood, Gian Maria Volontè, Marianne K...",8.0,65.0,0.003939
1,Blade Runner,"Action, Sci-Fi, Thriller",Ridley Scott,"Harrison Ford, Rutger Hauer, Sean Young, Edwar...",8.1,84.0,0.001969
2,Enter the Dragon,"Action, Crime, Drama",Robert Clouse,"Bruce Lee, John Saxon, Jim Kelly, Ahna Capri",7.7,83.0,0.001969
3,Sholay,"Action, Adventure, Comedy",Ramesh Sippy,"Sanjeev Kumar, Dharmendra, Amitabh Bachchan, A...",8.2,79.0,0.001969
4,The Magnificent Seven,"Action, Adventure, Western",John Sturges,"Yul Brynner, Steve McQueen, Charles Bronson, E...",7.7,74.0,0.001969


Resources:  
https://datastax.medium.com/how-to-implement-cosine-similarity-in-python-505e8ec1d823   

