In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the CSV file imdb_top_1000.csv into a DataFrame and preview its first rows.
csv_path = "/content/imdb_top_1000.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [4]:
# Keep only the title plus the text columns that are combined into the
# recommender's feature text.
# `.copy()` gives `movies` its own data: later cells assign new values into
# its columns, and doing that on a bare selection of `df` is what triggers
# pandas' SettingWithCopyWarning.
movies = df[[
    'Series_Title',
    'Overview',
    'Genre',
    'Director',
    'Star1',
    'Star2',
    'Star3',
    'Star4'
]].copy()
movies.head()

Unnamed: 0,Series_Title,Overview,Genre,Director,Star1,Star2,Star3,Star4
0,The Shawshank Redemption,Two imprisoned men bond over a number of years...,Drama,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler
1,The Godfather,An organized crime dynasty's aging patriarch t...,"Crime, Drama",Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton
2,The Dark Knight,When the menace known as the Joker wreaks havo...,"Action, Crime, Drama",Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine
3,The Godfather: Part II,The early life and career of Vito Corleone in ...,"Crime, Drama",Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton
4,12 Angry Men,A jury holdout attempts to prevent a miscarria...,"Crime, Drama",Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler


In [5]:
# Count missing values per column before filling them. Only a cell's last
# expression is displayed, so the counts are kept in a variable and shown at
# the end of the cell.
null_counts = movies.isnull().sum()

# Replace missing values with empty strings so the string operations in later
# cells never meet NaN. Reassigning (rather than inplace=True) keeps the cell
# free of in-place mutation of a frame derived from `df`.
movies = movies.fillna('')

null_counts

### Clean Text Data

In [6]:
def clean_text(text):
    """Lower-case ``text`` and remove every space character from it.

    Only the literal space character is removed; other whitespace and
    punctuation are left untouched.
    """
    lowered = text.lower()
    return "".join(lowered.split(" "))
# Normalise the genre, director and star columns with clean_text.
for column in ('Genre', 'Director', 'Star1', 'Star2', 'Star3', 'Star4'):
    movies[column] = movies[column].apply(clean_text)


### Create Combined Feature Column

In [7]:
# Build one space-separated text field per movie, in this column order.
content_columns = ['Overview', 'Genre', 'Director', 'Star1', 'Star2', 'Star3', 'Star4']

combined = movies[content_columns[0]]
for column in content_columns[1:]:
    combined = combined + ' ' + movies[column]

movies['content'] = combined
movies[['Series_Title', 'content']].head()

Unnamed: 0,Series_Title,content
0,The Shawshank Redemption,Two imprisoned men bond over a number of years...
1,The Godfather,An organized crime dynasty's aging patriarch t...
2,The Dark Knight,When the menace known as the Joker wreaks havo...
3,The Godfather: Part II,The early life and career of Vito Corleone in ...
4,12 Angry Men,A jury holdout attempts to prevent a miscarria...


## TF-IDF Vectorization

In [8]:
# Vectorise the combined text: English stop words are dropped and the
# vocabulary is capped at 5000 features.
vectorizer_options = {'stop_words': 'english', 'max_features': 5000}
tfidf = TfidfVectorizer(**vectorizer_options)

corpus = movies['content']
tfidf_matrix = tfidf.fit_transform(corpus)
tfidf_matrix.shape

(1000, 5000)

## Compute Cosine Similarity

In [9]:
# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix.
# With Y=None the matrix is compared against itself.
cosine_sim = cosine_similarity(X=tfidf_matrix, Y=None, dense_output=True)
cosine_sim

array([[1.        , 0.0034899 , 0.00293025, ..., 0.003208  , 0.00321962,
        0.        ],
       [0.0034899 , 1.        , 0.02575002, ..., 0.00338073, 0.00339297,
        0.01950629],
       [0.00293025, 0.02575002, 1.        , ..., 0.00283858, 0.00284887,
        0.0081891 ],
       ...,
       [0.003208  , 0.00338073, 0.00283858, ..., 1.        , 0.04402684,
        0.        ],
       [0.00321962, 0.00339297, 0.00284887, ..., 0.04402684, 1.        ,
        0.03791639],
       [0.        , 0.01950629, 0.0081891 , ..., 0.        , 0.03791639,
        1.        ]])

### Create Movie Index Mapping

In [10]:
# Map each title to the index label of its row in `movies`.
title_to_row = pd.Series(movies.index, index=movies['Series_Title'])

# Series.drop_duplicates() compares the *values* (the row labels), so it does
# not remove repeated titles. Filter on the index instead, keeping the first
# occurrence of each title, so a lookup by title always yields one scalar.
movie_index = title_to_row[~title_to_row.index.duplicated(keep='first')]
movie_index.head()

Unnamed: 0_level_0,0
Series_Title,Unnamed: 1_level_1
The Shawshank Redemption,0
The Godfather,1
The Dark Knight,2
The Godfather: Part II,3
12 Angry Men,4


# Recommendation Function

In [11]:
def recommend_movies(movie_title, top_n=5):
    """Return the titles most similar to ``movie_title``.

    The row for the title is located through ``movie_index``, its similarity
    scores are read from ``cosine_sim`` and the resulting titles are read
    from ``movies['Series_Title']``.

    Parameters
    ----------
    movie_title : str
        Title to look up in ``movie_index``.
    top_n : int, default 5
        Maximum number of recommendations; values below 1 give an empty list.

    Returns
    -------
    list of str or str
        Titles ordered from most to least similar (ties keep row order), or
        the string "Movie not found in dataset" when the title is unknown.
    """
    if movie_title not in movie_index:
        return "Movie not found in dataset"

    idx = movie_index[movie_title]
    # A title that occurs more than once makes the lookup return a Series;
    # use its first occurrence so `idx` is always a single row position.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx])
    # Stable sort on the negated scores: descending similarity, with equal
    # scores kept in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query by position instead of assuming it sorted first; a tie
    # at the top score would otherwise let the query recommend itself.
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    return movies['Series_Title'].iloc[ranked].tolist()

# Test the Model

In [20]:
# Example query; top_n is left at its default of 5.
recommend_movies("Interstellar")

['The Martian', 'The Avengers', 'Gattaca', 'The Dark Knight Rises', 'Aliens']

In [23]:
# Second example query, again with the default top_n of 5.
recommend_movies("The Dark Knight")

['Batman Begins',
 'The Dark Knight Rises',
 'The Prestige',
 'Joker',
 'Kill Bill: Vol. 1']