In [1]:
import pandas as pd

# Location of the input CSV, given relative to the current working directory.
data = './data/new/data.csv'

# Read the CSV and discard its 'id' column in a single chained expression.
df = pd.read_csv(data).drop(columns=['id'])

In [16]:
# Preview the first five rows of the frame.
# NOTE(review): the saved output of this cell already lists the
# 'genre_encoded' and 'type_encoded' columns, which are only created by a
# later cell, so this cell was executed out of order. Re-run the notebook
# top to bottom (Restart & Run All) to refresh the stored output.
df.head(5)

Unnamed: 0,title,type,genres,averageRating,numVotes,releaseYear,genre_encoded,type_encoded
0,Miss Jerry,movie,Romance,5.4,222.0,1894.0,2020,0
1,The Corbett-Fitzsimmons Fight,movie,"Documentary, News, Sport",5.3,556.0,1897.0,1394,0
2,Bohemios,movie,unknown,3.8,21.0,1905.0,2066,0
3,The Story of the Kelly Gang,movie,"Action, Adventure, Biography",6.0,982.0,1906.0,16,0
4,The Prodigal Son,movie,Drama,5.6,31.0,1907.0,1432,0


In [9]:
# Replacement value for missing entries, per column:
#   - text columns get the literal "unknown"
#   - averageRating gets the column mean
#   - numVotes and releaseYear get the column median
fill_values = {
    'genres': "unknown",
    'averageRating': df['averageRating'].mean(),
    'numVotes': df['numVotes'].median(),
    'releaseYear': df['releaseYear'].median(),
    'type': "unknown",
}

# Apply each replacement to its own column, writing the result back in place.
for column_name, replacement in fill_values.items():
    df[column_name] = df[column_name].fillna(replacement)

In [11]:
# Display the first 25 rows of the frame for a visual check.
df.head(25)

Unnamed: 0,title,type,genres,averageRating,numVotes,releaseYear
0,Miss Jerry,movie,Romance,5.4,222.0,1894.0
1,The Corbett-Fitzsimmons Fight,movie,"Documentary, News, Sport",5.3,556.0,1897.0
2,Bohemios,movie,unknown,3.8,21.0,1905.0
3,The Story of the Kelly Gang,movie,"Action, Adventure, Biography",6.0,982.0,1906.0
4,The Prodigal Son,movie,Drama,5.6,31.0,1907.0
5,Robbery Under Arms,movie,Drama,4.3,28.0,1907.0
6,Hamlet,movie,Drama,3.2,33.0,1908.0
7,Don Quijote,movie,Drama,4.3,23.0,1908.0
8,The Fairylogue and Radio-Plays,movie,"Adventure, Fantasy",5.2,78.0,1908.0
9,A Cultura do Cacau,movie,unknown,3.6,12.0,1909.0


In [40]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Fit one encoder per categorical column. Re-fitting a single shared encoder
# on the second column would overwrite the mapping learned for the first,
# making it impossible to translate genre codes back to genre strings.
genre_encoder = LabelEncoder()
type_encoder = LabelEncoder()
# Legacy alias: this name used to end up fitted on the 'type' column.
encode = type_encoder

df['genre_encoded'] = genre_encoder.fit_transform(df['genres'])
df['type_encoded'] = type_encoder.fit_transform(df['type'])

# Column order of the feature matrix (one row per title, five columns).
feature_columns = [
    'genre_encoded',
    'averageRating',
    'numVotes',
    'releaseYear',
    'type_encoded',
]
raw_features = df[feature_columns].to_numpy(dtype=float)

# Standardise every column to zero mean and unit variance before it is used
# for cosine similarity. The raw columns live on very different scales
# (vote counts and years are orders of magnitude larger than ratings), so
# without scaling the large-magnitude columns decide the similarity almost
# on their own.
column_mean = raw_features.mean(axis=0)
column_std = raw_features.std(axis=0)
# A constant column has zero spread; dividing by 1 leaves it at 0 instead of NaN.
column_std[column_std == 0] = 1.0
features = (raw_features - column_mean) / column_std

# NOTE(review): the label-encoded genre/type codes are arbitrary integers, so
# numeric distance between two codes carries no meaning. Consider a
# one-hot / multi-hot encoding of genres if recommendation quality matters.

In [46]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_movies(movie_title, top_n=5):
    """Return a formatted list of the titles most similar to ``movie_title``.

    The lookup is a case-insensitive exact match on the ``title`` column of
    the module-level ``df``; when several rows match, the first one is used.
    Similarity is the cosine similarity between that row of the module-level
    ``features`` matrix and every other row.

    Parameters
    ----------
    movie_title : str
        Title to look up.
    top_n : int, default 5
        Maximum number of recommendations to list. Values below zero are
        treated as zero.

    Returns
    -------
    str
        A multi-line, human-readable ranking, or a "No movie found" message
        when no row matches ``movie_title``.
    """
    # Work with row *positions*, not index labels: `features` and `df.iloc`
    # are positional, so labels would only be correct for a default RangeIndex.
    title_matches = (df['title'].str.lower() == movie_title.lower()).to_numpy()
    match_positions = title_matches.nonzero()[0]
    if match_positions.size == 0:
        return f"No movie found with title '{movie_title}'"

    query_pos = int(match_positions[0])
    movie_vector = features[query_pos].reshape(1, -1)
    sims = cosine_similarity(movie_vector, features)[0]

    # Drop the query row explicitly. Assuming it sorts first is wrong whenever
    # another row ties with it on similarity (e.g. identical feature vectors).
    candidates = [pos for pos in range(len(sims)) if pos != query_pos]
    # list.sort is stable, so equally similar rows keep their original order.
    candidates.sort(key=lambda pos: sims[pos], reverse=True)
    top_indices = candidates[:max(top_n, 0)]

    result = df.iloc[top_indices][['title', 'averageRating']]

    output = f"\n🎬 Top {top_n} recommendations for **{movie_title}**:\n\n"
    ranked_rows = zip(result['title'], result['averageRating'])
    for rank, (title, rating) in enumerate(ranked_rows, start=1):
        output += f"{rank}. {title} — ⭐ {rating:.1f}\n"
    return output



Input Movie: Forrest Gump (8.8)

Recommended Movies :

1 The Godfather                                                   9.2
2 Breaking Bad                                                    9.5
3 Pulp Fiction                                                    8.9
4 Fight Club                                                      8.8
5 Se7en                                                           8.6
6 The Shawshank Redemption                                        9.3

