In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load and preprocess the dataset
df = pd.read_csv("imdb_top_1000.csv", engine="python")
df = df[["Series_Title", "Genre",]].dropna().reset_index(drop=True)

# Preprocess the genres to create a text corpus
df['Genre'] = df['Genre'].str.replace('[', '', regex=False).str.replace(']', '', regex=False).str.replace("'", "", regex=False).str.replace(',', ' ', regex=False)
df['Genre'] = df['Genre'].str.lower()

tfidf = TfidfVectorizer(stop_words='english')


tfidf_matrix = tfidf.fit_transform(df['Genre'])


similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

def recommend_by_genre(genre, num_recommendations=10):

    filtered_df = df[df['Genre'].str.contains(genre.lower(), case=False)]
    
    if filtered_df.empty:
        return f"No movies found for genre '{genre}'."
    

    recommended_movies = []

    for idx in filtered_df.index:

        sim_scores = list(enumerate(similarity_matrix[idx]))
        

        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        

        sim_scores = sim_scores[1:num_recommendations + 1]  # Exclude the input movie itself
        

        movie_indices = [i[0] for i in sim_scores]
        top_movies = df['Series_Title'].iloc[movie_indices].tolist()
        recommended_movies.extend(top_movies)
    

    recommended_movies = list(set(recommended_movies))
    
    return recommended_movies[:num_recommendations]


recommended_movies = recommend_by_genre("Crime", num_recommendations=10)
print("Movie Recommended In This Genre:\n", recommended_movies)

Movie Recommended In This Genre:
 ['Rocky', 'Les choristes', 'The Innocents', 'Chung Hing sam lam', 'The Longest Day', 'Lawrence of Arabia', 'It Happened One Night', 'The Taking of Pelham One Two Three', 'Casino', 'How to Train Your Dragon']
