Imports

In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.corpus import stopwords
import joblib

Read Dataset

In [2]:
# The CSV is looked up one level above the current working directory, under datasets/.
dataset_path = os.path.join(os.getcwd(), '..', 'datasets', 'dataset.csv')
# Keep only the columns this notebook works with; a missing column raises KeyError.
movies = pd.read_csv(dataset_path).loc[
    :,
    ['id', 'title', 'country', 'overview', 'genre', 'release_date', 'vote_average', 'certificate', 'industry'],
]

In [3]:
# Keep the raw score under `rating`; `vote_average` itself is rescaled in a later cell.
movies = movies.assign(rating=movies['vote_average'])

In [4]:
# NOTE(review): this assigns the `certificate` column to itself, so the data is
# left unchanged. It looks like a leftover placeholder — confirm whether a real
# transformation was intended here.
movies['certificate'] = movies['certificate']

Min-max normalize vote_average to the range [0, 1]

In [5]:
# Min-max rescale: the lowest score maps to 0 and the highest to 1.
movies['vote_average'] = movies['vote_average'].pipe(
    lambda votes: (votes - votes.min()) / (votes.max() - votes.min())
)

Combine the feature-rich columns into a single metadata text column

In [6]:
# Build one free-text field per movie from the descriptive columns.
# Missing text values are replaced with '' first: in pandas `str + NaN` is NaN,
# so a single absent field would otherwise wipe out the whole row's text.
# The column name `metdata` (sic) is kept because the preprocessing cell reads it.
# A missing vote_average is still rendered by astype(str) as the text 'nan'.
movies['metdata'] = (
    movies['overview'].fillna('') + ' '
    + movies['country'].fillna('') + ' '
    + movies['industry'].fillna('') + ' '
    + movies['genre'].fillna('') + ' '
    + movies['vote_average'].astype(str)
)

Download the NLTK stop-words corpus

In [None]:
# Store the corpus in an `nltk_data` folder under the current working directory
# and make sure NLTK searches that folder.
path = os.path.join(os.getcwd(), 'nltk_data')
if path not in nltk.data.path:
    # Guarded so that re-running this cell does not append the same entry again.
    nltk.data.path.append(path)
nltk.download('stopwords', download_dir=path)

In [8]:
# English stop words held in a set so membership checks during preprocessing are fast.
stop_words = {word for word in stopwords.words('english')}

Preprocess the text: strip non-word characters, lowercase, and remove stop words

In [9]:
def preprocess_text(text):
    """Normalize free text before it is vectorized.

    Non-word characters are replaced by spaces, the text is lowercased, and
    every token found in the module-level `stop_words` set is dropped.

    Args:
        text: The value to clean. Non-string values are converted with
            `str()`, except `None` and float NaN, which are treated as
            empty text.

    Returns:
        str: The remaining tokens joined by single spaces ('' if none remain).
    """
    # A missing value would otherwise be stringified into the bogus token 'none'/'nan'.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # split() below already collapses runs of whitespace, so no separate \s+ pass is needed.
    text = re.sub(r'\W', ' ', text).lower()
    return ' '.join(word for word in text.split() if word not in stop_words)

In [10]:
# Clean every combined text entry and store the result in the `metadata` column.
movies['metadata'] = movies['metdata'].map(preprocess_text)

Vectorize using TF-IDF

In [11]:
# Unigram + bigram TF-IDF features with English stop words removed, capped at 15,000 terms.
tfidf = TfidfVectorizer(max_features=15000, stop_words='english', ngram_range=(1, 2))
# Keep the matrix sparse: `cosine_similarity` accepts SciPy sparse input and still
# returns a dense array, while a dense (n_movies x 15000) float matrix costs far
# more memory for the same similarities.
vector = tfidf.fit_transform(movies['metadata'].values.astype('U'))

Compute cosine similarity

In [12]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix (Y=None compares X with itself).
similars = cosine_similarity(X=vector, Y=None)

Save as Joblib files

In [None]:
# Persist the movie table (including the working `metdata`/`metadata` columns added
# above) and the similarity matrix into the current working directory.
joblib.dump(value=movies, filename='movies_list.joblib')
joblib.dump(value=similars, filename='similars_list.joblib')

Recommendation function

In [None]:
def recommend(movie_titles, num_recommendations=6):
    """Return the movies most similar to one or more given titles.

    For each requested title, the first row of the module-level `movies` frame
    with exactly that title is located, and all other movies are ranked by
    their score in the matching row of the module-level `similars` matrix.

    Args:
        movie_titles: A title string or a list of title strings. Titles must
            match the `title` column exactly.
        num_recommendations: Maximum number of neighbours taken per title
            (values below zero are treated as zero).

    Returns:
        list[dict]: One dict with `title`, `id` and `genre` per recommended
        movie, de-duplicated by `id` across all requested titles. Unknown
        titles are reported with `print` and skipped. A requested title can
        still appear as a neighbour of another requested title.
    """
    if not isinstance(movie_titles, list):
        movie_titles = [movie_titles]

    limit = max(num_recommendations, 0)
    recommendations = {}
    for movie_title in movie_titles:
        # Work with row *positions*: `similars` is indexed positionally, so an
        # index label from `movies` is only correct for a default RangeIndex.
        matches = (movies['title'] == movie_title).to_numpy().nonzero()[0]
        if len(matches) == 0:
            print(f"Movie titled '{movie_title}' not found.")
            continue
        position = int(matches[0])

        row = similars[position]
        # Exclude the query movie explicitly instead of assuming it sorts first;
        # that assumption fails on ties (duplicates, all-zero vectors).
        # sorted() is stable, so equal scores keep their original row order.
        ranked = sorted(
            (other for other in range(len(row)) if other != position),
            key=lambda other: row[other],
            reverse=True,
        )

        for other in ranked[:limit]:
            movie = movies.iloc[other]
            movie_id = movie['id']
            # A dict keyed by id keeps first-seen order while dropping repeats.
            if movie_id not in recommendations:
                recommendations[movie_id] = {
                    'title': movie['title'],
                    'id': movie_id,
                    'genre': movie['genre'],
                }

    return list(recommendations.values())

# Demo: up to 7 neighbours for each of three titles, merged into one de-duplicated list.
print(
    recommend(
        ['Suicide Squad: Hell to Pay', 'The Humanity Bureau', 'Traffik'],
        num_recommendations=7,
    )
)