# Model Develpoment

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Load the dataset with engineered features
movies = pd.read_csv('../data/processed/movies_metadata.csv')

# Display the first few rows
movies.head(2)


  movies = pd.read_csv('../data/processed/movies_metadata.csv')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,original_language,overview,popularity,poster_path,...,Sentai Filmworks,TV Movie,Telescene Film Group Productions,The Cartel,Thriller,Vision View Entertainment,War,Western,popularity_metric,combined_text
0,False,,4.031797,"['Action', 'Thriller', 'Science Fiction', 'Mys...",http://inceptionmovie.warnerbros.com/,27205,en,"Cobb, a skilled thief who commits corporate es...",29.108149,/qmDpIHrmpJINaRKAfWQfftjCdyi.jpg,...,0,0,0,0,1,0,0,0,4.165992,"Inception Cobb, a skilled thief who commits co..."
1,False,"{'id': 263, 'name': 'The Dark Knight Collectio...",4.760158,"['Drama', 'Action', 'Crime', 'Thriller']",http://thedarkknight.warnerbros.com/dvdsite/,155,en,Batman raises the stakes in his war on crime. ...,123.167259,/1hRoyzDtpgMU7Dz4JF22RANzQO7.jpg,...,0,0,0,0,1,0,0,0,4.282292,The Dark Knight Batman raises the stakes in hi...


## Content Based Filtering
Using a content based filtering approach and the combined-text feature

In [3]:
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(movies['combined_text'])

In [4]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [5]:
cosine_sim.shape

(45436, 45436)

### 2.1. Create a Function to Get Movie Recommendations Based on Content


In [6]:
# Function to get movie recommendations based on content similarity
def get_content_recommendations(title, cosine_sim=cosine_sim, df=movies, top_n=10):
    # Get the index of the movie that matches the title
    idx = df[df['title'].str.lower() == title.lower()].index[0]
    
    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the top n most similar movies
    sim_scores = sim_scores[1:top_n+1]  # Exclude the first one because it's the movie itself
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top n most similar movies
    return df['title'].iloc[movie_indices]


In [7]:
# Test the content-based recommendation system
g_f_similar = get_content_recommendations('The Godfather')
g_f_similar.head()

43960                           A Crime
38065                  Honor Thy Father
4300                         Blood Ties
24813    Bonnie and Clyde Italian Style
39398                  Household Saints
Name: title, dtype: object

## 3. Collaborative Filtering
We'll implement a collaborative filtering approach using the `vote_average`, `vote_count`, and `popularity_metric` features.


In [13]:
# Check for NaN values in the columns
print(movies[['vote_average', 'vote_count', 'popularity_metric']].isna().sum())


vote_average         6
vote_count           6
popularity_metric    6
dtype: int64


In [14]:
# Normalize the vote_average, vote_count, and popularity_metric
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
movies.dropna(subset=['vote_average', 'vote_count', 'popularity_metric'], inplace=True)

movies[['vote_average', 'vote_count', 'popularity_metric']] = scaler.fit_transform(
    movies[['vote_average', 'vote_count', 'popularity_metric']])

# We'll use NearestNeighbors for collaborative filtering
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(movies[['vote_average', 'vote_count', 'popularity_metric']])


In [15]:
# Get recommendations based on collaborative filtering
def get_collaborative_recommendations(title, df=movies, model=model_knn, top_n=10):
    idx = df[df['title'].str.lower() == title.lower()].index[0]
    distances, indices = model.kneighbors(df[['vote_average', 'vote_count', 'popularity_metric']].iloc[idx].values.reshape(1, -1), n_neighbors=top_n+1)
    
    return df['title'].iloc[indices.flatten()[1:]]

In [16]:
get_collaborative_recommendations('The Godfather')



80                                     Gladiator
87                                     Toy Story
119                                        Brave
139    Star Wars: Episode I - The Phantom Menace
91                               The Incredibles
100                                 Wonder Woman
108                         Thor: The Dark World
115                                    Divergent
103                                  I Am Legend
112                      Silver Linings Playbook
Name: title, dtype: object

## 4 Hybrid Recommendation System

In [17]:
def hybrid_recommendation(title, content_weight=0.5, collaborative_weight=0.5, top_n=10):
    # Get content-based recommendations
    content_recs = get_content_recommendations(title)
    
    # Get collaborative recommendations
    collaborative_recs = get_collaborative_recommendations(title)
    
    # Combine recommendations by weighted averaging
    hybrid_recs = pd.concat([content_recs, collaborative_recs]).value_counts().index.tolist()
    
    # Return the top n recommendations
    return hybrid_recs[:top_n]



In [18]:
# Test the hybrid recommendation system
hybrid_recommendation('The Godfather')



['A Crime',
 'Honor Thy Father',
 'Blood Ties',
 'Bonnie and Clyde Italian Style',
 'Household Saints',
 'The Family',
 'Johnny Dangerously',
 'Made',
 'Chronicle of a Death Foretold',
 'Start Liquidation']