# Import Libraries 

In [45]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_samples,silhouette_score
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler


# Import your 3 datasets

In [46]:
ds_netflix=pd.read_csv(r'D:\Road to ML\datasets\titles1.csv')
ds_amazon=pd.read_csv(r'D:\Road to ML\datasets\titles2.csv')
ds_hbo=pd.read_csv(r'D:\Road to ML\datasets\titles3.csv')

# Data preprocessing

In [47]:
ds=pd.concat([ds_netflix,ds_amazon,ds_hbo],axis=0)
ds.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,['documentation'],['US'],1.0,,,,0.6,
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],,tt0075314,8.2,808582.0,40.965,8.179
2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],,tt0068473,7.7,107673.0,10.01,7.3
3,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],,tt0071853,8.2,534486.0,15.461,7.811
4,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"['war', 'action']","['GB', 'US']",,tt0061578,7.7,72662.0,20.398,7.6


In [48]:
ds=ds.drop_duplicates()
ds.duplicated().sum()

np.int64(0)

In [49]:
ds.drop(columns=['description','age_certification'],axis=1,inplace=True)

In [50]:
ds.head()

Unnamed: 0,id,title,type,release_year,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,1945,51,['documentation'],['US'],1.0,,,,0.6,
1,tm84618,Taxi Driver,MOVIE,1976,114,"['drama', 'crime']",['US'],,tt0075314,8.2,808582.0,40.965,8.179
2,tm154986,Deliverance,MOVIE,1972,109,"['drama', 'action', 'thriller', 'european']",['US'],,tt0068473,7.7,107673.0,10.01,7.3
3,tm127384,Monty Python and the Holy Grail,MOVIE,1975,91,"['fantasy', 'action', 'comedy']",['GB'],,tt0071853,8.2,534486.0,15.461,7.811
4,tm120801,The Dirty Dozen,MOVIE,1967,150,"['war', 'action']","['GB', 'US']",,tt0061578,7.7,72662.0,20.398,7.6


## Clean production_countries column

In [51]:
ds['production_countries'] = ds['production_countries'].str.replace(r"[\[\]']",'',regex=True)
ds['lead_production_countries']= ds['production_countries'].str.split(',').str[0]
ds['length_production_countries'] = ds['production_countries'].str.split(',').str.len()
ds['lead_production_countries']=ds['lead_production_countries'].replace('',np.nan)
ds['lead_production_countries']

0        US
1        US
2        US
3        GB
4        GB
       ... 
3289     PR
3290     PA
3291    NaN
3292    NaN
3293     US
Name: lead_production_countries, Length: 18980, dtype: object

## Clean genres column

In [52]:
ds['genres']=ds['genres'].str.replace(r"[\[\]']",'',regex=True)
ds['lead_genres']=ds['genres'].str.split(',').str[0]
ds['lead_genres']=ds['lead_genres'].replace('',np.nan)
ds['lead_genres']

0       documentation
1               drama
2               drama
3             fantasy
4                 war
            ...      
3289          romance
3290           comedy
3291           comedy
3292           comedy
3293    documentation
Name: lead_genres, Length: 18980, dtype: object

In [53]:
ds.drop(['genres','production_countries'],axis=1,inplace=True)

# Drop missing values from dataset

In [54]:
ds.dropna(inplace=True)

ds.set_index('title',inplace=True)

ds.drop(['id','imdb_id'],axis=1,inplace=True)

In [55]:
ds.head()

Unnamed: 0_level_0,type,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,lead_production_countries,length_production_countries,lead_genres
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Monty Python's Flying Circus,SHOW,1969,30,4.0,8.8,73424.0,17.617,8.306,GB,1,comedy
Seinfeld,SHOW,1989,24,9.0,8.9,308824.0,130.213,8.301,US,1,comedy
Knight Rider,SHOW,1982,51,4.0,6.9,34115.0,50.267,7.5,US,1,scifi
Thomas & Friends,SHOW,1984,10,24.0,6.5,5104.0,42.196,6.5,GB,1,animation
Saved by the Bell,SHOW,1989,23,5.0,7.1,35034.0,19.855,8.0,US,1,family


# perform encoding

In [56]:
dummies= pd.get_dummies(ds[['type','lead_production_countries','lead_genres']],drop_first=True)
ds_new=pd.concat([ds,dummies],axis=1)

ds_new.drop(['type','lead_production_countries','lead_genres'],axis=1,inplace=True)

In [57]:
ds_new.head()

Unnamed: 0_level_0,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,length_production_countries,lead_production_countries_AR,lead_production_countries_AT,...,lead_genres_history,lead_genres_horror,lead_genres_music,lead_genres_reality,lead_genres_romance,lead_genres_scifi,lead_genres_sport,lead_genres_thriller,lead_genres_war,lead_genres_western
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Monty Python's Flying Circus,1969,30,4.0,8.8,73424.0,17.617,8.306,1,False,False,...,False,False,False,False,False,False,False,False,False,False
Seinfeld,1989,24,9.0,8.9,308824.0,130.213,8.301,1,False,False,...,False,False,False,False,False,False,False,False,False,False
Knight Rider,1982,51,4.0,6.9,34115.0,50.267,7.5,1,False,False,...,False,False,False,False,False,True,False,False,False,False
Thomas & Friends,1984,10,24.0,6.5,5104.0,42.196,6.5,1,False,False,...,False,False,False,False,False,False,False,False,False,False
Saved by the Bell,1989,23,5.0,7.1,35034.0,19.855,8.0,1,False,False,...,False,False,False,False,False,False,False,False,False,False


# Perform Scaling

In [58]:
MM= MinMaxScaler()
ds_scaled=MM.fit_transform(ds_new)
ds_scaled=pd.DataFrame(ds_scaled,columns=ds_new.columns)
ds_scaled

Unnamed: 0,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,length_production_countries,lead_production_countries_AR,lead_production_countries_AT,...,lead_genres_history,lead_genres_horror,lead_genres_music,lead_genres_reality,lead_genres_romance,lead_genres_scifi,lead_genres_sport,lead_genres_thriller,lead_genres_war,lead_genres_western
0,0.397727,0.168539,0.058824,0.9125,0.037009,0.007913,0.815870,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.625000,0.134831,0.156863,0.9250,0.155671,0.058490,0.815326,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.545455,0.286517,0.058824,0.6750,0.017194,0.022579,0.728261,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.568182,0.056180,0.450980,0.6250,0.002570,0.018954,0.619565,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.625000,0.129213,0.078431,0.7000,0.017658,0.008919,0.782609,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3289,0.988636,0.146067,0.000000,0.5000,0.000028,0.002064,0.456522,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3290,0.988636,0.258427,0.000000,0.5500,0.000027,0.002077,1.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3291,0.988636,0.185393,0.000000,0.5750,0.000017,0.000377,0.021739,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3292,0.988636,0.191011,0.019608,0.3125,0.000067,0.001158,0.510870,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# Apply DBSCAN

In [59]:
esp_array=[0.2,0.5,1]
min_sample_array=[5,10,30]

for esps in esp_array:
    for min_ in min_sample_array:
        clustere = DBSCAN(eps=esps,min_samples=min_).fit(ds_scaled)
        cluster_labels= clustere.labels_

        if len(cluster_labels) == 1:
            continue

        print("For espilon: ",esps,
              "For Min sample: ",min_,
              "Cluster Count is: ",len(set(cluster_labels)),
              "silhouette score is: ",silhouette_score(ds_scaled,cluster_labels))
        

For espilon:  0.2 For Min sample:  5 Cluster Count is:  75 silhouette score is:  0.4378840737098286
For espilon:  0.2 For Min sample:  10 Cluster Count is:  37 silhouette score is:  0.36601440046646755
For espilon:  0.2 For Min sample:  30 Cluster Count is:  17 silhouette score is:  0.23106054247198202
For espilon:  0.5 For Min sample:  5 Cluster Count is:  91 silhouette score is:  0.6019560501740351
For espilon:  0.5 For Min sample:  10 Cluster Count is:  56 silhouette score is:  0.5303679432698052
For espilon:  0.5 For Min sample:  30 Cluster Count is:  21 silhouette score is:  0.36228604161700484
For espilon:  1 For Min sample:  5 Cluster Count is:  93 silhouette score is:  0.6091664186394289
For espilon:  1 For Min sample:  10 Cluster Count is:  57 silhouette score is:  0.5362809971937993
For espilon:  1 For Min sample:  30 Cluster Count is:  22 silhouette score is:  0.37121300388037515


# DBSCAN with best hyperparameters

In [60]:
dbscan_model= DBSCAN(eps=1,min_samples=5).fit(ds_scaled)
print("Cluster Count: ",len(set(dbscan_model.labels_)),
      "Accuracy Score: ",silhouette_score(ds_scaled,dbscan_model.labels_))

Cluster Count:  93 Accuracy Score:  0.6091664186394289


In [61]:
ds['Cluster_name']=dbscan_model.labels_
ds

Unnamed: 0_level_0,type,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,lead_production_countries,length_production_countries,lead_genres,Cluster_name
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Monty Python's Flying Circus,SHOW,1969,30,4.0,8.8,73424.0,17.617,8.306,GB,1,comedy,0
Seinfeld,SHOW,1989,24,9.0,8.9,308824.0,130.213,8.301,US,1,comedy,1
Knight Rider,SHOW,1982,51,4.0,6.9,34115.0,50.267,7.500,US,1,scifi,2
Thomas & Friends,SHOW,1984,10,24.0,6.5,5104.0,42.196,6.500,GB,1,animation,3
Saved by the Bell,SHOW,1989,23,5.0,7.1,35034.0,19.855,8.000,US,1,family,4
...,...,...,...,...,...,...,...,...,...,...,...,...
Level Playing Field,SHOW,2021,26,1.0,5.5,60.0,4.595,5.000,US,1,documentation,35
Os Ausentes,SHOW,2021,46,1.0,5.9,59.0,4.624,10.000,BR,1,action,-1
Through Our Eyes,SHOW,2021,33,1.0,6.1,38.0,0.840,1.000,US,1,documentation,35
Sweet Life: Los Angeles,SHOW,2021,34,2.0,4.0,137.0,2.579,5.500,US,1,reality,5


# Recommandation Funtion

In [62]:
import random

def recommandation(movie_name: str):
    moive_name= movie_name.lower()
    ds['name']= ds.index.str.lower()
    moive=ds[ds['name'].str.contains(moive_name,na=False)]
    if not moive.empty:
        cluster=moive['Cluster_name'].values[0]
        moive_clusters= ds[ds['Cluster_name']==cluster]
    
    
        if len(moive_clusters) >=5:
            recommended=random.sample(list(moive_clusters.index),5)
        else:
            recommended=moive_clusters.index
    
    
        print("Recommeded movies are :")
        for m in recommended:
            print(m)
    else:
        print("Movie Does Not found un the database ")
 
    





result= recommandation('Avengers')   

Recommeded movies are :
Ballmastrz: 9009
Rainbow Rangers
Scooby-Doo! Mystery Incorporated
Gabby's Dollhouse
Go Dog Go


# Save your Dataset for App

In [63]:
ds.to_csv('Movie_Clusters.csv',index=False)