In [1]:
%pip install plotly

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("netflix_titles.csv")
df.head(1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."


In [4]:
df.shape

(8807, 12)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [6]:
df.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [7]:
#Plenty of null in director, cast and country. Director nulls can be filled with UNKNOWN since it wont be used in feature prediction. Too many of its data is missing.

In [8]:
df['director'] = df['director'].fillna('UNKNOWN')
print(f"Number of null values in director column after filling: {df['director'].isnull().sum()}")

Number of null values in director column after filling: 0


In [9]:
df['cast'] = df['cast'].fillna('UNKNOWN')
print(f"Number of null values in cast column after filling: {df['cast'].isnull().sum()}")

Number of null values in cast column after filling: 0


In [10]:
df['country'] = df['country'].fillna('UNKNOWN')
print(f"Number of null values in country column after filling: {df['country'].isnull().sum()}")

Number of null values in country column after filling: 0


In [11]:
df.isnull().sum()

show_id          0
type             0
title            0
director         0
cast             0
country          0
date_added      10
release_year     0
rating           4
duration         3
listed_in        0
description      0
dtype: int64

In [12]:
df['date_added'].value_counts(5)

date_added
January 1, 2020      0.012391
November 1, 2019     0.010117
March 1, 2018        0.008526
December 31, 2019    0.008412
October 1, 2018      0.008071
                       ...   
December 4, 2016     0.000114
November 21, 2016    0.000114
November 19, 2016    0.000114
November 17, 2016    0.000114
January 11, 2020     0.000114
Name: proportion, Length: 1767, dtype: float64

In [13]:
#convert date_added to datetime
# First clean the string by stripping whitespace
df['date_added'] = df['date_added'].str.strip()
# Then convert to datetime
df['date_added'] = pd.to_datetime(df['date_added'])



In [14]:
#fill the null with most frequent date
most_frequent = df['date_added'].mode()[0]
df['date_added'] = df['date_added'].fillna(most_frequent)

In [15]:
#check at rating null values
df['release_year'].value_counts()

release_year
2018    1147
2017    1032
2019    1030
2020     953
2016     902
        ... 
1959       1
1925       1
1961       1
1947       1
1966       1
Name: count, Length: 74, dtype: int64

In [16]:
fig = px.histogram(df, 
                   x='release_year',
                   title='Distribution of Netflix Content by Release Year',
                   nbins=50)

fig.update_layout(
    xaxis_title="Release Year",
    yaxis_title="Count",
    bargap=0.1
)

fig.show()

In [17]:
median = df['release_year'].median()
df['release_year'] = df['release_year'].fillna(median)

In [18]:
df['rating'].value_counts()

rating
TV-MA       3207
TV-14       2160
TV-PG        863
R            799
PG-13        490
TV-Y7        334
TV-Y         307
PG           287
TV-G         220
NR            80
G             41
TV-Y7-FV       6
NC-17          3
UR             3
74 min         1
84 min         1
66 min         1
Name: count, dtype: int64

In [19]:
# Drop rows with null ratings and reset the index
df = df.dropna(subset=['rating']).reset_index(drop=True)

# Verify the drop
print(f"Number of rows after dropping null ratings: {len(df)}")
print(f"Number of null values in rating column: {df['rating'].isnull().sum()}")

Number of rows after dropping null ratings: 8803
Number of null values in rating column: 0


In [20]:
fig = px.histogram(df, 
                   x='duration',
                   title='Distribution of Netflix Content by duration',
                   nbins=50)

fig.update_layout(
    xaxis_title="duration",
    yaxis_title="Count",
    bargap=0.1
)

fig.show()

In [21]:

# Extract numeric duration and create duration type column
df['duration_num'] = df['duration'].str.extract('(\d+)').astype(float)
df['duration_type'] = df['duration'].str.extract('([a-zA-Z]+)').fillna('unknown')

# Convert durations to minutes
df['duration_cleaned'] = np.where(
    df['duration_type'].str.contains('min'), 
    df['duration_num'],
    df['duration_num'] * 45  # Assuming each episode is approximately 45 minutes
)

# Display the value counts of cleaned duration
df['duration_cleaned'].value_counts().head()


invalid escape sequence '\d'


invalid escape sequence '\d'


invalid escape sequence '\d'



duration_cleaned
45.0     1801
90.0      577
135.0     238
94.0      146
97.0      146
Name: count, dtype: int64

In [22]:
df['duration_cleaned'].fillna(df['duration_cleaned'].median(), inplace=True)  


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [23]:
fig = px.histogram(df, 
                   x='duration_cleaned',
                   title='Distribution of Netflix Content by duration',
                   nbins=50)

fig.update_layout(
    xaxis_title="duration_cleaned",
    yaxis_title="Count",
    bargap=0.1
)

fig.show()

In [24]:
df.drop(columns=['duration', 'duration_num', 'duration_type'], inplace=True)


In [25]:
df.isnull().sum()

show_id             0
type                0
title               0
director            0
cast                0
country             0
date_added          0
release_year        0
rating              0
listed_in           0
description         0
duration_cleaned    0
dtype: int64

In [26]:
df.to_csv('netflix_titles_cleaned.csv', index=False)

In [27]:
df = pd.read_csv('netflix_titles_cleaned.csv')
df.head(1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,listed_in,description,duration_cleaned
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,UNKNOWN,United States,2021-09-25,2020,PG-13,Documentaries,"As her father nears the end of his life, filmm...",90.0


In [28]:
df.isnull().sum()

show_id             0
type                0
title               0
director            0
cast                0
country             0
date_added          0
release_year        0
rating              0
listed_in           0
description         0
duration_cleaned    0
dtype: int64

In [29]:
df.duplicated().sum()   

0

CONVERT DATATYPE FROM CATEGORICAL TO NUMERICAL FOR ML

In [30]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'listed_in', 'description',
       'duration_cleaned'],
      dtype='object')

In [31]:
df['type'].value_counts()

type
Movie      6129
TV Show    2674
Name: count, dtype: int64

In [32]:
df['country'].value_counts().unique().sum()

7609

In [33]:
#use lable encoding to convert the type column to numerical values
df['type'] = df['type'].map({'TV Show': 0, 'Movie': 1})


In [34]:
df['listed_in'].value_counts().unique().sum()

6215

In [35]:
df['rating'].value_counts()

rating
TV-MA       3207
TV-14       2160
TV-PG        863
R            799
PG-13        490
TV-Y7        334
TV-Y         307
PG           287
TV-G         220
NR            80
G             41
TV-Y7-FV       6
NC-17          3
UR             3
74 min         1
84 min         1
66 min         1
Name: count, dtype: int64

In [36]:
# Drop specific ratings
df = df[~df['rating'].isin(['74 min', '84 min', '66 min'])]

In [37]:
rating_count = df['rating'].value_counts()
less_rated = rating_count[rating_count < 250].index
df['rating'] = df['rating'].replace(less_rated, 'Other')

In [38]:
df['rating'].value_counts()

rating
TV-MA    3207
TV-14    2160
TV-PG     863
R         799
PG-13     490
Other     353
TV-Y7     334
TV-Y      307
PG        287
Name: count, dtype: int64

In [39]:
# Define a mapping for further refined grouping
rating_map = {
    'TV-Y': 'Kids',
    'TV-Y7': 'Kids',
    'TV-Y7-FV': 'Kids',
    'TV-G': 'General Audience',
    'G': 'General Audience',
    'PG': 'Parental Guidance',
    'PG-13': 'Parental Guidance',
    'R': 'Restricted',
    'NC-17': 'Restricted',
    'NR': 'Restricted',
    'UR': 'Restricted',
    'TV-14': 'Mature',
    'TV-MA': 'Mature',
    'Others': 'Others'
}

# First create the rating_grouped column from rating
df['rating_grouped'] = df['rating']

# Then apply the mapping
df['rating_grouped'] = df['rating_grouped'].replace(rating_map)


In [40]:
df['rating_grouped'].value_counts()

rating_grouped
Mature               5367
TV-PG                 863
Restricted            799
Parental Guidance     777
Kids                  641
Other                 353
Name: count, dtype: int64

In [41]:
df.drop(['rating'], axis=1, inplace=True)

In [42]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'listed_in', 'description', 'duration_cleaned',
       'rating_grouped'],
      dtype='object')

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8800 entries, 0 to 8802
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   show_id           8800 non-null   object 
 1   type              8800 non-null   int64  
 2   title             8800 non-null   object 
 3   director          8800 non-null   object 
 4   cast              8800 non-null   object 
 5   country           8800 non-null   object 
 6   date_added        8800 non-null   object 
 7   release_year      8800 non-null   int64  
 8   listed_in         8800 non-null   object 
 9   description       8800 non-null   object 
 10  duration_cleaned  8800 non-null   float64
 11  rating_grouped    8800 non-null   object 
dtypes: float64(1), int64(2), object(9)
memory usage: 893.8+ KB


In [44]:
df['date_added'].head(2)

0    2021-09-25
1    2021-09-24
Name: date_added, dtype: object

In [45]:
# Convert date_added to datetime first
df['date_added'] = pd.to_datetime(df['date_added'])

# Extract year, month and day as integers
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month
df['day_added'] = df['date_added'].dt.day

# Drop the original date_added column
df.drop('date_added', axis=1, inplace=True)

In [46]:
df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,release_year,listed_in,description,duration_cleaned,rating_grouped,year_added,month_added,day_added
0,s1,1,Dick Johnson Is Dead,Kirsten Johnson,UNKNOWN,United States,2020,Documentaries,"As her father nears the end of his life, filmm...",90.0,Parental Guidance,2021,9,25
1,s2,0,Blood & Water,UNKNOWN,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",90.0,Mature,2021,9,24


In [47]:
#Extract columns neeed for KNN model
df_knn = df[['type', 'year_added', 'release_year', 'month_added', 'day_added', 'rating_grouped']]

In [48]:
df_knn.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8800 entries, 0 to 8802
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   type            8800 non-null   int64 
 1   year_added      8800 non-null   int32 
 2   release_year    8800 non-null   int64 
 3   month_added     8800 non-null   int32 
 4   day_added       8800 non-null   int32 
 5   rating_grouped  8800 non-null   object
dtypes: int32(3), int64(2), object(1)
memory usage: 378.1+ KB


In [49]:
# Create dummy variables directly from rating_grouped column in df dataframe
rating_dummies = pd.get_dummies(df['rating_grouped'], prefix='rating')

# Create df_knn with the selected columns
df_knn = df[['type', 'year_added', 'release_year', 'month_added', 'day_added']]

# Join the dummy variables
df_knn = pd.concat([df_knn, rating_dummies], axis=1)

In [50]:
# Convert boolean columns to int type (True/False to 1/0)
bool_columns = df_knn.select_dtypes(include=['bool']).columns
df_knn[bool_columns] = df_knn[bool_columns].astype(int)

In [51]:
df_knn.head(1)

Unnamed: 0,type,year_added,release_year,month_added,day_added,rating_Kids,rating_Mature,rating_Other,rating_Parental Guidance,rating_Restricted,rating_TV-PG
0,1,2021,2020,9,25,0,0,0,1,0,0


In [52]:

df_knn.head(10)

Unnamed: 0,type,year_added,release_year,month_added,day_added,rating_Kids,rating_Mature,rating_Other,rating_Parental Guidance,rating_Restricted,rating_TV-PG
0,1,2021,2020,9,25,0,0,0,1,0,0
1,0,2021,2021,9,24,0,1,0,0,0,0
2,0,2021,2021,9,24,0,1,0,0,0,0
3,0,2021,2021,9,24,0,1,0,0,0,0
4,0,2021,2021,9,24,0,1,0,0,0,0
5,0,2021,2021,9,24,0,1,0,0,0,0
6,1,2021,2021,9,24,0,0,0,1,0,0
7,1,2021,1993,9,24,0,1,0,0,0,0
8,0,2021,2021,9,24,0,1,0,0,0,0
9,1,2021,2021,9,24,0,0,0,1,0,0


In [53]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Selecting columns to scale
columns_to_scale = ['year_added', 'release_year', 'month_added', 'day_added']
columns_to_leave_unscaled = [col for col in df_knn.columns if col not in columns_to_scale]

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(df_knn, df['type'], test_size=0.2, random_state=42)
X_train, X_eval, y_train, y_eval = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

# Scaling only selected columns
scaler = StandardScaler()
X_train_scaled_part = scaler.fit_transform(X_train[columns_to_scale])
X_eval_scaled_part = scaler.transform(X_eval[columns_to_scale])
X_test_scaled_part = scaler.transform(X_test[columns_to_scale])

# Combine scaled and unscaled columns back together
import numpy as np
import pandas as pd

# Reconstruct DataFrames with scaled and unscaled columns
X_train_scaled = pd.DataFrame(
    np.hstack((X_train[columns_to_leave_unscaled].values, X_train_scaled_part)),
    columns=columns_to_leave_unscaled + columns_to_scale
)

X_eval_scaled = pd.DataFrame(
    np.hstack((X_eval[columns_to_leave_unscaled].values, X_eval_scaled_part)),
    columns=columns_to_leave_unscaled + columns_to_scale
)

X_test_scaled = pd.DataFrame(
    np.hstack((X_test[columns_to_leave_unscaled].values, X_test_scaled_part)),
    columns=columns_to_leave_unscaled + columns_to_scale
)

# Print the final shape for verification
print(f"Train set size: {X_train_scaled.shape}, {y_train.shape}")
print(f"Validation set size: {X_eval_scaled.shape}, {y_eval.shape}")
print(f"Test set size: {X_test_scaled.shape}, {y_test.shape}")


Train set size: (4928, 11), (4928,)
Validation set size: (2112, 11), (2112,)
Test set size: (1760, 11), (1760,)


In [54]:
X_train_scaled.head(1)

Unnamed: 0,type,rating_Kids,rating_Mature,rating_Other,rating_Parental Guidance,rating_Restricted,rating_TV-PG,year_added,release_year,month_added,day_added
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.825016,0.092064,0.980826,-1.160514


In [55]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

k_range = range(1, 21)
accuracies = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    y_pred = knn.predict(X_eval_scaled)
    accuracy = accuracy_score(y_eval, y_pred)
    accuracies.append(accuracy)

# Plot using Plotly
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(k_range), y=accuracies, mode='lines+markers', name='Accuracy'))

fig.update_layout(
    title='K-Value vs Evaluation Accuracy',
    xaxis=dict(title='K-Value'),
    yaxis=dict(title='Accuracy'),
    width=800,
    height=500
)
fig.show()


In [56]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import plotly.figure_factory as ff
import plotly.graph_objects as go

# Initialize and train the final KNN model with k=3
final_knn = KNeighborsClassifier(n_neighbors=3)
final_knn.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = final_knn.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Final Model Accuracy (k=3): {accuracy:.3f}")

# Generate a classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix visualization
conf_matrix = confusion_matrix(y_test, y_pred)
labels = ['Class 0', 'Class 1']  # Adjust labels based on your actual target values

fig = ff.create_annotated_heatmap(
    z=conf_matrix,
    x=labels,
    y=labels,
    colorscale='Blues',
    showscale=True
)

fig.update_layout(
    title='Confusion Matrix for KNN (k=3)',
    xaxis_title='Predicted',
    yaxis_title='Actual'
)
fig.show()


Final Model Accuracy (k=3): 0.995

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       544
           1       0.99      1.00      1.00      1216

    accuracy                           1.00      1760
   macro avg       1.00      0.99      0.99      1760
weighted avg       1.00      1.00      1.00      1760



In [82]:

from sklearn.neighbors import NearestNeighbors

# Recommendation system
input_title = "Breaking Bad"
input_data = df[df['title'].str.contains(input_title, case=False, na=False)].iloc[0]

# Create one-hot encoding for rating_grouped
rating_dummies_input = pd.get_dummies(pd.DataFrame([input_data['rating_grouped']], columns=['rating_grouped']), prefix='rating')
# Ensure all columns exist
for col in ['rating_Kids', 'rating_Mature', 'rating_Other', 'rating_Parental Guidance', 'rating_Restricted', 'rating_TV-PG']:
    if col not in rating_dummies_input.columns:
        rating_dummies_input[col] = 0

# Prepare input features
numeric_features = np.array([[
    input_data['type'],
    input_data['year_added'],
    input_data['release_year'],
    input_data['month_added'],
    input_data['day_added']
]])

# Scale the numeric features
numeric_features_scaled = numeric_features.copy()
numeric_features_scaled[:, [1, 2, 3, 4]] = scaler.transform(numeric_features_scaled[:, [1, 2, 3, 4]])

# Combine numeric and categorical features
input_features_scaled = np.hstack((
    numeric_features_scaled[:, [0]],  # type
    rating_dummies_input,             # rating dummies
    numeric_features_scaled[:, 1:]     # scaled numeric features
))

# Fit NearestNeighbors and find recommendations
nn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
nn_model.fit(X_train_scaled)

# Get recommendations
distances, indices = nn_model.kneighbors(input_features_scaled)
similarities = 1 - distances.flatten()
titles = df.iloc[indices[0]]['title'].values

fig = go.Figure()

# Add bars
fig.add_trace(go.Bar(
    x=titles,
    y=similarities,
    marker=dict(
        color=similarities,
        colorscale='Viridis',
        showscale=True,
        colorbar=dict(title='Similarity Score')
    ),
    text=[f'{sim:.3f}' for sim in similarities],
    textposition='auto',
))

# Update layout
fig.update_layout(
    title='KNN: Movie Recommendations Similar to ' + input_title,
    xaxis_title='Movie Titles',
    yaxis_title='Similarity Score',
    showlegend=False,
    height=600,
    xaxis=dict(tickangle=45)  # Rotate x-axis labels for better readability
)

fig.show()

# Print recommendations
print("\nRecommendations for", input_title + ":")
for title, sim in zip(titles[1:], similarities[1:]):  # Skip the first one as it's the input movie
    print(f"{title} (Similarity: {sim:.3f})")


distances, indices = nn_model.kneighbors(input_features_scaled)
recommended_indices = indices[0]
print("\nRecommendations for", input_title + ":")
for idx in recommended_indices:
    print(df.iloc[idx]['title'])


X does not have valid feature names, but StandardScaler was fitted with feature names


X does not have valid feature names, but NearestNeighbors was fitted with feature names




Recommendations for Breaking Bad:
Fantastic Fungi (Similarity: 0.925)
Stay Here (Similarity: 0.925)
Invisible (Similarity: 0.925)
Kaake Da Viyah (Similarity: 0.877)

Recommendations for Breaking Bad:
Slow Country
Fantastic Fungi
Stay Here
Invisible
Kaake Da Viyah



X does not have valid feature names, but NearestNeighbors was fitted with feature names



In [84]:
# Find and drop the row where title contains "Bridgerton - The Afterparty"
df = df[~df['title'].str.contains("Bridgerton - The Afterparty", case=False, na=False)].reset_index(drop=True)

In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import pandas as pd
import numpy as np

# Create a more comprehensive content field
df['content'] = (df['title'] + ' ' + 
                df['description'] + ' ' + 
                df['listed_in'] + ' ' + 
                df['rating_grouped'])

# Initialize TF-IDF Vectorizer with improved parameters
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,  # Limit features to most important ones
    ngram_range=(1, 2),  # Include both unigrams and bigrams
    min_df=2  # Ignore terms that appear in less than 2 documents
)

# Fit and transform the content data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['content'])

def get_recommendations_tf(title, top_n=5):
    # Find all matching titles (case-insensitive)
    matches = df[df['title'].str.contains(title, case=False, na=False)]
    
    if matches.empty:
        print(f"No matches found for '{title}'")
        return
    
    # If multiple matches, show them and use the first one
    if len(matches) > 1:
        print("Multiple matches found:")
        for i, t in enumerate(matches['title']):
            print(f"{i+1}. {t}")
        print(f"\nUsing: {matches.iloc[0]['title']}")
    
    # Get the index of the selected movie
    idx = matches.index[0]
    
    # Calculate cosine similarities
    cosine_similarities = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()
    
    # Get indices of movies ordered by similarity (excluding the input movie)
    similar_indices = cosine_similarities.argsort()[::-1][1:top_n+1]
    
    # Create recommendation DataFrame
    recommendations = pd.DataFrame({
        'Title': df.iloc[similar_indices]['title'],
        'Type': df.iloc[similar_indices]['type'].map({0: 'TV Show', 1: 'Movie'}),
        'Rating': df.iloc[similar_indices]['rating_grouped'],
        'Description': df.iloc[similar_indices]['description'],
        'Release Year': df.iloc[similar_indices]['release_year'],
        'Similarity Score': cosine_similarities[similar_indices]
    })
    
    # Create an interactive bar plot
    fig = px.bar(
        recommendations,
        x='Title',
        y='Similarity Score',
        color='Similarity Score',
        color_continuous_scale='Viridis',
        hover_data=['Type', 'Rating', 'Release Year'],
        title=f'Content-Based Recommendations for "{matches.iloc[0]["title"]}"'
    )
    
    fig.update_layout(
        xaxis_tickangle=45,
        showlegend=False,
        height=600
    )
    
    # Display results
    for _, row in recommendations.iterrows():
        print(f"\nTitle: {row['Title']}")
        print(f"Type: {row['Type']}")
        print(f"Rating: {row['Rating']}")
        print(f"Description: {row['Description']}")
        print(f"Release Year: {row['Release Year']}")
        print(f"Similarity Score: {row['Similarity Score']:.4f}")
        print(f"Similarity Score: {row['Similarity Score']:.4f}")
    
    fig.show()
    
    return recommendations

# Example usage
get_recommendations_tf("Breaking Bad", top_n=5)


Multiple matches found:
1. The Road to El Camino: Behind the Scenes of El Camino: A Breaking Bad Movie
2. El Camino: A Breaking Bad Movie
3. Breaking Bad

Using: The Road to El Camino: Behind the Scenes of El Camino: A Breaking Bad Movie

Title: El Camino: A Breaking Bad Movie
Type: Movie
Rating: Mature
Description: Fugitive Jesse Pinkman attempts to outrun his past. Written and directed by "Breaking Bad" creator Vince Gilligan, starring Aaron Paul.
Release Year: 2019
Similarity Score: 0.5048
Similarity Score: 0.5048

Title: El Viaje: Márama y Rombai
Type: Movie
Rating: TV-PG
Description: Popular South American bands Márama and Rombai share footage of behind-the-scenes antics and discuss their sudden rise to stardom.
Release Year: 2016
Similarity Score: 0.3312
Similarity Score: 0.3312

Title: El desconocido
Type: TV Show
Rating: Mature
Description: Based on real events, the fictional story of Mexican drug lord El Chato's number one hitman, El Cholo.
Release Year: 2019
Similarity Score:

Unnamed: 0,Title,Type,Rating,Description,Release Year,Similarity Score
3427,El Camino: A Breaking Bad Movie,Movie,Mature,Fugitive Jesse Pinkman attempts to outrun his ...,2019,0.504765
6674,El Viaje: Márama y Rombai,Movie,TV-PG,Popular South American bands Márama and Rombai...,2016,0.331191
3747,El desconocido,TV Show,Mature,"Based on real events, the fictional story of M...",2019,0.324437
4212,El Potro: Unstoppable,Movie,Mature,"Argentine cuarteto singer Rodrigo ""El Potro"" B...",2018,0.270595
6670,El Che,Movie,Mature,Spanish-Mexican writer Paco Ignacio Taibo II h...,2017,0.258756


In [None]:
%pip install transformers
%pip install torch
%pip install scikit-learn



In [83]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
import plotly.graph_objects as go
import os
import pickle

class MovieRecommender:
    def __init__(self, model_path='movie_recommender_model'):
        self.model_path = model_path
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.embeddings_matrix = None
        self.movies_df = None

        # Initialize BERT components
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertModel.from_pretrained('bert-base-uncased').to(self.device)
        self.model.eval()  # Set model to evaluation mode

    def get_bert_embeddings(self, texts):
        """Compute BERT embeddings for a batch of texts."""
        inputs = self.tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1)  # Use mean pooling of the last hidden state

        return embeddings.cpu().numpy()

    def create_embeddings_matrix(self, df):
        self.movies_df = df
        embeddings_list = []
        batch_size = 32

        for i in range(0, len(df), batch_size):
            batch_texts = [f"{row['title']} {row['description']} {row['listed_in']}" for _, row in df.iloc[i:i+batch_size].iterrows()]
            embeddings = self.get_bert_embeddings(batch_texts)
            embeddings_list.extend(embeddings)

        self.embeddings_matrix = np.array(embeddings_list)
        # Normalize the embeddings for better similarity comparison
        self.embeddings_matrix = normalize(self.embeddings_matrix, axis=1)

    def reduce_dimensionality(self, n_components=256):
        """Reduce dimensionality of embeddings using PCA."""
        pca = PCA(n_components=n_components)
        self.embeddings_matrix = pca.fit_transform(self.embeddings_matrix)

    def save_model(self):
        os.makedirs(self.model_path, exist_ok=True)
        np.save(os.path.join(self.model_path, 'embeddings_matrix.npy'), self.embeddings_matrix)
        self.movies_df.to_pickle(os.path.join(self.model_path, 'movies_df.pkl'))

    def load_model(self):
        try:
            self.embeddings_matrix = np.load(os.path.join(self.model_path, 'embeddings_matrix.npy'))
            self.movies_df = pd.read_pickle(os.path.join(self.model_path, 'movies_df.pkl'))
            return True
        except FileNotFoundError:
            return False

    def get_recommendations(self, title, top_n=5):
        idx = self.movies_df[self.movies_df['title'].str.contains(title, case=False, na=False)].index[0]
        cosine_similarities = cosine_similarity([self.embeddings_matrix[idx]], self.embeddings_matrix).flatten()

        similar_indices = cosine_similarities.argsort()[-top_n-1:-1][::-1]
        recommended_titles = self.movies_df.iloc[similar_indices]['title'].values
        similarity_scores = cosine_similarities[similar_indices]

        fig = go.Figure(data=[
            go.Bar(
                x=recommended_titles,
                y=similarity_scores,
                marker=dict(
                    color=similarity_scores,
                    colorscale='Viridis',
                    showscale=True,
                    colorbar=dict(title='Similarity Score')
                ),
                text=[f'{score:.3f}' for score in similarity_scores],
                textposition='auto',
            )
        ])

        fig.update_layout(
            title=f'Movie Recommendations Similar to "{title}"',
            xaxis_title='Movie Titles',
            yaxis_title='Similarity Score',
            showlegend=False,
            height=600,
            xaxis=dict(tickangle=45)
        )

        return recommended_titles, similarity_scores, fig

# Instantiate and use the updated recommender
recommender = MovieRecommender()

if not recommender.load_model():
    recommender.create_embeddings_matrix(df)
    recommender.reduce_dimensionality(n_components=256)
    recommender.save_model()

titles, scores, fig = recommender.get_recommendations("Breaking Bad", top_n=5)
fig.show()


In [131]:
# def get_recommendations(title, top_n=5):
#     # Get the index of the movie that matches the title
#     idx = df[df['title'].str.contains(title, case=False, na=False)].index[0]

#     # Compute the cosine similarity between the selected movie and all others
#     cosine_similarities = cosine_similarity([embeddings_matrix[idx]], embeddings_matrix).flatten()

#     # Get the indices of the most similar movies (excluding the input movie itself)
#     similar_indices = cosine_similarities.argsort()[-top_n-1:-1][::-1]

#     # Get the recommended titles and their similarity scores
#     recommended_titles = df.iloc[similar_indices]['title'].values
#     similarity_scores = cosine_similarities[similar_indices]
    
#     # Print recommendations
#     print(f"\nRecommendations for '{title}':")
#     for i, (title, score) in enumerate(zip(recommended_titles, similarity_scores), 1):
#         print(f"{i}. {title} (Similarity: {score:.4f})")
    
#     # Create a plotly bar chart
#     fig = go.Figure(data=[
#         go.Bar(
#             x=recommended_titles,
#             y=similarity_scores,
#             text=similarity_scores.round(4),
#             textposition='auto',
#         )
#     ])

#     fig.update_layout(
#         title=f'Content-Based Recommendations for "{title}"',
#         xaxis_title='Recommended Titles',
#         yaxis_title='Cosine Similarity Score',
#         xaxis_tickangle=-45,
#         height=500,
#         showlegend=False
#     )

#     fig.show()
# get_recommendations(title="The Matrix", top_n=5)


Recommendations for 'The Matrix':
1. Saving Capitalism (Similarity: 0.9263)
2. Prague (Similarity: 0.9170)
3. Melodies of Life - Born This Way (Similarity: 0.9151)
4. Anjelah Johnson: Not Fancy (Similarity: 0.9129)
5. What the Health (Similarity: 0.9126)
