**Load Packages and Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import os
from scipy.sparse import coo_matrix

**Data Loading and Preprocessing**


In [None]:
train_data = pd.read_csv('/content/netflix_titles.csv')
train_data.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [None]:
train_data.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [None]:
train_data.shape

(8807, 12)

In [None]:
train_data.isnull().sum()

Unnamed: 0,0
show_id,0
type,0
title,0
director,2634
cast,825
country,831
date_added,10
release_year,0
rating,4
duration,3


In [None]:
# Fill missing values in 'director' with a default value (e.g., 0)
train_data['director'].fillna('Unknown', inplace=True)
# Fill missing values in 'cast' with a default value (e.g., 0)
train_data['cast'].fillna('Unknown', inplace=True)
# Fill missing values in 'country' with an empty string
train_data['country'].fillna('Unknown', inplace=True)
# Fill missing values in 'date_added' with a default value (e.g., 'Unknown')
train_data['date_added'].fillna('Unknown', inplace=True)
# Fill missing values in 'rating' with a default value (e.g., 'Unknown')
train_data['rating'].fillna('Unknown', inplace=True)
# Fill missing values in 'duration' with an empty string
train_data['duration'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['country'].fillna('Unknown', inplace=True)


In [None]:
train_data.isnull().sum()

Unnamed: 0,0
show_id,0
type,0
title,0
director,0
cast,0
country,0
date_added,0
release_year,0
rating,0
duration,0


In [None]:
train_data.duplicated().sum()

np.int64(0)

**Data Cleaning and Tags Creations**

In [None]:
train_data['director'] = train_data['director'].fillna('Unknown')
train_data['cast'] = train_data['cast'].fillna('Unknown')
train_data['description'] = train_data['description'].fillna('')
train_data['title'] = train_data['title'].fillna('')

In [None]:
train_data[['director','title','description','cast']] = train_data[['director','title','description','cast']].astype(str)

In [None]:
import pandas as pd

# Columns to create tags from
columns_to_extract_tags_from = ['title', 'cast', 'description', 'director']

# Function to clean text and extract tags
def create_tags(text):
    if pd.isna(text):  # Handle NaN
        return ''
    text = str(text).lower()          # Convert to string and lowercase
    words = text.split()              # Split by spaces
    words = [w.strip('.,!?:;"()') for w in words]  # Remove punctuation
    stop_words = set([
        'the','and','a','an','in','of','for','to','with','on','by','at','from','as','is','it','its','this'
    ])
    tags = [w for w in words if w not in stop_words and len(w)>1]  # Remove stopwords and single letters
    return ', '.join(tags)

# Apply to all columns
for col in columns_to_extract_tags_from:
    train_data[col+'_tags'] = train_data[col].apply(create_tags)


In [None]:
train_data['Tags'] = train_data[columns_to_extract_tags_from].apply(lambda row: ', '.join(row), axis=1)

**Content Base Recommendation system (User Preferences or Items similarities)**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix_content = tfidf_vectorizer.fit_transform(train_data['Tags'])
cosine_similarities_content = cosine_similarity(tfidf_matrix_content,tfidf_matrix_content)

In [124]:
item_name = 'Kota Factory'
item_index = train_data[train_data['title'].str.contains(item_name.split()[0].lower(), na=False)].index[0]

In [125]:
similar_items = list(enumerate(cosine_similarities_content[item_index]))


In [126]:
similar_items = sorted(similar_items, key=lambda x:x[1], reverse=True)
top_similar_items = similar_items[1:10]

recommended_items_indics = [x[0] for x in top_similar_items]

In [127]:
train_data.iloc[recommended_items_indics][['title','director','rating','duration']]

Unnamed: 0,title,director,rating,duration
2353,"chaman, bahaar","apurva, dhar, badgaiyann",TV-MA,112 min
8775,"yeh, meri, family",,TV-PG,1 Season
3466,"girls, hostel",,TV-MA,1 Season
2721,duniya,"ramesh, talwar",TV-14,170 min
8124,"super, nani","indra, kumar",TV-PG,127 min
2472,betaal,,TV-MA,1 Season
8665,urvi,"pradeep, verma",TV-MA,121 min
5303,manoranjan,"shammi, kapoor",TV-14,162 min
7932,sangam,"raj, kapoor",TV-14,228 min


**Function To Recommend Products for Content Base**

In [128]:
def content_based_recommendations(train_data, item_name, top_n=10):
    # Check if the item name exists in the training data
    if item_name not in train_data['title'].values:
        print(f"Item '{item_name}' not found in the training data.")
        return pd.DataFrame()

    # Create a TF-IDF vectorizer for item descriptions
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')

    # Apply TF-IDF vectorization to item descriptions
    tfidf_matrix_content = tfidf_vectorizer.fit_transform(train_data['Tags'])

    # Calculate cosine similarity between items based on descriptions
    cosine_similarities_content = cosine_similarity(tfidf_matrix_content, tfidf_matrix_content)

    # Find the index of the item
    item_index = train_data[train_data['title'] == item_name].index[0]

    # Get the cosine similarity scores for the item
    similar_items = list(enumerate(cosine_similarities_content[item_index]))

    # Sort similar items by similarity score in descending order
    similar_items = sorted(similar_items, key=lambda x: x[1], reverse=True)

    # Get the top N most similar items (excluding the item itself)
    top_similar_items = similar_items[1:top_n+1]

    # Get the indices of the top similar items
    recommended_item_indices = [x[0] for x in top_similar_items]

    # Get the details of the top similar items
    recommended_items_details = train_data.iloc[recommended_item_indices][['Name', 'ReviewCount', 'Brand', 'ImageURL', 'Rating']]

    return recommended_items_details