In [4]:
import pandas as pd 


df = pd.read_csv("netflix_content.csv")

df.head()

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Language Indicator,Content Type
0,The Night Agent: Season 1,Yes,2023-03-23,812100000,English,Show
1,Ginny & Georgia: Season 2,Yes,2023-01-05,665100000,English,Show
2,The Glory: Season 1 // 더 글로리: 시즌 1,Yes,2022-12-30,622800000,Korean,Show
3,Wednesday: Season 1,Yes,2022-11-23,507700000,English,Show
4,Queen Charlotte: A Bridgerton Story,Yes,2023-05-04,503000000,English,Movie


In [5]:
import pandas as pd


df['Hours Viewed'] = df['Hours Viewed'].astype(str).str.replace(',', '', regex=False).astype('int64')


df.dropna(subset=['Title'], inplace=True)
df.drop_duplicates(subset=['Title'], inplace=True)


df = df.reset_index(drop=True)
df['Content_ID'] = df.index.astype('int32')


df['Language_ID'] = df['Language Indicator'].astype('category').cat.codes.astype('int32')
df['ContentType_ID'] = df['Content Type'].astype('category').cat.codes.astype('int32')


df[['Content_ID', 'Title', 'Hours Viewed', 'Language_ID', 'ContentType_ID']].head()

Unnamed: 0,Content_ID,Title,Hours Viewed,Language_ID,ContentType_ID
0,0,The Night Agent: Season 1,812100000,0,1
1,1,Ginny & Georgia: Season 2,665100000,0,1
2,2,The Glory: Season 1 // 더 글로리: 시즌 1,622800000,3,1
3,3,Wednesday: Season 1,507700000,0,1
4,4,Queen Charlotte: A Bridgerton Story,503000000,0,0


In [13]:
import tensorflow as tf 
from tensorflow.keras import layers, Model 

num_contents = df['Content_ID'].nunique()
num_languages = df['Language_ID'].nunique()
num_types = df['ContentType_ID'].nunique()


content_input = layers.Input(shape=(1,), dtype=tf.int32, name='contentid')
language_input = layers.Input(shape=(1,), dtype=tf.int32, name='languageid')
type_input = layers.Input(shape=(1,), dtype=tf.int32, name='content_type')


content_embedding = layers.Embedding(input_dim=num_contents+1, output_dim=32)(content_input)
language_embedding = layers.Embedding(input_dim=num_languages+1, output_dim=8)(language_input)
type_embedding = layers.Embedding(input_dim=num_types+1, output_dim=16)(type_input)  # FIXED: correct input and input_dim


content_vec = layers.Flatten()(content_embedding)
language_vec = layers.Flatten()(language_embedding)
type_vec = layers.Flatten()(type_embedding)


combined = layers.Concatenate()([content_vec, language_vec, type_vec])
x = layers.Dense(64, activation='relu')(combined)
x = layers.Dropout(0.3)(x) 
x = layers.Dense(32, activation='relu')(x)
x = layers.Dropout(0.2)(x)  
output = layers.Dense(num_contents, activation='softmax')(x)

model = Model(inputs=[content_input, language_input, type_input], outputs=output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [14]:
model.fit(
    x={
        'contentid': df['Content_ID'],       
        'languageid': df['Language_ID'],      
        'content_type': df['ContentType_ID']  
    },
    y=df['Content_ID'],
    epochs=5,
    batch_size=64
)

Epoch 1/5
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 0.0000e+00 - loss: 9.9127
Epoch 2/5
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.0000e+00 - loss: 9.8702
Epoch 3/5
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 2.6099e-04 - loss: 9.7663
Epoch 4/5
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.0013 - loss: 9.1684
Epoch 5/5
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.0041 - loss: 8.3387


<keras.src.callbacks.history.History at 0x25987d0d810>

In [21]:
import numpy as np

def recommend_similar(content_title, top_k=5):
    # Try exact match first
    matching_content = df[df['Title'] == content_title]
    

    if matching_content.empty:
        matching_content = df[df['Title'].str.contains(content_title, case=False, na=False)]
    
    if matching_content.empty:
        print(f"Content '{content_title}' not found in the dataset.")
        print("\nAvailable titles (first 10):")
        print(df['Title'].head(10).tolist())
        return None
    

    if len(matching_content) > 1:
        print(f"Multiple matches found. Using: '{matching_content.iloc[0]['Title']}'")
    
    content_row = matching_content.iloc[0]
    
    content_id = content_row['Content_ID']
    language_id = content_row['Language_ID']
    content_type_id = content_row['ContentType_ID']
    
    print(f"\nGetting recommendations for: {content_row['Title']}")
    

    predictions = model.predict({
        'contentid': np.array([content_id]),
        'languageid': np.array([language_id]),
        'content_type': np.array([content_type_id])
    }, verbose=0)
    

    top_content_ids = predictions[0].argsort()[-top_k-1:][::-1]
    

    top_content_ids = [cid for cid in top_content_ids if cid != content_id][:top_k]
    

    recommendations = df[df['Content_ID'].isin(top_content_ids)]
    

    recommendations['prediction_score'] = recommendations['Content_ID'].map(
        lambda x: predictions[0][x]
    )
    recommendations = recommendations.sort_values('prediction_score', ascending=False)
    
    return recommendations[['Title', 'Language Indicator', 'Content Type', 'Hours Viewed']]


recommend_similar("Wednesday")  


Multiple matches found. Using: 'Wednesday: Season 1'

Getting recommendations for: Wednesday: Season 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendations['prediction_score'] = recommendations['Content_ID'].map(


Unnamed: 0,Title,Language Indicator,Content Type,Hours Viewed
7539,Upin&Ipin: Season 6,English,Show,1200000
6681,Come Home: Season 1,English,Show,1500000
2555,Brainchild: Season 1,English,Show,7700000
6798,Sugar Rush Christmas: Season 2,English,Show,1500000
2848,Hitler's Circle of Evil: Season 1,English,Show,6700000
