In [67]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras import Model

In [68]:
data = pd.read_csv("netflix_content.csv")
print(data.head())

                                 Title Available Globally? Release Date  \
0            The Night Agent: Season 1                 Yes   2023-03-23   
1            Ginny & Georgia: Season 2                 Yes   2023-01-05   
2   The Glory: Season 1 // 더 글로리: 시즌 1                 Yes   2022-12-30   
3                  Wednesday: Season 1                 Yes   2022-11-23   
4  Queen Charlotte: A Bridgerton Story                 Yes   2023-05-04   

   Hours Viewed Language Indicator Content Type  
0  81,21,00,000            English         Show  
1  66,51,00,000            English         Show  
2  62,28,00,000             Korean         Show  
3  50,77,00,000            English         Show  
4  50,30,00,000            English        Movie  


In [69]:
data["Hours Viewed"] = data['Hours Viewed'].str.replace(',','', regex=False).astype('int64')

In [70]:
print(data.head())

                                 Title Available Globally? Release Date  \
0            The Night Agent: Season 1                 Yes   2023-03-23   
1            Ginny & Georgia: Season 2                 Yes   2023-01-05   
2   The Glory: Season 1 // 더 글로리: 시즌 1                 Yes   2022-12-30   
3                  Wednesday: Season 1                 Yes   2022-11-23   
4  Queen Charlotte: A Bridgerton Story                 Yes   2023-05-04   

   Hours Viewed Language Indicator Content Type  
0     812100000            English         Show  
1     665100000            English         Show  
2     622800000             Korean         Show  
3     507700000            English         Show  
4     503000000            English        Movie  


In [71]:
data.isnull().sum()

Title                      0
Available Globally?        0
Release Date           16646
Hours Viewed               0
Language Indicator         0
Content Type               0
dtype: int64

In [72]:
data.duplicated().sum()

np.int64(467)

In [73]:
data.drop_duplicates(subset=['Title'], inplace=True)

In [74]:
data['Content_ID'] = data.reset_index().index.astype('int32')

In [75]:
print(data.head())

                                 Title Available Globally? Release Date  \
0            The Night Agent: Season 1                 Yes   2023-03-23   
1            Ginny & Georgia: Season 2                 Yes   2023-01-05   
2   The Glory: Season 1 // 더 글로리: 시즌 1                 Yes   2022-12-30   
3                  Wednesday: Season 1                 Yes   2022-11-23   
4  Queen Charlotte: A Bridgerton Story                 Yes   2023-05-04   

   Hours Viewed Language Indicator Content Type  Content_ID  
0     812100000            English         Show           0  
1     665100000            English         Show           1  
2     622800000             Korean         Show           2  
3     507700000            English         Show           3  
4     503000000            English        Movie           4  


In [76]:
data.head()

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Language Indicator,Content Type,Content_ID
0,The Night Agent: Season 1,Yes,2023-03-23,812100000,English,Show,0
1,Ginny & Georgia: Season 2,Yes,2023-01-05,665100000,English,Show,1
2,The Glory: Season 1 // 더 글로리: 시즌 1,Yes,2022-12-30,622800000,Korean,Show,2
3,Wednesday: Season 1,Yes,2022-11-23,507700000,English,Show,3
4,Queen Charlotte: A Bridgerton Story,Yes,2023-05-04,503000000,English,Movie,4


In [77]:
Le = LabelEncoder()
data['Language_ID'] = Le.fit_transform(data['Language Indicator'])
data['type_ID'] = Le.fit_transform(data['Content Type'])

In [78]:
data.head()

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Language Indicator,Content Type,Content_ID,Language_ID,type_ID
0,The Night Agent: Season 1,Yes,2023-03-23,812100000,English,Show,0,0,1
1,Ginny & Georgia: Season 2,Yes,2023-01-05,665100000,English,Show,1,0,1
2,The Glory: Season 1 // 더 글로리: 시즌 1,Yes,2022-12-30,622800000,Korean,Show,2,3,1
3,Wednesday: Season 1,Yes,2022-11-23,507700000,English,Show,3,0,1
4,Queen Charlotte: A Bridgerton Story,Yes,2023-05-04,503000000,English,Movie,4,0,0


In [79]:
num_contents = data['Content_ID'].nunique()
num_languages = data['Language_ID'].nunique()
num_types = data['type_ID'].nunique()

content_input = tf.keras.layers.Input(shape=(1,), dtype = tf.int32, name='content_id')
language_input = tf.keras.layers.Input(shape=(1,), dtype = tf.int32, name='language_id')
type_input = tf.keras.layers.Input(shape=(1,), dtype=tf.int32, name='type_input')

content_emb = tf.keras.layers.Embedding(input_dim=num_contents+1, output_dim=32)(content_input)
language_emb = tf.keras.layers.Embedding(input_dim=num_languages+1, output_dim=32)(language_input)
type_emb = tf.keras.layers.Embedding(input_dim=num_types+1, output_dim=32)(type_input)

content_vec = tf.keras.layers.Flatten()(content_emb)
language_vec = tf.keras.layers.Flatten()(language_emb)
type_vec = tf.keras.layers.Flatten()(type_emb)

combined = tf.keras.layers.Concatenate()([content_vec, language_vec, type_vec])

x = tf.keras.layers.Dense(64, activation='relu')(combined)
x = tf.keras.layers.Dense(32, activation='relu')(x)
output = tf.keras.layers.Dense(num_contents, activation='softmax')(x)

model = Model(inputs=[content_input, language_input, type_input], outputs=[output])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [80]:
history = model.fit(
    x={
        'content_id': data['Content_ID'],
        'language_id': data['Language_ID'],
        'type_input': data['type_ID']
    },
    y=data['Content_ID'],
    epochs=5,
    batch_size=64
)

Epoch 1/5
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 31ms/step - accuracy: 0.0000e+00 - loss: 9.8788
Epoch 2/5
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 31ms/step - accuracy: 0.0000e+00 - loss: 9.8643
Epoch 3/5
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 31ms/step - accuracy: 3.1204e-04 - loss: 9.5545
Epoch 4/5
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 29ms/step - accuracy: 3.2021e-04 - loss: 9.0601
Epoch 5/5
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 31ms/step - accuracy: 0.0024 - loss: 8.2722


In [86]:
import numpy as np 
def recommend_similar(content_title, top_k=5):
    content_row = data[data['Title'].str.contains(content_title, case=False, na=False)].iloc[0]
    content_id = content_row['Content_ID']
    language_id = content_row['Language_ID']
    type_id = content_row['type_ID']

    predictions = model.predict({
        'content_id': np.array([content_id]),
        'language_id': np.array([language_id]),
        'type_input': np.array([type_id])
    })


    top_indices = predictions[0].argsort()[-top_k-1:][::-1]
    recommendations = data[data['Content_ID'].isin(top_indices)]
    return recommendations[['Title', 'Language Indicator', 'Content Type', 'Hours Viewed']]

recommend_similar("The Night Agent: Season 1")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step


Unnamed: 0,Title,Language Indicator,Content Type,Hours Viewed
0,The Night Agent: Season 1,English,Show,812100000
1095,Jurassic World Camp Cretaceous: Season 2,English,Show,19600000
3670,13 Commandments: Season 1 // 13 Geboden: Seizo...,English,Show,4800000
6504,Chico Bon Bon: Monkey with a Tool Belt: Season 2,English,Show,1600000
9755,The Adventures of Tintin: Season 2 // Les aven...,English,Show,600000
13265,Jurassic World: Double Trouble: Season 1,English,Show,200000
