In [14]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [2]:
# Load the raw drama table from a CSV file located next to the notebook.
korean_drama = pd.read_csv("./korean_drama.csv")

In [3]:
# Work on a copy so the frame as loaded stays available in `korean_drama`.
df = korean_drama.copy()

In [4]:
# Preview the first rows of the working frame.
df.head()

Unnamed: 0,kdrama_id,drama_name,year,director,screenwriter,country,type,tot_eps,duration,start_dt,end_dt,aired_on,org_net,content_rt,synopsis,rank,pop
0,661d4193916c4e71a2c70473ab11e9e8,Sing My Crush,2023,['So Joon Moon'],,South Korea,Drama,8,1500.0,"Aug 2, 2023","Aug 2, 2023",Wednesday,,Not Yet Rated,Follow the story of acquaintances Ba Ram and H...,1484,2238
1,5ffcbeaa17114714af1959129984274c,D.P. Season 2,2023,,['Kim Bo Tong'],South Korea,Drama,6,3000.0,"Jul 28, 2023","Jul 28, 2023",Friday,Netflix,15+ - Teens 15 or older,This unfolding story ensues when military dese...,164,1084
2,65075cb9c1a54be4a441cee6f16c9fdf,Shadow Detective Season 2,2023,['Han Dong Hwa'],"['Song Jung Woo', 'Hwang Seol Hun']",South Korea,Drama,8,3300.0,2023-07-05,2023-07-26,Wednesday,Disney+ Hulu,15+ - Teens 15 or older,Unfolds the ultimate counterattack of veteran ...,2443,6915
3,df0f0ac4b3ff4b15afa26f5a7a53a328,To Be Honest,2023,,,South Korea,Drama,3,600.0,2023-06-30,2023-07-14,Friday,,Not Yet Rated,Don't you have those days where the whole univ...,49895,99999
4,04c1fe41948e464fb440001831d74d41,Celebrity,2023,['Kim Chul Gyu'],['Kim Yi Young'],South Korea,Drama,12,2700.0,"Jun 30, 2023","Jun 30, 2023",Friday,Netflix,18+ Restricted (violence & profanity),Fame. Money. Power. One young woman fights to ...,826,547


In [5]:
# Check column dtypes and non-null counts to see which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1752 entries, 0 to 1751
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   kdrama_id     1752 non-null   object 
 1   drama_name    1752 non-null   object 
 2   year          1752 non-null   int64  
 3   director      1036 non-null   object 
 4   screenwriter  959 non-null    object 
 5   country       1752 non-null   object 
 6   type          1752 non-null   object 
 7   tot_eps       1752 non-null   int64  
 8   duration      1728 non-null   float64
 9   start_dt      1752 non-null   object 
 10  end_dt        1752 non-null   object 
 11  aired_on      1520 non-null   object 
 12  org_net       1344 non-null   object 
 13  content_rt    1752 non-null   object 
 14  synopsis      1584 non-null   object 
 15  rank          1752 non-null   int64  
 16  pop           1752 non-null   int64  
dtypes: float64(1), int64(4), object(12)
memory usage: 232.8+ KB


In [6]:
# Fill missing `duration` values with the column mean rounded to a whole number.
# The mean is taken from the Series so it is already a scalar; calling int() on
# a one-element Series (the result of df[["duration"]].mean()) is deprecated in
# pandas and emits a FutureWarning.
duration_fill = int(round(df["duration"].mean()))
impute = SimpleImputer(
    missing_values=np.nan,
    strategy="constant",
    fill_value=duration_fill,
)
# Assign the flattened result by position so the values stay matched to df's
# rows whatever df's index happens to be.
df["duration"] = np.asarray(impute.fit_transform(df[["duration"]])).ravel()

  fill_value = int(round(df[["duration"]].mean()))


In [7]:
# Keep only the dramas that have a synopsis; it is part of the text used for
# the TF-IDF features built further down.
df = df.dropna(subset=["synopsis"])

In [8]:
# Remove the identifier, credit and broadcast columns; none of them is used as
# a feature in the cells below.
unused_columns = [
    "kdrama_id",
    "director",
    "screenwriter",
    "aired_on",
    "start_dt",
    "end_dt",
    "org_net",
]
df = df.drop(columns=unused_columns)

In [16]:
# Replace each content-rating string with an integer code so the column can be
# scaled together with the other numeric features.
encoder = LabelEncoder()
encoder.fit(df["content_rt"])
df["content_rt"] = encoder.transform(df["content_rt"])

In [18]:
# Re-check the frame after filtering, dropping columns and encoding content_rt.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1584 entries, 0 to 1751
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   drama_name  1584 non-null   object 
 1   year        1584 non-null   int64  
 2   country     1584 non-null   object 
 3   type        1584 non-null   object 
 4   tot_eps     1584 non-null   int64  
 5   duration    1584 non-null   float64
 6   content_rt  1584 non-null   int32  
 7   synopsis    1584 non-null   object 
 8   rank        1584 non-null   int64  
 9   pop         1584 non-null   int64  
dtypes: float64(1), int32(1), int64(4), object(4)
memory usage: 129.9+ KB


In [19]:
# Show a single row to confirm which columns remain.
df.head(1)

Unnamed: 0,drama_name,year,country,type,tot_eps,duration,content_rt,synopsis,rank,pop
0,Sing My Crush,2023,South Korea,Drama,8,1500.0,4,Follow the story of acquaintances Ba Ram and H...,1484,2238


In [20]:
# Build one space-separated text field per drama from its name, country, type
# and synopsis; this is the text handed to the TF-IDF vectorizer.
text_parts = [df["country"], df["type"], df["synopsis"]]
df["MetaData"] = df["drama_name"].str.cat(text_parts, sep=" ")

In [21]:
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')

In [22]:
# Learn the vocabulary from the MetaData text and vectorize every row of df.
tfidf_matrix = vectorizer.fit_transform(df["MetaData"])

In [23]:
# Confirm that the MetaData column was added and has no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1584 entries, 0 to 1751
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   drama_name  1584 non-null   object 
 1   year        1584 non-null   int64  
 2   country     1584 non-null   object 
 3   type        1584 non-null   object 
 4   tot_eps     1584 non-null   int64  
 5   duration    1584 non-null   float64
 6   content_rt  1584 non-null   int32  
 7   synopsis    1584 non-null   object 
 8   rank        1584 non-null   int64  
 9   pop         1584 non-null   int64  
 10  MetaData    1584 non-null   object 
dtypes: float64(1), int32(1), int64(4), object(5)
memory usage: 142.3+ KB


In [24]:
# Min-max scale the numeric columns so they are on a common range before being
# combined with the TF-IDF features.
numeric_cols = ["year", "tot_eps", "duration", "content_rt", "rank", "pop"]
scaler = MinMaxScaler()
scaled_feat = scaler.fit_transform(df[numeric_cols])

In [25]:
# Place the densified TF-IDF columns and the scaled numeric columns side by
# side (one row per drama), then compute the cosine similarity of every pair
# of rows.
combined_features = np.hstack((tfidf_matrix.toarray(), scaled_feat))
cosine_sim = cosine_similarity(combined_features)

In [26]:
# Label both axes with the drama names so a similarity can be looked up by title.
titles = df["drama_name"]
cosine_sim_df = pd.DataFrame(data=cosine_sim, index=titles, columns=titles)

In [27]:
def recommend_movies(movie_title, cosine_sim_df, n_recommendations=2):
    """Return the titles most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Column label of ``cosine_sim_df`` to look up.
    cosine_sim_df : pandas.DataFrame
        Similarity matrix whose index and columns are titles.
    n_recommendations : int, default 2
        Maximum number of titles to return.

    Returns
    -------
    list
        Up to ``n_recommendations`` index labels ordered from most to least
        similar. ``movie_title`` itself is never part of the result.

    Raises
    ------
    KeyError
        If ``movie_title`` is not a column of ``cosine_sim_df``.
    """
    if movie_title not in cosine_sim_df.columns:
        raise KeyError(f"{movie_title!r} is not a title in the similarity matrix")

    scores = cosine_sim_df[movie_title]
    if scores.ndim == 2:
        # Several columns share this title, so the lookup returned a
        # DataFrame; use the first matching column as the score vector.
        scores = scores.iloc[:, 0]

    # Remove the query title by label instead of assuming it sorts first: a
    # tie at the top or floating-point rounding can place another title ahead
    # of it, which would leak the query into the results.
    scores = scores.drop(labels=movie_title, errors="ignore")

    top_matches = scores.sort_values(ascending=False).head(max(n_recommendations, 0))
    return list(top_matches.index)

# Ask for a title on the console; input() blocks until something is typed.
# The text is used as-is to select a column of cosine_sim_df, so it has to
# match a drama name exactly.
movie_to_recommend = input("Choose a series: ")
recommend_movies(movie_to_recommend, cosine_sim_df)

['Romance by Romance', 'Love Mate']

In [28]:
# Save the labelled similarity matrix to CSV; index=True writes the drama
# names as the first column of the file.
output_file = "cosine_similarity_matrix.csv"
cosine_sim_df.to_csv(output_file, index=True)

In [None]:
# Read the exported matrix back to inspect it. No index_col is given, so the
# saved drama_name index is loaded as an ordinary column next to a new
# default integer index.
pd.read_csv("./cosine_similarity_matrix.csv")

Unnamed: 0,drama_name,Sing My Crush,D.P. Season 2,Shadow Detective Season 2,To Be Honest,Celebrity,Blue Temperature,Numbers,Revenant,Adult Kids,...,Shine or Go Crazy,My Heart Twinkle Twinkle,Spy,"Persevere, Goo Hae Ra",Heart to Heart,"Kill Me, Heal Me",Enchanting Neighbor,Iron Lady Cha,The Family Is Coming,Crow Building
0,Sing My Crush,1.000000,0.518831,0.508682,0.522614,0.561525,0.600822,0.512217,0.504721,0.601387,...,0.130739,0.322699,0.322947,0.323274,0.138445,0.129238,0.107177,0.113658,0.137149,0.083558
1,D.P. Season 2,0.518831,1.000000,0.611026,0.413806,0.538211,0.476527,0.552275,0.544424,0.478493,...,0.113102,0.161916,0.139126,0.160796,0.111030,0.113530,0.070202,0.073255,0.127972,0.051362
2,Shadow Detective Season 2,0.508682,0.611026,1.000000,0.436055,0.540176,0.481022,0.542718,0.545749,0.483780,...,0.123567,0.165939,0.146044,0.169773,0.115702,0.123110,0.087095,0.092217,0.118046,0.085811
3,To Be Honest,0.522614,0.413806,0.436055,1.000000,0.457819,0.606911,0.415646,0.404863,0.596110,...,0.103155,0.277040,0.278937,0.267986,0.096999,0.091450,0.192976,0.225469,0.124264,0.483179
4,Celebrity,0.561525,0.538211,0.540176,0.457819,1.000000,0.526286,0.540660,0.557427,0.532692,...,0.137212,0.226956,0.213871,0.229496,0.140236,0.127601,0.101434,0.117499,0.124951,0.064716
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1579,"Kill Me, Heal Me",0.129238,0.113530,0.123110,0.091450,0.127601,0.105181,0.132997,0.140801,0.105499,...,0.190725,0.261646,0.210765,0.256414,0.193372,1.000000,0.155088,0.170424,0.204213,0.076977
1580,Enchanting Neighbor,0.107177,0.070202,0.087095,0.192976,0.101434,0.209713,0.106576,0.096818,0.192284,...,0.182362,0.253702,0.232890,0.190074,0.177250,0.155088,1.000000,0.494111,0.177747,0.299308
1581,Iron Lady Cha,0.113658,0.073255,0.092217,0.225469,0.117499,0.206785,0.115359,0.108003,0.229569,...,0.192200,0.244716,0.213765,0.197441,0.211977,0.170424,0.494111,1.000000,0.217271,0.352716
1582,The Family Is Coming,0.137149,0.127972,0.118046,0.124264,0.124951,0.126346,0.125105,0.147527,0.126613,...,0.186446,0.256913,0.222421,0.242377,0.178326,0.204213,0.177747,0.217271,1.000000,0.152222
