In [201]:
from sklearn.feature_extraction.text import TfidfVectorizer # vectorizes the data
from sklearn.metrics.pairwise import cosine_similarity # finds similarity between vectors

import pandas as pd # dataframe library

filename = r"C:\Users\John Kim\Desktop\kdrama_data.csv"

df = pd.read_csv(filename)

df = df[['title', 'description', 'keywords', 'genres', 'actors', 'director', 'screenwriter']]

def fill_na():
    df.replace("N/A", "")
    for label in df.columns:
        df[label] = df[label].fillna('') # fills N/A values with ""

fill_na()
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['keywords'] + " " + df['genres']
 + " " + df['actors'] + " " + df['director'] + " " + df['screenwriter'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='last')]

def search_kdrama(kdrama_name, kdrama_indices):
    return kdrama_indices[kdrama_indices.index.str.contains(kdrama_name, na=False)]

def get_recommended_kdramas(target_kdrama_index, kdrama_similarities, kdramas_df):
    similarity_scores = pd.DataFrame(kdrama_similarities[target_kdrama_index], columns=["score"])
    kdrama_indices = similarity_scores.sort_values("score", ascending=False)[1:11].index
    return pd.concat([kdramas_df['title'].iloc[kdrama_indices], similarity_scores.iloc[kdrama_indices]], axis = 1)

get_recommended_kdramas(1, cosine_sim, df)
search_kdrama("Move", indices)


title
Move to Heaven                   0
Romantic Movement In Seoul    2503
dtype: int64

In [20]:
tfidf = TfidfVectorizer(stop_words='english')
df['Synopsis'] = df['Synopsis'].fillna('') # null synposis = ""

In [21]:
tfidf_matrix = tfidf.fit_transform(df['Synopsis'])
tfidf_matrix.shape # (100, 1853) means 100 movies, 1853 unique words

(100, 1853)

In [26]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix) # calculates similarity between each drama (?)
cosine_sim[0][1] # compares similarity between number 1 movie with number 2 movie

array([1.        , 0.00672245, 0.01857485, 0.        , 0.04389825,
       0.04899857, 0.        , 0.0051677 , 0.00652825, 0.        ,
       0.02324584, 0.        , 0.00660733, 0.02052612, 0.        ,
       0.        , 0.00637876, 0.00541432, 0.01489357, 0.02412528,
       0.02917888, 0.        , 0.        , 0.01836815, 0.01931049,
       0.        , 0.        , 0.00936167, 0.02332473, 0.01842789,
       0.        , 0.        , 0.        , 0.01726747, 0.01194718,
       0.01797403, 0.02007764, 0.00642033, 0.01068586, 0.        ,
       0.0235464 , 0.05099411, 0.        , 0.00723527, 0.        ,
       0.03692231, 0.18003975, 0.01240955, 0.02886854, 0.0282077 ,
       0.01657028, 0.08602724, 0.        , 0.        , 0.        ,
       0.01538312, 0.        , 0.        , 0.01919222, 0.01252548,
       0.        , 0.01655885, 0.01152103, 0.        , 0.        ,
       0.03381299, 0.01628826, 0.00475315, 0.0258285 , 0.        ,
       0.02381831, 0.        , 0.        , 0.01617681, 0.     

In [200]:
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='last')]
indices

title
Move to Heaven            0
Hospital Playlist         1
Flower of Evil            2
Hospital Playlist 2       3
My Mister                 4
                       ... 
Never Give Up          3027
Doctor Lawyer          3028
My Rocket Ship         3029
Gold Mask              3030
Master's Delicacies    3031
Length: 3032, dtype: int64

In [42]:
target_kdrama_index = indices['Move to Heaven'] # lets us get its indices
target_kdrama_index

0

In [43]:
similarity_scores = pd.DataFrame(cosine_sim[target_kdrama_index], columns = ["score"])
similarity_scores

Unnamed: 0,score
0,1.000000
1,0.006722
2,0.018575
3,0.000000
4,0.043898
...,...
95,0.000000
96,0.000000
97,0.008652
98,0.057319


In [45]:
kdrama_indices = similarity_scores.sort_values("score", ascending=False)[0:11].index
df['Name'].iloc[kdrama_indices]

0                    Move to Heaven
46                          Misaeng
82                          Save Me
51                 The Fiery Priest
98    Because This Is My First Life
41          While You Were Sleeping
5                   Prison Playbook
78                       Squid Game
4                         My Mister
93                  Beautiful World
45        Arthdal Chronicles Part 2
Name: Name, dtype: object

In [191]:
def search_kdrama(kdrama_name, kdrama_indices):
    return kdrama_indices[kdrama_indices.index.str.contains(kdrama_name, na=False)]

search_kdrama('to', indices)

title
Move to Heaven                           0
It's Okay to Not Be Okay                 9
Hometown Cha-Cha-Cha                    28
Hot Stove League                        59
Ghost Doctor                            67
                                      ... 
Six Love Story                        1541
My Love: Six Stories of True Love     1566
Hometown Over the Hill 2              1573
Pluto Squad                           1596
The Sensible Life of Director Shin    1597
Length: 97, dtype: int64

In [50]:
def get_recommended_kdramas(target_kdrama_index, kdrama_similarities, kdramas_df):
    similarity_scores = pd.DataFrame(kdrama_similarities[target_kdrama_index], columns=["score"])
    kdrama_indices = similarity_scores.sort_values("score", ascending=False)[0:11].index
    return kdramas_df['Name'].iloc[kdrama_indices]

get_recommended_kdramas(1, cosine_sim, df)

1                   Hospital Playlist
3                 Hospital Playlist 2
43                     Dr. Romantic 2
10                             Signal
66                 Children of Nobody
26    Weightlifting Fairy Kim Bok Joo
96                        Doctor John
6                          Reply 1988
53             Descendants of the Sun
18                             Healer
42                    Dear My Friends
Name: Name, dtype: object