In [2]:
import pandas as pd  # dataframe library
from sklearn.feature_extraction.text import \
    TfidfVectorizer  # vectorizes the data
from sklearn.metrics.pairwise import \
    cosine_similarity  # finds similarity between vectors

# filename = 'csv\kdrama_data.csv'
filename = r"C:\Users\John Kim\Desktop\kdrama_data.csv"

label_weights = {
        "keywords": 0.4,    
        "genres": 0.3,
        "actors": 0.2,
        "director": 0.05,
        "screenwriter": 0.05,
    }

df = pd.read_csv(filename)
df = df[['title', 'description', 'keywords', 'genres', 'actors', 'director', 'screenwriter']]



In [2]:
import pandas as pd  # dataframe library
from sklearn.feature_extraction.text import \
    TfidfVectorizer  # vectorizes the data
from sklearn.metrics.pairwise import \
    cosine_similarity  # finds similarity between vectors

# filename = 'csv\kdrama_data.csv'
filename = r"C:\Users\John Kim\Desktop\kdrama_data.csv"

label_weights = {
        "keywords": 0.4,    
        "genres": 0.3,
        "actors": 0.2,
        "director": 0.05,
        "screenwriter": 0.05,
    }

df = pd.read_csv(filename)
df = df[['title', 'description', 'keywords', 'genres', 'actors', 'director', 'screenwriter']]

def get_titles():
    np_titles = df['title'].to_numpy()
    title_list = np_titles.tolist()
    return title_list

def fill_na():
    """replaces na values with an empty string"""
    df.replace("N/A", "")
    for label in df.columns:
        df[label] = df[label].fillna('') # fills N/A values with ""

def get_indices():
    indices = pd.Series(df.index, index=df['title'])
    return indices[~indices.index.duplicated(keep='last')]

def og_cos_sim():
    """the similarity scores used to get the initial top x kdramas"""
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(df['keywords'] + " " + df['genres']
    + " " + df['actors'] + " " + df['director'] + " " + df['screenwriter'])
    return cosine_similarity(tfidf_matrix, tfidf_matrix)

def search_kdrama(kdrama_name):
    """searches for kdrama with matching name and returns top result"""
    # return get_indices()[get_indices().index.str.contains(kdrama_name, regex=False, na=False)][0]
    return get_indices()[get_indices().index.str.contains(kdrama_name.lower(), case=False, regex=False, na=False)][0]

def get_recommended_kdramas(target_kdrama_index, kdrama_similarities, kdramas_df, rec_num):
    """returns the top (rec_num) recommended kdramas based on keywords, genres, actors, director, director
    and screenwriter (we recalculate their similarity score by using our own 'weights' :omg:)"""
    if rec_num <= 1:
        # return no kdramas
        return False

    # should set a max on how many recommended kdrama you can get (maybe like 25 or 50?)
    similarity_scores = pd.DataFrame(kdrama_similarities[target_kdrama_index], columns=["score"])
    kdrama_indices = similarity_scores.sort_values("score", ascending=False)[1:rec_num].index # gets top 10 (we can change this)
    return kdramas_df['title'].iloc[kdrama_indices].values # converts to array

def get_score(og_name, rec_name, sim):
    """gets the similarity score of a kdrama based on ONE aspect (e.g. only keywords)"""
    rec_index = search_kdrama(rec_name)
    scores = sim[search_kdrama(og_name)]
    return scores[rec_index]

def vectorize_kdrama(col_name):
    """vectorizes the kdrama based on ONE aspect (e.g. only keywords)"""
    tfidf = TfidfVectorizer(stop_words='english')
    return tfidf.fit_transform(df[col_name])

def find_similarity(matrix):
    """finds the similarity between this matrix's kdrama and everything else"""
    return cosine_similarity(matrix, matrix)

def create_similarity_data(name):
    """creates a dictionary of arrays with the top 10 similar kdramas"""
    similarity_data = {
        "titles": [],
        "keywords": [],
        "genres": [],
        "actors": [],
        "director": [],
        "screenwriter": [],
    }

    target_index = search_kdrama(name)
    top_ten = get_recommended_kdramas(target_index, og_cos_sim(), df, 10)

    for kdrama in top_ten:
        # adds this title to dictionary
        similarity_data["titles"].append(kdrama)

        for label in label_weights.keys():
            vec = vectorize_kdrama(label)
            sim = find_similarity(vec)
            label_score = get_score(name, kdrama, sim) * label_weights[label]
            # print(label_score)
            similarity_data[label].append(label_score)
    
    return similarity_data

def get_top_rec_kdrama(name):
    """reorders the top recommended kdramas and converts to a dataframe"""
    fill_na()
    data = create_similarity_data(name)
    new_df = pd.DataFrame(data)
    new_df['sim_score'] = new_df.sum(axis=1, numeric_only=True)
    new_df = new_df.sort_values("sim_score", ascending=False)
    # print(new_df)
    kdrama_list = new_df['sim_score'].values.tolist()
    # kdrama_list = new_df['titles'].values.tolist()
    return new_df['sim_score']

def get_names(name):
    """reorders the top recommended kdramas and converts to a dataframe"""
    fill_na()
    data = create_similarity_data(name)
    new_df = pd.DataFrame(data)
    new_df['sim_score'] = new_df.sum(axis=1, numeric_only=True)
    new_df = new_df.sort_values("sim_score", ascending=False)
    # print(new_df)
    kdrama_list = new_df['sim_score'].values.tolist()
    return new_df['titles'].values.tolist()
# get_top_rec_kdrama("Move to Heaven")
# search_kdrama("heaven")
kdrama_list = get_names("Move to Heaven")
sim_scores = get_top_rec_kdrama("Move to Heaven")

In [34]:
print(kdrama_list)

['3 Leaf Clover', 'Navillera', 'Good Doctor', 'Rickety Rackety Family', 'The Light in Your Eyes', 'Uncle', 'Panda and Hedgehog', 'Pluto Squad', 'Air City']


In [17]:
sim_scores = sim_scores.reset_index(drop=True)
sim_scores

0    373212.903694
1    339218.961970
2    307060.514558
3    273950.194431
4    273102.298384
5    243545.499227
6    226662.965989
7     87092.374018
8     85393.740714
Name: sim_score, dtype: float64

In [27]:
# sim_scores.loc[:,] *= 100
sim_scores = sim_scores.round(decimals = 1)
sim_scores


0    37.3
1    33.9
2    30.7
3    27.4
4    27.3
5    24.4
6    22.7
7     8.7
8     8.5
Name: sim_score, dtype: float64

In [81]:
df = pd.read_csv(filename)
df = df[['link', 'title', 'rank', 'score']]

# full_df = pd.DataFrame()
for kdrama in kdrama_list:
    if kdrama == kdrama_list[0]:
        full_df = df.loc[df['title'] == kdrama]
        continue
    row = df.loc[df['title'] == kdrama]
    full_df = pd.concat([full_df, row], ignore_index=True)


merged_df = pd.concat([full_df, sim_scores], axis=1, ignore_index=True)
merged_df.columns = ['link', 'title', 'rank', 'score', 'sim_score']
merged_df = merged_df.sort_values("rank", ascending=True)
merged_df = merged_df.reset_index(drop=True)

dicti = merged_df.to_dict()
# print(dicti)
print(merged_df)

                                                link                   title  \
0            https://mydramalist.com/59381-navillera               Navillera   
1           https://mydramalist.com/7184-good-doctor             Good Doctor   
2             https://mydramalist.com/30917-dazzling  The Light in Your Eyes   
3                https://mydramalist.com/60409-uncle                   Uncle   
4    https://mydramalist.com/4518-panda-and-hedgehog      Panda and Hedgehog   
5              https://mydramalist.com/2822-air-city                Air City   
6           https://mydramalist.com/9162-pluto-squad             Pluto Squad   
7          https://mydramalist.com/951-3-leaf-clover           3 Leaf Clover   
8  https://mydramalist.com/17315-rickety-rackety-...  Rickety Rackety Family   

   rank  score  sim_score  
0    13    9.0       33.9  
1   176    8.3       30.7  
2   277    8.2       27.3  
3   329    8.1       24.4  
4  1165    7.2       22.7  
5  1233    7.1        8.5  
6  

In [None]:
full_df = full_df.sort_values("rank", ascending=True)


In [11]:
# for i in range(9):
#     print(dicti['title'][i])

# for i in range(len(dicti['title]))
# for dic in dicti:
#     print(dicti[dic][0])

# print(len(dicti['title']))

for dic in dicti and dic == 'link':
    print(dic)

3 Leaf Clover
2272
7.0
0.37321290369421295
9


In [29]:
thing = {'title': {0: '3 Leaf Clover', 1: 'Navillera', 2: 'Good Doctor', 3: 'Rickety Rackety Family', 4: 'The Light in Your Eyes', 5: 'Uncle', 6: 'Panda and Hedgehog', 7: 'Pluto Squad', 8: 'Air City'}, 'rank': {0: 2272, 1: 13, 2: 176, 3: 2579, 4: 277, 5: 329, 6: 1165, 7: 1596, 8: 1233}, 'score': {0: 7.0, 1: 9.0, 2: 8.3, 3: 4.8, 4: 8.2, 5: 8.1, 6: 7.2, 7: 7.6, 8: 7.1}, 'sim_score': {0: 37.3, 1: 33.9, 2: 30.7, 3: 27.4, 4: 27.3, 5: 24.4, 6: 22.7, 7: 8.7, 8: 8.5}}


{'title': {0: '3 Leaf Clover', 1: 'Navillera', 2: 'Good Doctor', 3: 'Rickety Rackety Family', 4: 'The Light in Your Eyes', 5: 'Uncle', 6: 'Panda and Hedgehog', 7: 'Pluto Squad', 8: 'Air City'}, 'rank': {0: 2272, 1: 13, 2: 176, 3: 2579, 4: 277, 5: 329, 6: 1165, 7: 1596, 8: 1233}, 'score': {0: 7.0, 1: 9.0, 2: 8.3, 3: 4.8, 4: 8.2, 5: 8.1, 6: 7.2, 7: 7.6, 8: 7.1}, 'sim_score': {0: 37.3, 1: 33.9, 2: 30.7, 3: 27.4, 4: 27.3, 5: 24.4, 6: 22.7, 7: 8.7, 8: 8.5}}


In [37]:
df = pd.read_csv(filename)
df = df[['title', 'description', 'keywords', 'genres', 'actors', 'director', 'screenwriter']]


In [73]:
def search_kdrama(kdrama_name):
    """searches for kdrama with matching name and returns top result"""
    # return get_indices()[get_indices().index.str.contains(kdrama_name, regex=False, na=False)][0]
    return get_indices()[get_indices().index.str.contains(kdrama_name.lower(), case=False, regex=False, na=False)][0]

print(search_kdrama("move to heaven"))
def get_info(title):
    df = pd.read_csv(filename)
    fill_na()
    kdrama = search_kdrama(title)
    # print(kdrama)
    row = df.loc[kdrama]
    # return row
    return row.to_dict()
info = get_info("Move to Heaven")
# print(get_info("Move to Heaven"))
print(info)

0
{'link': 'https://mydramalist.com/49231-move-to-heaven', 'rank': 1, 'title': 'Move to Heaven', 'country': 'South Korea', 'description': "Han Geu Roo is a 20-year-old with Autism. He works for his father’s business “Move To Heaven,” a company that specializes in crime scene cleanup, where they also collect and arrange items left by deceased people, and deliver them to the bereaved family.\n\nWhen Geu Roo's father dies, Geu Roo's guardianship passes to his uncle, ex-convict Cho Sang Gu, who is a martial arts fighter in underground matches. Per the father's will,", 'ep': 10, 'genres': "['Life', 'Drama', 'Family']", 'keywords': "['Uncle-Nephew Relationship', 'Autism', 'Death', 'Savant Syndrome', 'Mourning', 'Tearjerker', 'Life Lesson', 'Ex-convict', 'Cleaning and Organizing', 'Asperger’s syndrome']", 'aired': '2021-05-14', 'network': 'Netflix', 'duration': '52 min.', 'content_rating': '18+ Restricted (violence & profanity)', 'score': 9.2, 'num_scored_by': 23705, 'num_watcher': '47,577', 

In [60]:
def remove_chars(string):
    remove_list = ["[", "]", "'"]
    for remove in remove_list:
        string = string.replace(remove, "")
    return string

columns = ['keywords', 'genres', 'actors']

for column in columns:
    info[column] = remove_chars(info[column]) 

# print(info)
print(info['actors'])

Lee Je Hoon, Tang Jun Sang, Hong Seung Hee, Jung Suk Yong, Jung Young Joo, Lee Moon Shik


In [72]:
df = pd.read_csv(filename)
df = df.sort_values("rank", ascending=True)
df

Unnamed: 0,link,rank,title,country,description,ep,genres,keywords,aired,network,duration,content_rating,score,num_scored_by,num_watcher,actors,screenwriter,director
0,https://mydramalist.com/49231-move-to-heaven,1,Move to Heaven,South Korea,Han Geu Roo is a 20-year-old with Autism. He w...,10,"['Life', 'Drama', 'Family']","['Uncle-Nephew Relationship', 'Autism', 'Death...",2021-05-14,Netflix,52 min.,18+ Restricted (violence & profanity),9.2,23705,47577,"['Lee Je Hoon', 'Tang Jun Sang', 'Hong Seung H...",Yoon Ji Ryun,Kim Sung Ho
1,https://mydramalist.com/36269-doctor-playbook,2,Hospital Playlist,South Korea,The stories of people going through their days...,12,"['Friendship', 'Romance', 'Life', 'Medical']","['Multiple Mains', 'Best Friends', 'Slow Roman...",2020-03-12,Netflix,1 hr. 30 min.,15+ - Teens 15 or older,9.1,31201,68772,"['Jo Jung Suk', 'Yoo Yeon Seok', 'Jung Kyung H...",Lee Woo Jung,Shin Won Ho
2,https://mydramalist.com/54625-flower-of-evil,3,Flower of Evil,South Korea,Although Baek Hee Sung is hiding a dark secret...,16,"['Thriller', 'Romance', 'Crime', 'Melodrama']","['Married Couple', 'Deception', 'Family Secret...",2020-07-29,tvN,1 hr. 10 min.,15+ - Teens 15 or older,9.1,33764,74068,"['Lee Joon Gi', 'Moon Chae Won', 'Jang Hee Jin...",Yoo Jung Hee,"Kim Chul Gyu, Yoon Jong Ho"
3,https://mydramalist.com/57173-hospital-playlist-2,4,Hospital Playlist 2,South Korea,Everyday is extraordinary for five doctors and...,12,"['Friendship', 'Romance', 'Life', 'Medical']","['Workplace', 'Strong Friendship', 'Best Frien...",2021-06-17,Netflix,1 hr. 40 min.,15+ - Teens 15 or older,9.1,16342,34353,"['Jo Jung Suk', 'Yoo Yeon Seok', 'Jung Kyung H...",Lee Woo Jung,Shin Won Ho
4,https://mydramalist.com/25172-my-ajusshi,5,My Mister,South Korea,Park Dong Hoon is a middle-aged engineer who i...,16,"['Psychological', 'Life', 'Drama', 'Family']","['Nice Male Lead', 'Strong Female Lead', 'Smar...",2018-03-21,tvN,1 hr. 17 min.,15+ - Teens 15 or older,9.1,19660,51665,"['Lee Sun Kyun', 'IU', 'Park Ho San', 'Song Sa...",Park Hae Young,"Kim Won Suk, Kim Sang Woo"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3027,https://mydramalist.com/695713-it-is-not-required,3028,Never Give Up,South Korea,"Depicts the bromance of Goo Pil Soo, a breadwi...",16,"['Comedy', 'Life', 'Drama']","['Businessman Male Lead', 'Entrepreneur Male L...",2022-05-04,Olleh TV,,15+ - Teens 15 or older,7.5,2,864,"['Yoon Doo Joon', 'Kwak Do Won', 'Han Go Eun',...",Son Geun Joo,Choi Do Hoon
3028,https://mydramalist.com/701871-doctor-lawyer,3029,Doctor Lawyer,South Korea,Han Yi Han was an elite surgeon. He graduated...,16,"['Thriller', 'Law', 'Drama', 'Medical']","['Thoracic Surgeon Male Lead', 'Prosecutor Fem...",2022-06-03,MBC,1 hr. 3 min.,15+ - Teens 15 or older,7.9,156,4323,"['So Ji Sub', 'Shin Sung Rok', 'Im Soo Hyang',...",Jang Hong Chul,Lee Yong Seok
3029,https://mydramalist.com/724657-my-rocket-ship,3030,My Rocket Ship,South Korea,"Gong Ji Hoon, CEO of an AI art collection star...",10,"['Business', 'Life', 'Youth', 'Drama']","['Startup', 'Entrepreneur Male Lead', 'Short L...",2022-04-14,Naver TV Cast,16 min.,Not Yet Rated,8.0,2,279,"['Kim Ji Hoon', 'Moon Kang Hyuk', 'Oh Se Young...",,
3030,https://mydramalist.com/724805-golden-mask,3031,Gold Mask,South Korea,"A tragedy occurs for three women, caused by gr...",100,"['Drama', 'Family', 'Melodrama']","['Married Female Lead', 'Loan Shark Female Lea...",2022-05-23,KBS2,35 min.,15+ - Teens 15 or older,6.7,25,374,"['Cha Ye Ryun', 'Lee Hyun Jin', 'Na Young Hee'...",Kim Min Joo,Uh Soo Sun
