In [2]:
import pandas as pd  # dataframe library
from sklearn.feature_extraction.text import \
    TfidfVectorizer  # vectorizes the data
from sklearn.metrics.pairwise import \
    cosine_similarity  # finds similarity between vectors

# filename = 'csv\kdrama_data.csv'
filename = r"C:\Users\John Kim\Desktop\kdrama_data.csv"

label_weights = {
        "keywords": 0.4,    
        "genres": 0.3,
        "actors": 0.2,
        "director": 0.05,
        "screenwriter": 0.05,
    }

df = pd.read_csv(filename)
df = df[['title', 'description', 'keywords', 'genres', 'actors', 'director', 'screenwriter']]



In [93]:
import pandas as pd  # dataframe library
from sklearn.feature_extraction.text import \
    TfidfVectorizer  # vectorizes the data
from sklearn.metrics.pairwise import \
    cosine_similarity  # finds similarity between vectors

# filename = 'csv\kdrama_data.csv'
filename = r"C:\Users\John Kim\Desktop\kdrama_data.csv"

label_weights = {
        "keywords": 0.4,    
        "genres": 0.3,
        "actors": 0.2,
        "director": 0.05,
        "screenwriter": 0.05,
    }

df = pd.read_csv(filename)
df = df[['title', 'description', 'keywords', 'genres', 'actors', 'director', 'screenwriter']]

def get_titles():
    np_titles = df['title'].to_numpy()
    title_list = np_titles.tolist()
    return title_list

def fill_na():
    """replaces na values with an empty string"""
    df.replace("N/A", "")
    for label in df.columns:
        df[label] = df[label].fillna('') # fills N/A values with ""

def get_indices():
    indices = pd.Series(df.index, index=df['title'])
    return indices[~indices.index.duplicated(keep='last')]

def og_cos_sim():
    """the similarity scores used to get the initial top x kdramas"""
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(df['keywords'] + " " + df['genres']
    + " " + df['actors'] + " " + df['director'] + " " + df['screenwriter'])
    return cosine_similarity(tfidf_matrix, tfidf_matrix)

def search_kdrama(kdrama_name):
    """searches for kdrama with matching name and returns top result"""
    # return get_indices()[get_indices().index.str.contains(kdrama_name, regex=False, na=False)][0]
    return get_indices()[get_indices().index.str.contains(kdrama_name.lower(), case=False, regex=False, na=False)][0]

def get_recommended_kdramas(target_kdrama_index, kdrama_similarities, kdramas_df, rec_num):
    """returns the top (rec_num) recommended kdramas based on keywords, genres, actors, director, director
    and screenwriter (we recalculate their similarity score by using our own 'weights' :omg:)"""
    if rec_num <= 1:
        # return no kdramas
        return False

    # should set a max on how many recommended kdrama you can get (maybe like 25 or 50?)
    similarity_scores = pd.DataFrame(kdrama_similarities[target_kdrama_index], columns=["score"])
    kdrama_indices = similarity_scores.sort_values("score", ascending=False)[1:rec_num].index # gets top 10 (we can change this)
    return kdramas_df['title'].iloc[kdrama_indices].values # converts to array

def get_score(og_name, rec_name, sim):
    """gets the similarity score of a kdrama based on ONE aspect (e.g. only keywords)"""
    rec_index = search_kdrama(rec_name)
    scores = sim[search_kdrama(og_name)]
    return scores[rec_index]

def vectorize_kdrama(col_name):
    """vectorizes the kdrama based on ONE aspect (e.g. only keywords)"""
    tfidf = TfidfVectorizer(stop_words='english')
    return tfidf.fit_transform(df[col_name])

def find_similarity(matrix):
    """finds the similarity between this matrix's kdrama and everything else"""
    return cosine_similarity(matrix, matrix)

def create_similarity_data(name):
    """creates a dictionary of arrays with the top 10 similar kdramas"""
    similarity_data = {
        "titles": [],
        "keywords": [],
        "genres": [],
        "actors": [],
        "director": [],
        "screenwriter": [],
    }

    target_index = search_kdrama(name)
    top_ten = get_recommended_kdramas(target_index, og_cos_sim(), df, 10).tolist()

    for label in label_weights.keys():
        vec = vectorize_kdrama(label)
        sim = find_similarity(vec)

        k_list = get_recommended_kdramas(target_index, sim, df, 10)


        for kdrama in k_list:
            if kdrama not in top_ten: top_ten.append(kdrama)

    print(top_ten)

    for kdrama in top_ten:
        # adds this title to dictionary
        similarity_data["titles"].append(kdrama)

        for label in label_weights.keys():
            vec = vectorize_kdrama(label)
            sim = find_similarity(vec)
            label_score = get_score(name, kdrama, sim) * label_weights[label]
            # print(label_score)
            similarity_data[label].append(label_score)
    
    return similarity_data

def get_top_rec_kdrama(name):
    """reorders the top recommended kdramas and converts to a dataframe"""
    fill_na()
    data = create_similarity_data(name)
    new_df = pd.DataFrame(data)
    new_df['sim_score'] = new_df.sum(axis=1, numeric_only=True)
    new_df = new_df.sort_values("sim_score", ascending=False)
    # print(new_df)
    # kdrama_list = new_df['sim_score'].values.tolist()
    # kdrama_list = new_df['titles'].values.tolist()
    return new_df

def get_names(name):
    """reorders the top recommended kdramas and converts to a dataframe"""
    fill_na()
    data = create_similarity_data(name)
    new_df = pd.DataFrame(data)
    new_df['sim_score'] = new_df.sum(axis=1, numeric_only=True)
    new_df = new_df.sort_values("sim_score", ascending=False)
    # print(new_df)
    kdrama_list = new_df['sim_score'].values.tolist()
    return new_df['titles'].values.tolist()
thing = get_top_rec_kdrama("Move to Heaven")
# search_kdrama("heaven")
# kdrama_list = get_names("Move to Heaven")
# sim_scores = get_top_rec_kdrama("Move to Heaven")

['Good Doctor', 'Rickety Rackety Family', 'Uncle', 'Pluto Squad', 'Panda and Hedgehog', '3 Leaf Clover', 'Navillera', 'Air City', 'The Light in Your Eyes', 'Miss Ripley', 'When I Was the Most Beautiful', 'Medical Brothers', 'Time', 'Loveholic', 'My Wife’s Having an Affair this Week', 'My Unfamiliar Family', 'Everybody Say Kungdari', "Heaven's Garden", 'Roses and Bean Sprout', "Let's Get Married", 'Our Blues', 'Country Princess', 'Hometown Over the Hill', 'Racket Boys', 'Here He Comes', 'King2Hearts', 'The Colors of Our Time', 'I Hear Your Voice', 'Hyena', 'Top Management', 'Save Me', 'Secrets and Lies', 'Kingdom', 'Money Game', 'Can You Hear My Heart', 'Thirty-Nine', 'Legend of Hyang Dan', 'The Secret Lovers', 'Angel Eyes', 'Boys Over Flowers', 'Operation Proposal', 'Start-Up', 'The End of the World', 'Gangnam Scandal', 'While You Were Sleeping', 'Pinocchio', 'Dream High']


In [91]:
print(get_recommended_kdramas(0, og_cos_sim(), df, 10).tolist())

['Good Doctor', 'Rickety Rackety Family', 'Uncle', 'Pluto Squad', 'Panda and Hedgehog', '3 Leaf Clover', 'Navillera', 'Air City', 'The Light in Your Eyes']


In [94]:
# for each vectorizer (column label), get the top 10 kdrama
# add the top 10 into an array, if array already contains kdrama, skip
# for each of those titles, get score, return top 10 sim score
print(type(thing))
print(len(thing))
thing

<class 'pandas.core.frame.DataFrame'>
47


Unnamed: 0,titles,keywords,genres,actors,director,screenwriter,sim_score
5,3 Leaf Clover,0.065633,0.286191,0.021389,0.0,0.0,0.373213
16,Everybody Say Kungdari,0.00939,0.3,0.032999,0.0076,0.014616,0.364605
15,My Unfamiliar Family,0.01844,0.3,0.023105,0.0,0.0,0.341545
6,Navillera,0.042971,0.23458,0.061668,0.0,0.0,0.339219
17,Heaven's Garden,0.013492,0.3,0.018921,0.0,0.0,0.332413
20,Our Blues,0.0,0.286191,0.029618,0.004945,0.0,0.320754
18,Roses and Bean Sprout,0.0,0.3,0.007674,0.0,0.0,0.307674
0,Good Doctor,0.121485,0.145018,0.034254,0.006304,0.0,0.307061
22,Hometown Over the Hill,0.0,0.286191,0.01028,0.0,0.006465,0.302935
21,Country Princess,0.0,0.286191,0.0,0.007708,0.0,0.293899


In [41]:
import pandas as pd  # dataframe library
from sklearn.feature_extraction.text import \
    TfidfVectorizer  # vectorizes the data
from sklearn.metrics.pairwise import \
    cosine_similarity  # finds similarity between vectors

# filename = 'csv\kdrama_data.csv'
filename = r"C:\Users\John Kim\Desktop\kdrama_data.csv"

label_weights = {
        "keywords": 0.4,    
        "genres": 0.3,
        "actors": 0.2,
        "director": 0.05,
        "screenwriter": 0.05,
    }

df = pd.read_csv(filename)
df = df[['title', 'description', 'keywords', 'genres', 'actors', 'director', 'screenwriter']]

def remove_chars(string):
    if isinstance(string, str) == False:
        return string
    remove_list = ["[", "]", "'"]
    for remove in remove_list:
        string = string.replace(remove, "")
    return string

def get_info(title):
    df = pd.read_csv(filename)
    fill_na()
    index = search_kdrama(title)
    
    row = df.loc[index]
    dicti = row.to_dict()
    # removes unnecessary characters
    columns = ['keywords', 'genres', 'actors']
    for column in columns:
        dicti[column] = remove_chars(dicti[column]) 
    return dicti

def get_titles():
    np_titles = df['title'].to_numpy()
    title_list = np_titles.tolist()
    return title_list

def kdrama_exists(title, klist):
    for kdrama in klist:
        if title.lower() == kdrama.lower(): return True
    return False
    
def fill_na():
    """replaces na values with an empty string"""
    df.replace("N/A", "")
    for label in df.columns:
        df[label] = df[label].fillna('') # fills N/A values with ""

def get_indices():
    indices = pd.Series(df.index, index=df['title'])
    return indices[~indices.index.duplicated(keep='last')]

def og_cos_sim():
    """the similarity scores used to get the initial top x kdramas"""
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(df['keywords'] + " " + df['genres']
    + " " + df['actors'] + " " + df['director'] + " " + df['screenwriter'])
    return cosine_similarity(tfidf_matrix, tfidf_matrix)

def search_kdrama(kdrama_name):
    """searches for kdrama with matching name and returns top result"""
    # return get_indices()[get_indices().index.str.contains(kdrama_name, regex=False, na=False)][0]
    return get_indices()[get_indices().index.str.contains(kdrama_name.lower(), case=False, regex=False, na=False)][0]



def get_recommended_kdramas(target_kdrama_index, kdrama_similarities, kdramas_df, rec_num):
    """returns the top (rec_num) recommended kdramas based on keywords, genres, actors, director, director
    and screenwriter (we recalculate their similarity score by using our own 'weights' :omg:)"""

    # should set a max on how many recommended kdrama you can get (maybe like 25 or 50?)
    similarity_scores = pd.DataFrame(kdrama_similarities[target_kdrama_index], columns=["score"])
    kdrama_indices = similarity_scores.sort_values("score", ascending=False)[1:rec_num+1].index # gets top 10 (we can change this)
    return kdramas_df['title'].iloc[kdrama_indices].values # converts to array

def get_score(og_name, rec_name, sim):
    """gets the similarity score of a kdrama based on ONE aspect (e.g. only keywords)"""
    rec_index = search_kdrama(rec_name)
    scores = sim[search_kdrama(og_name)]
    return scores[rec_index]

def vectorize_kdrama(col_name):
    """vectorizes the kdrama based on ONE aspect (e.g. only keywords)"""
    tfidf = TfidfVectorizer(stop_words='english')
    return tfidf.fit_transform(df[col_name])

def find_similarity(matrix):
    """finds the similarity between this matrix's kdrama and everything else"""
    return cosine_similarity(matrix, matrix)

def create_similarity_data(name, rec_num):
    """creates a dictionary of arrays with the top 10 similar kdramas"""
    similarity_data = {
        "titles": [],
        "keywords": [],
        "genres": [],
        "actors": [],
        "director": [],
        "screenwriter": [],
    }

    target_index = search_kdrama(name)
    top_ten = get_recommended_kdramas(target_index, og_cos_sim(), df, rec_num).tolist()

    # gets the top 10 for each category


    for label in label_weights.keys():
        if label == "actors": break
        vec = vectorize_kdrama(label)
        sim = find_similarity(vec)
        k_list = get_recommended_kdramas(target_index, sim, df, rec_num)

        for kdrama in k_list:
            if kdrama not in top_ten: top_ten.append(kdrama)


    # for every kdrama, calculate weighted score and add to dictionary
    for kdrama in top_ten:
        # adds this title to dictionary
        similarity_data["titles"].append(kdrama)

        for label in label_weights.keys():
            vec = vectorize_kdrama(label)
            sim = find_similarity(vec)
            label_score = get_score(name, kdrama, sim) * label_weights[label]
            # print(label_score)
            similarity_data[label].append(label_score)
    
    return similarity_data

def get_top_rec_kdrama(name, sort_label, rec_num):
    """reorders the top recommended kdramas and converts to a dataframe"""
    fill_na()

    # if rec_num.isdigit() == False: rec_num = 10
    rec_num = int(rec_num)
    if (rec_num < 5): rec_num = 5
    if (rec_num > 20): rec_num = 20



    data = create_similarity_data(name, rec_num)
    new_df = pd.DataFrame(data)
    new_df['sim_score'] = new_df.sum(axis=1, numeric_only=True)
    new_df = new_df.sort_values("sim_score", ascending=False)
    # print(new_df)
    kdrama_list = new_df['titles'].values.tolist()
    # kdrama_list = kdrama_list[:10]

    # singles out similarity scores, convert to percent and round to 1dp (e.g. 34.3%)
    sim_scores = new_df['sim_score'].reset_index(drop=True)
    sim_scores.loc[:,] *= 100
    sim_scores = sim_scores.round(decimals = 1)

    return new_df

    # sim_scores = sim_scores.iloc[:10]
    # print(kdrama_list)
    # print(sim_scores)

    # df = pd.read_csv(filename)
    # fill_na()
    # df = df[['link', 'title', 'rank', 'score']]

    # for kdrama in kdrama_list:
    #     if kdrama == kdrama_list[0]:
    #         full_df = df.loc[df['title'] == kdrama]
    #         continue
    #     row = df.loc[df['title'] == kdrama]
    #     full_df = pd.concat([full_df, row], ignore_index=True)

    # return full_df

    # merged_df = pd.concat([full_df, sim_scores], axis=1, ignore_index=True)
    # merged_df.columns = ['link', 'title', 'rank', 'score', 'sim score']

    # if sort_label == "rank":
    #     merged_df = merged_df.sort_values(sort_label, ascending=True)
    # else: merged_df = merged_df.sort_values(sort_label, ascending=False)
    # merged_df = merged_df.reset_index(drop=True)
    # merged_df = merged_df.iloc[:10]

    # dicti = merged_df.to_dict()
    # return dicti

# thing = get_top_rec_kdrama("Move to Heaven", "rank", 10)
# get_top_rec_kdrama("Move to Heaven")
# search_kdrama("heaven")


def get_top(name):
    """reorders the top recommended kdramas and converts to a dataframe"""
    fill_na()

    titles_recs = []

    data = create_similarity_data(name, 20)
    new_df = pd.DataFrame(data)
    new_df['sim_score'] = new_df.sum(axis=1, numeric_only=True)
    new_df = new_df.sort_values("sim_score", ascending=False)
    # print(new_df)
    kdrama_list = new_df['titles'].values.tolist()
    # kdrama_list = kdrama_list[:10]

    # singles out similarity scores, convert to percent and round to 1dp (e.g. 34.3%)
    sim_scores = new_df['sim_score'].reset_index(drop=True)
    sim_scores.loc[:,] *= 100
    sim_scores = sim_scores.round(decimals = 1)
    sim_list = sim_scores.tolist()

    if (sim_list[0] == 100):
        titles_recs.append(kdrama_list[1:21])
        titles_recs.append(sim_list[1:21])
    else:
        titles_recs.append(kdrama_list[:20])
        titles_recs.append(sim_list[:20])

    return titles_recs
    # new_df.drop("keywords", axis=1, inplace=True)
    # new_df.drop("genres", axis=1, inplace=True)
    # new_df.drop("actors", axis=1, inplace=True)
    # new_df.drop("director", axis=1, inplace=True)
    # new_df.drop("screenwriter", axis=1, inplace=True)
    
    # new_df = new_df.reset_index(drop=True)
    # new_df = new_df.iloc[:20]
    # dic = new_df.to_dict()
    # return dic

dic = get_top("Move to Heaven")


In [44]:
lists = dic
print(lists)

print(len(lists[1]))

[['3 Leaf Clover', 'My Healing Love', 'Everybody Say Kungdari', 'My Unfamiliar Family', 'Navillera', "Heaven's Garden", 'Dear My Friends', 'Can We Get Married?', 'Love (ft. Marriage and Divorce) 3', 'When My Love Blooms', 'Our Blues', 'Love (ft. Marriage and Divorce) 2', "It's Okay, Daddy's Girl", 'Roses and Bean Sprout', 'Good Doctor', 'Victory For Tomorrow', 'Hometown Over the Hill', 'Hometown Over the Hill 2', "I'll Show You the Taste", 'Valid Love'], [37.3, 37.0, 36.5, 34.2, 33.9, 33.2, 33.0, 33.0, 32.3, 32.3, 32.1, 31.8, 31.6, 30.8, 30.7, 30.3, 30.3, 30.3, 30.0, 29.9]]
20


In [50]:
import copy

import pandas as pd

ktitles = get_titles()

k_recs = {
    "title": "N/A",
    "recommendations": "N/A",
    "similarity": "N/A",
}

# list of dictionaries
kdata = []

# for each title, get top 20 recommended kdrama + similarity score
def store_data():
    # for every kdrama, get their recs + sim scores
    for title in ktitles:
        # 20 kdrama recs + their sim score
        if count == 2: break
        
        recs_score = get_top(title)

        row = copy.copy(k_recs)
        
        row['title'] = title
        row['recommendations'] = recs_score[0]
        row["similarity"] = recs_score[1]

        kdata.append(row)

def merge_data():
    store_data()
    # dataframe with all recs + sim score
    rec_df = pd.DataFrame(kdata)
    
    # dataframe with link, title, rank, score of all kdrama
    df = pd.read_csv(filename)
    fill_na()
    df = df[['link', 'title', 'rank', 'score']]

    # merged dataframe of the two
    merged_df = pd.merge(df, rec_df)
    merged_df.to_csv('recs.csv', encoding='utf-8', index=False)

merge_data()

# print(ktitles)

# convert array of dic to df
# get columns from this df and add to the kdrama df


[{'title': 'Move to Heaven', 'recommendations': ['3 Leaf Clover', 'My Healing Love', 'Everybody Say Kungdari', 'My Unfamiliar Family', 'Navillera', "Heaven's Garden", 'Dear My Friends', 'Can We Get Married?', 'Love (ft. Marriage and Divorce) 3', 'When My Love Blooms', 'Our Blues', 'Love (ft. Marriage and Divorce) 2', "It's Okay, Daddy's Girl", 'Roses and Bean Sprout', 'Good Doctor', 'Victory For Tomorrow', 'Hometown Over the Hill', 'Hometown Over the Hill 2', "I'll Show You the Taste", 'Valid Love'], 'similarity': [37.3, 37.0, 36.5, 34.2, 33.9, 33.2, 33.0, 33.0, 32.3, 32.3, 32.1, 31.8, 31.6, 30.8, 30.7, 30.3, 30.3, 30.3, 30.0, 29.9]}, {'title': 'Hospital Playlist', 'recommendations': ['Hospital Playlist', 'Hospital Playlist 2', 'A Poem a Day', 'Thirty-Nine', 'Flower Ever After', 'Soulmate', 'Be Melodramatic', 'Fight Hard, Love Harder: Season 2', 'Marry Me Now', 'Reply 1988', 'Eulachacha Waikiki 2', 'Lovestruck in the City', 'A-Teen', 'Anniversary Anyway', 'Number Six', 'Just Friend', '

In [51]:
rec_df = pd.DataFrame(kdata)
rec_df

Unnamed: 0,title,recommendations,similarity
0,Move to Heaven,"[3 Leaf Clover, My Healing Love, Everybody Say...","[37.3, 37.0, 36.5, 34.2, 33.9, 33.2, 33.0, 33...."
1,Hospital Playlist,"[Hospital Playlist, Hospital Playlist 2, A Poe...","[100.0, 77.9, 41.7, 38.4, 36.6, 36.2, 35.4, 35..."


In [54]:
two_df = df.head(2)
two_df

Unnamed: 0,title,description,keywords,genres,actors,director,screenwriter
0,Move to Heaven,Han Geu Roo is a 20-year-old with Autism. He w...,"['Uncle-Nephew Relationship', 'Autism', 'Death...","['Life', 'Drama', 'Family']","['Lee Je Hoon', 'Tang Jun Sang', 'Hong Seung H...",Kim Sung Ho,Yoon Ji Ryun
1,Hospital Playlist,The stories of people going through their days...,"['Multiple Mains', 'Best Friends', 'Slow Roman...","['Friendship', 'Romance', 'Life', 'Medical']","['Jo Jung Suk', 'Yoo Yeon Seok', 'Jung Kyung H...",Shin Won Ho,Lee Woo Jung


In [62]:
# two_df.add(rec_df.recommendations, axis='index')
# two_df

concat = pd.concat([two_df, rec_df], axis=1, join="outer")
concat



Unnamed: 0,title,description,keywords,genres,actors,director,screenwriter,title.1,recommendations,similarity
0,Move to Heaven,Han Geu Roo is a 20-year-old with Autism. He w...,"['Uncle-Nephew Relationship', 'Autism', 'Death...","['Life', 'Drama', 'Family']","['Lee Je Hoon', 'Tang Jun Sang', 'Hong Seung H...",Kim Sung Ho,Yoon Ji Ryun,Move to Heaven,"[3 Leaf Clover, My Healing Love, Everybody Say...","[37.3, 37.0, 36.5, 34.2, 33.9, 33.2, 33.0, 33...."
1,Hospital Playlist,The stories of people going through their days...,"['Multiple Mains', 'Best Friends', 'Slow Roman...","['Friendship', 'Romance', 'Life', 'Medical']","['Jo Jung Suk', 'Yoo Yeon Seok', 'Jung Kyung H...",Shin Won Ho,Lee Woo Jung,Hospital Playlist,"[Hospital Playlist, Hospital Playlist 2, A Poe...","[100.0, 77.9, 41.7, 38.4, 36.6, 36.2, 35.4, 35..."


In [74]:
merged = pd.merge(two_df, rec_df)
# merged.to_csv('recs.csv', encoding='utf-8', index=False)
merged

Unnamed: 0,title,description,keywords,genres,actors,director,screenwriter,recommendations,similarity
0,Move to Heaven,Han Geu Roo is a 20-year-old with Autism. He w...,"['Uncle-Nephew Relationship', 'Autism', 'Death...","['Life', 'Drama', 'Family']","['Lee Je Hoon', 'Tang Jun Sang', 'Hong Seung H...",Kim Sung Ho,Yoon Ji Ryun,"[3 Leaf Clover, My Healing Love, Everybody Say...","[37.3, 37.0, 36.5, 34.2, 33.9, 33.2, 33.0, 33...."
1,Hospital Playlist,The stories of people going through their days...,"['Multiple Mains', 'Best Friends', 'Slow Roman...","['Friendship', 'Romance', 'Life', 'Medical']","['Jo Jung Suk', 'Yoo Yeon Seok', 'Jung Kyung H...",Shin Won Ho,Lee Woo Jung,"[Hospital Playlist, Hospital Playlist 2, A Poe...","[100.0, 77.9, 41.7, 38.4, 36.6, 36.2, 35.4, 35..."


In [88]:
print(merged.to_dict())

{'title': {0: 'Move to Heaven', 1: 'Hospital Playlist'}, 'description': {0: "Han Geu Roo is a 20-year-old with Autism. He works for his father’s business “Move To Heaven,” a company that specializes in crime scene cleanup, where they also collect and arrange items left by deceased people, and deliver them to the bereaved family.\n\nWhen Geu Roo's father dies, Geu Roo's guardianship passes to his uncle, ex-convict Cho Sang Gu, who is a martial arts fighter in underground matches. Per the father's will,", 1: "The stories of people going through their days that are seemingly ordinary but actually special, at the hospital, a place known as the microcosm of life, where someone is being born and someone's life meets their ending. The five doctors are long-time friends of 20 years who started their undergrad in 1999 in the same medical school and now are colleagues in the same hospital and have a band together.\n(Source: Naver,"}, 'keywords': {0: "['Uncle-Nephew Relationship', 'Autism', 'Deat

In [99]:
# rec_file = 'csv/recs.csv'
# df = pd.read_csv(rec_file)
title = "Move to Heaven"
index = search_kdrama(title)
row = merged.loc[index]

recs = row['recommendations']
sims = row['similarity']
ranks = []
scores = []

for name in recs:
    # get rank, score, and sim score
    index = search_kdrama(name)
    row = df.loc[index]
    ranks.append(row['rank'])
    scores.append(row['score'])

dictionary = {
    "titles": recs,
    "ranks": ranks,
    "scores": scores,
    "sim scores": sims
}

a = pd.DataFrame(dictionary)
a = a['sim_score'].reset_index(drop=True)



Unnamed: 0,titles,ranks,scores,sim scores
0,3 Leaf Clover,2272,7.0,37.3
1,My Healing Love,1603,7.6,37.0
2,Everybody Say Kungdari,2431,6.7,36.5
3,My Unfamiliar Family,145,8.4,34.2
4,Navillera,13,9.0,33.9
5,Heaven's Garden,1536,7.7,33.2
6,Dear My Friends,46,8.7,33.0
7,Can We Get Married?,1208,7.1,33.0
8,Love (ft. Marriage and Divorce) 3,1242,7.0,32.3
9,When My Love Blooms,540,7.8,32.3


In [21]:
# thing.drop("keywords", axis=1, inplace=True)
# thing.drop("genres", axis=1, inplace=True)
# thing.drop("actors", axis=1, inplace=True)
# thing.drop("director", axis=1, inplace=True)
# thing.drop("screenwriter", axis=1, inplace=True)
tthing = thing.reset_index(drop=True)
tthing = tthing.iloc[:20]
dic = tthing.to_dict()
print(dic)

{'titles': {0: '3 Leaf Clover', 1: 'Everybody Say Kungdari', 2: 'My Unfamiliar Family', 3: 'Navillera', 4: "Heaven's Garden", 5: 'Dear My Friends', 6: 'Our Blues', 7: 'Roses and Bean Sprout', 8: 'Good Doctor', 9: 'Hometown Over the Hill', 10: "I'll Show You the Taste", 11: 'Country Princess', 12: 'Rickety Rackety Family', 13: 'The Light in Your Eyes', 14: 'My Wife’s Having an Affair this Week', 15: 'Uncle', 16: 'Panda and Hedgehog', 17: "Let's Get Married", 18: 'Loveholic', 19: 'Medical Brothers'}, 'sim_score': {0: 0.37321290369421295, 1: 0.3646049282377252, 2: 0.34154477255967364, 3: 0.3392189619700139, 4: 0.33241292698120933, 5: 0.3303240868521011, 6: 0.32075355250708115, 7: 0.3076741442728228, 8: 0.30706051455756306, 9: 0.302935146571159, 10: 0.3002608840134835, 11: 0.2938989311427218, 12: 0.2739501944309067, 13: 0.2731022983836453, 14: 0.244774756888837, 15: 0.24354549922711646, 16: 0.22666296598915278, 17: 0.17053218671204687, 18: 0.1528484213431556, 19: 0.14217641330631592}}


Unnamed: 0,titles,sim_score
0,3 Leaf Clover,0.373213
1,Everybody Say Kungdari,0.364605
2,My Unfamiliar Family,0.341545
3,Navillera,0.339219
4,Heaven's Garden,0.332413
5,Dear My Friends,0.330324
6,Our Blues,0.320754
7,Roses and Bean Sprout,0.307674
8,Good Doctor,0.307061
9,Hometown Over the Hill,0.302935


In [17]:
sim_scores = sim_scores.reset_index(drop=True)
sim_scores

0    373212.903694
1    339218.961970
2    307060.514558
3    273950.194431
4    273102.298384
5    243545.499227
6    226662.965989
7     87092.374018
8     85393.740714
Name: sim_score, dtype: float64

In [27]:
# sim_scores.loc[:,] *= 100
sim_scores = sim_scores.round(decimals = 1)
sim_scores


0    37.3
1    33.9
2    30.7
3    27.4
4    27.3
5    24.4
6    22.7
7     8.7
8     8.5
Name: sim_score, dtype: float64

In [86]:
df = pd.read_csv(filename)
df = df[['link', 'title', 'rank', 'score']]

# full_df = pd.DataFrame()
for kdrama in kdrama_list:
    if kdrama == kdrama_list[0]:
        full_df = df.loc[df['title'] == kdrama]
        continue
    row = df.loc[df['title'] == kdrama]
    full_df = pd.concat([full_df, row], ignore_index=True)


merged_df = pd.concat([full_df, sim_scores], axis=1, ignore_index=True)
merged_df.columns = ['link', 'title', 'rank', 'score', 'sim_score']
merged_df = merged_df.sort_values("rank", ascending=True)
merged_df = merged_df.reset_index(drop=True)

dicti = merged_df.to_dict()
# print(dicti)
print(merged_df)

NameError: name 'kdrama_list' is not defined

In [85]:
print(dicti)

NameError: name 'dicti' is not defined

In [None]:
full_df = full_df.sort_values("rank", ascending=True)


In [11]:
# for i in range(9):
#     print(dicti['title'][i])

# for i in range(len(dicti['title]))
# for dic in dicti:
#     print(dicti[dic][0])

# print(len(dicti['title']))

for dic in dicti and dic == 'link':
    print(dic)

3 Leaf Clover
2272
7.0
0.37321290369421295
9


In [29]:
thing = {'title': {0: '3 Leaf Clover', 1: 'Navillera', 2: 'Good Doctor', 3: 'Rickety Rackety Family', 4: 'The Light in Your Eyes', 5: 'Uncle', 6: 'Panda and Hedgehog', 7: 'Pluto Squad', 8: 'Air City'}, 'rank': {0: 2272, 1: 13, 2: 176, 3: 2579, 4: 277, 5: 329, 6: 1165, 7: 1596, 8: 1233}, 'score': {0: 7.0, 1: 9.0, 2: 8.3, 3: 4.8, 4: 8.2, 5: 8.1, 6: 7.2, 7: 7.6, 8: 7.1}, 'sim_score': {0: 37.3, 1: 33.9, 2: 30.7, 3: 27.4, 4: 27.3, 5: 24.4, 6: 22.7, 7: 8.7, 8: 8.5}}


{'title': {0: '3 Leaf Clover', 1: 'Navillera', 2: 'Good Doctor', 3: 'Rickety Rackety Family', 4: 'The Light in Your Eyes', 5: 'Uncle', 6: 'Panda and Hedgehog', 7: 'Pluto Squad', 8: 'Air City'}, 'rank': {0: 2272, 1: 13, 2: 176, 3: 2579, 4: 277, 5: 329, 6: 1165, 7: 1596, 8: 1233}, 'score': {0: 7.0, 1: 9.0, 2: 8.3, 3: 4.8, 4: 8.2, 5: 8.1, 6: 7.2, 7: 7.6, 8: 7.1}, 'sim_score': {0: 37.3, 1: 33.9, 2: 30.7, 3: 27.4, 4: 27.3, 5: 24.4, 6: 22.7, 7: 8.7, 8: 8.5}}


In [37]:
df = pd.read_csv(filename)
df = df[['title', 'description', 'keywords', 'genres', 'actors', 'director', 'screenwriter']]


In [73]:
def search_kdrama(kdrama_name):
    """searches for kdrama with matching name and returns top result"""
    # return get_indices()[get_indices().index.str.contains(kdrama_name, regex=False, na=False)][0]
    return get_indices()[get_indices().index.str.contains(kdrama_name.lower(), case=False, regex=False, na=False)][0]

print(search_kdrama("move to heaven"))
def get_info(title):
    df = pd.read_csv(filename)
    fill_na()
    kdrama = search_kdrama(title)
    # print(kdrama)
    row = df.loc[kdrama]
    # return row
    return row.to_dict()
info = get_info("Move to Heaven")
# print(get_info("Move to Heaven"))
print(info)

0
{'link': 'https://mydramalist.com/49231-move-to-heaven', 'rank': 1, 'title': 'Move to Heaven', 'country': 'South Korea', 'description': "Han Geu Roo is a 20-year-old with Autism. He works for his father’s business “Move To Heaven,” a company that specializes in crime scene cleanup, where they also collect and arrange items left by deceased people, and deliver them to the bereaved family.\n\nWhen Geu Roo's father dies, Geu Roo's guardianship passes to his uncle, ex-convict Cho Sang Gu, who is a martial arts fighter in underground matches. Per the father's will,", 'ep': 10, 'genres': "['Life', 'Drama', 'Family']", 'keywords': "['Uncle-Nephew Relationship', 'Autism', 'Death', 'Savant Syndrome', 'Mourning', 'Tearjerker', 'Life Lesson', 'Ex-convict', 'Cleaning and Organizing', 'Asperger’s syndrome']", 'aired': '2021-05-14', 'network': 'Netflix', 'duration': '52 min.', 'content_rating': '18+ Restricted (violence & profanity)', 'score': 9.2, 'num_scored_by': 23705, 'num_watcher': '47,577', 

In [60]:
def remove_chars(string):
    remove_list = ["[", "]", "'"]
    for remove in remove_list:
        string = string.replace(remove, "")
    return string

columns = ['keywords', 'genres', 'actors']

for column in columns:
    info[column] = remove_chars(info[column]) 

# print(info)
print(info['actors'])

Lee Je Hoon, Tang Jun Sang, Hong Seung Hee, Jung Suk Yong, Jung Young Joo, Lee Moon Shik


In [29]:
import copy

import pandas as pd

ktitles = get_titles()

k_recs = {
    "title": "N/A",
    "recommendations": "N/A",
    "similarity": "N/A",
}

# list of dictionaries
kdata = []

# for each title, get top 20 recommended kdrama + similarity score

def store_data():
    count = 0
    for title in ktitles:
        # 20 kdrama recs + their sim score
        if count == 2: break
        
        dicti = get_top(title)

        row = copy.copy(k_recs)
        
        row['title'] = title
        row['recommendations'] = dicti['titles']
        row["similarity"] = dicti['sim_score']

        kdata.append(row)
        count += 1

store_data()
print(kdata)

# print(ktitles)

# convert array of dic to df
# get columns from this df and add to the kdrama df


[{'title': 'Move to Heaven', 'recommendations': {0: '3 Leaf Clover', 1: 'My Healing Love', 2: 'Everybody Say Kungdari', 3: 'My Unfamiliar Family', 4: 'Navillera', 5: "Heaven's Garden", 6: 'Dear My Friends', 7: 'Can We Get Married?', 8: 'Love (ft. Marriage and Divorce) 3', 9: 'When My Love Blooms', 10: 'Our Blues', 11: 'Love (ft. Marriage and Divorce) 2', 12: "It's Okay, Daddy's Girl", 13: 'Roses and Bean Sprout', 14: 'Good Doctor', 15: 'Victory For Tomorrow', 16: 'Hometown Over the Hill', 17: 'Hometown Over the Hill 2', 18: "I'll Show You the Taste", 19: 'Valid Love'}, 'similarity': {0: 0.37321290369421295, 1: 0.3696190662246558, 2: 0.3646049282377252, 3: 0.34154477255967364, 4: 0.3392189619700139, 5: 0.33241292698120933, 6: 0.3303240868521011, 7: 0.3297013483373359, 8: 0.32326482070752327, 9: 0.3227888198803688, 10: 0.32075355250708115, 11: 0.31800902449001633, 12: 0.3164866238101573, 13: 0.3076741442728228, 14: 0.30706051455756306, 15: 0.3033012458848712, 16: 0.302935146571159, 17: 0

In [33]:
things = pd.DataFrame(kdata)
things

Unnamed: 0,title,recommendations,similarity
0,Move to Heaven,"{0: '3 Leaf Clover', 1: 'My Healing Love', 2: ...","{0: 0.37321290369421295, 1: 0.3696190662246558..."
1,Hospital Playlist,"{0: 'Hospital Playlist', 1: 'Hospital Playlist...","{0: 1.0000000000000002, 1: 0.778721849812767, ..."


In [None]:
def create_csv_file():
    """creates a csv file with all kdrama info
    """
    with open(filename, 'w', newline='') as file:
        fieldnames = ['title', 'recommendations', 'sim score']

        writer = csv.DictWriter(file, fieldnames=fieldnames)

        writer.writeheader()
        
        for row in kdata:
            writer.writerow(row)

# recs = csv


csv_input = pd.read_csv(filename)
# csv_input['recommendations'] = 
csv_input.to_csv('output.csv', index=False)


# def add_to_csv():   
#     with open(filename, 'w', newline='') as csv_input:
#         with open(recs, 'w', newline='') as csv_output:
#             writer = csv.writer(csv_output, lineterminator='\n')
#             reader = csv.reader(csv_input)

csv_input

In [73]:
import copy
import colorama

import pandas as pd

# from rec_sys import get_titles, get_top, fill_na

ktitles = get_titles()
filename = r"C:\Users\John Kim\Desktop\kdrama_data.csv"

k_recs = {
    "title": "N/A",
    "recommendations": "N/A",
    "similarity": "N/A",
}

# list of dictionaries
kdata = []

# for each title, get top 20 recommended kdrama + similarity score
def store_data():
    # for every kdrama, get their recs + sim scores

    progress = 0
    total = len(ktitles)

    for title in ktitles:
        progress_bar(progress, total)

        # 20 kdrama recs + their sim score        
        recs_score = get_top(title)

        row = copy.copy(k_recs)
        
        row['title'] = title
        row['recommendations'] = recs_score[0]
        row["similarity"] = recs_score[1]

        kdata.append(row)

        progress += 1

def merge_data():
    store_data()
    # dataframe with all recs + sim score
    rec_df = pd.DataFrame(kdata)
    
    # dataframe with link, title, rank, score of all kdrama
    df = pd.read_csv(filename)
    fill_na()
    df = df[['link', 'title', 'rank', 'score']]

    # merged dataframe of the two
    merged_df = pd.merge(df, rec_df)
    merged_df.to_csv('recs.csv', encoding='utf-8', index=False)

def progress_bar(progress, total, color=colorama.Fore.YELLOW):
    '''just a progress bar'''
    percent = 100 * (progress / float(total))
    bar = ' ' * int(percent) + "-" * (100 - int(percent))
    print(color + f"\r|{bar}| {percent:.2f}%", end="\r")
    if progress == total:
        print(colorama.Fore.GREEN + f"\r|{bar}| {percent:.2f}%", end="\r")

merge_data()

# print(ktitles)

# convert array of dic to df
# get columns from this df and add to the kdrama df





|----------------------------------------------------------------------------------------------------| 0.73%

KeyboardInterrupt: 