In [7]:
import pandas as pd
df1 = pd.read_csv('kdrama.csv')
df1.head()

Unnamed: 0,Name,Aired Date,Year of release,Original Network,Aired On,Number of Episodes,Duration,Content Rating,Rating,Synopsis,Genre,Tags,Director,Screenwriter,Cast,Production companies,Rank
0,Move to Heaven,"May 14, 2021",2021,Netflix,Friday,10,52 min.,18+ Restricted (violence & profanity),9.2,Geu Roo is a young autistic man. He works for ...,"Life, Drama, Family","Autism, Uncle-Nephew Relationship, Death, Sava...",Kim Sung Ho,Yoon Ji Ryun,"Lee Je Hoon, Tang Jun Sang, Hong Seung Hee, Ju...","Page One Film, Number Three Pictures",#1
1,Flower of Evil,"Jul 29, 2020 - Sep 23, 2020",2020,tvN,"Wednesday, Thursday",16,1 hr. 10 min.,15+ - Teens 15 or older,9.1,Although Baek Hee Sung is hiding a dark secret...,"Thriller, Romance, Crime, Melodrama","Married Couple, Deception, Suspense, Family Se...","Kim Chul Gyu, Yoon Jong Ho",Yoo Jung Hee,"Lee Joon Gi, Moon Chae Won, Jang Hee Jin, Seo ...",Monster Union,#2
2,Hospital Playlist,"Mar 12, 2020 - May 28, 2020",2020,"Netflix, tvN",Thursday,12,1 hr. 30 min.,15+ - Teens 15 or older,9.1,The stories of people going through their days...,"Friendship, Romance, Life, Medical","Strong Friendship, Multiple Mains, Best Friend...",Shin Won Ho,Lee Woo Jung,"Jo Jung Suk, Yoo Yeon Seok, Jung Kyung Ho, Kim...","Egg Is Coming, CJ ENM",#3
3,Hospital Playlist 2,"Jun 17, 2021 - Sep 16, 2021",2021,"Netflix, tvN",Thursday,12,1 hr. 40 min.,15+ - Teens 15 or older,9.1,Everyday is extraordinary for five doctors and...,"Friendship, Romance, Life, Medical","Workplace, Strong Friendship, Best Friends, Mu...",Shin Won Ho,Lee Woo Jung,"Jo Jung Suk, Yoo Yeon Seok, Jung Kyung Ho, Kim...","Egg Is Coming, CJ ENM",#4
4,My Mister,"Mar 21, 2018 - May 17, 2018",2018,tvN,"Wednesday, Thursday",16,1 hr. 17 min.,15+ - Teens 15 or older,9.1,Park Dong Hoon is a middle-aged engineer who i...,"Psychological, Life, Drama, Family","Age Gap, Nice Male Lead, Strong Female Lead, H...","Kim Won Suk, Kim Sang Woo",Park Hae Young,"Lee Sun Kyun, IU, Park Ho San, Song Sae Byuk, ...",Chorokbaem Media,#5


In [9]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
columns_to_drop = ['Aired Date', 'Aired On', 'Duration', 'Content Rating', 'Number of Episodes', 'Synopsis', 'Tags', 'Director', 'Screenwriter', 'Production companies']
df1 = df1.drop(columns=columns_to_drop)

In [11]:
df1.head()

Unnamed: 0,Name,Year of release,Original Network,Rating,Genre,Cast,Rank
0,Move to Heaven,2021,Netflix,9.2,"Life, Drama, Family","Lee Je Hoon, Tang Jun Sang, Hong Seung Hee, Ju...",#1
1,Flower of Evil,2020,tvN,9.1,"Thriller, Romance, Crime, Melodrama","Lee Joon Gi, Moon Chae Won, Jang Hee Jin, Seo ...",#2
2,Hospital Playlist,2020,"Netflix, tvN",9.1,"Friendship, Romance, Life, Medical","Jo Jung Suk, Yoo Yeon Seok, Jung Kyung Ho, Kim...",#3
3,Hospital Playlist 2,2021,"Netflix, tvN",9.1,"Friendship, Romance, Life, Medical","Jo Jung Suk, Yoo Yeon Seok, Jung Kyung Ho, Kim...",#4
4,My Mister,2018,tvN,9.1,"Psychological, Life, Drama, Family","Lee Sun Kyun, IU, Park Ho San, Song Sae Byuk, ...",#5


In [19]:
#df1['Genre'] = df1['Genre'].str.split(', ').apply(lambda genres: [genre.strip() for genre in genres])
#df1['Cast'] = df1['Cast'].apply(lambda x: ', '.join(x.split(', ')[:3]))
#df1.set_index('Name', inplace=True)
#df1['Original Network'] = df1['Original Network'].str.split(', ').apply(lambda networks: [network.strip() for network in networks])
#df1['Rank'] = df1['Rank'].str.extract('(\d+)').astype(int)

In [17]:
def jaccard_similarity(list1, list2):
    intersection = len(set(list1).intersection(set(list2)))
    union = len(set(list1).union(set(list2)))
    return intersection / union

def get_recommendations(kdrama_name, df, variables, num_recommendations=5):
    # Check if the K-Drama name is in the dataset
    if kdrama_name not in df.index:
        return "K-Drama not found."

    # Separate numeric variables from non-numeric variables
    numeric_vars = df[variables].select_dtypes(include=['float64', 'int64']).columns.tolist()
    non_numeric_vars = [var for var in variables if var not in numeric_vars]

    combined_similarity = pd.Series(0, index=df.index)

    # Calculate year similarity based on the absolute difference in years
    if 'Year of release' in variables:
        year_diff = df['Year of release'].apply(lambda x: abs(x - df.loc[kdrama_name, 'Year of release']))
        year_similarity = (year_diff.max() - year_diff) / year_diff.max()
        combined_similarity += year_similarity

    # Calculate Jaccard similarity for genres
    if 'Genre' in variables:
        genre_similarity = df['Genre'].apply(lambda x: jaccard_similarity(df.loc[kdrama_name, 'Genre'], x))
        combined_similarity += genre_similarity

    # Calculate Jaccard similarity for original networks
    if 'Original Network' in variables:
        network_similarity = df['Original Network'].apply(lambda x: jaccard_similarity(df.loc[kdrama_name, 'Original Network'], x))
        combined_similarity += network_similarity

    if numeric_vars:
        # Prepare the DataFrame for comparison using only numeric variables
        df_subset = df[numeric_vars]
        
        # Standardize the numeric variables
        scaler = StandardScaler()
        df_scaled = pd.DataFrame(scaler.fit_transform(df_subset), index=df_subset.index, columns=df_subset.columns)

        # Compute cosine similarity for numeric variables
        cosine_sim = cosine_similarity(df_scaled)
        cosine_sim_df = pd.DataFrame(cosine_sim, index=df_scaled.index, columns=df_scaled.index)
        combined_similarity += cosine_sim_df[kdrama_name]

    combined_similarity = combined_similarity.sort_values(ascending=False)

    # Get the top recommendations
    similar_kdramas = combined_similarity.index[combined_similarity.index != kdrama_name]
    recommendations = df.loc[similar_kdramas][:num_recommendations]
    output_variables = ['Rating', 'Rank'] + variables

    return recommendations[output_variables]

# Ask the user for variables to use for similarity
variables = input('What similarities do you want to see between your K-Dramas? (comma-separated): ').split(', ')
variables = [var.strip() for var in variables]

# Ask for the K-Drama name to use as the initial reference
kdrama_name = input('Enter K-Drama name: ')

# Get the recommendations
recommendations = get_recommendations(kdrama_name, df1, variables)

# Output the recommendations
print(f"Top {len(recommendations)} Recommendations:")
print(recommendations)

What similarities do you want to see between your K-Dramas? (comma-separated):  Genre, Rating, Cast
Enter K-Drama name:  My Mister


Top 5 Recommendations:
                 Rating  Rank  \
Name                            
Move to Heaven      9.2     1   
Navillera           9.0    16   
Dear My Friends     8.7    44   
SKY Castle          8.9    21   
Go Back Couple      8.6    83   

                                                             Genre  Rating  \
Name                                                                         
Move to Heaven                               [Life, Drama, Family]     9.2   
Navillera                        [Friendship, Life, Drama, Family]     9.0   
Dear My Friends                   [Life, Drama, Family, Melodrama]     8.7   
SKY Castle                 [Mystery, Psychological, Drama, Family]     8.9   
Go Back Couple   [Comedy, Romance, Life, School, Drama, Family,...     8.6   

                                                       Cast  
Name                                                         
Move to Heaven   Lee Je Hoon, Tang Jun Sang, Hong Seung Hee  
Navillera   