In [210]:
import pandas as pd

# Load all metadata
drama_df = pd.read_json("data/dramas.jsonl", lines=True)
movie_df = pd.read_json("data/movies.jsonl", lines=True)
special_df = pd.read_json("data/specials.jsonl", lines=True)
tvshow_df = pd.read_json("data/tvshows.jsonl", lines=True)
reco_df = pd.read_json("data/recopairs.jsonl", lines=True)

# Combine all media metadata into one dataframe
media_df = pd.concat([drama_df, movie_df, special_df, tvshow_df], ignore_index=True)

In [211]:
def format_score(score_field):
    if isinstance(score_field, dict):
        value = score_field.get("value")
        votes = score_field.get("votes")
        if value and votes:
            return f"{value}/10 by {votes} users"
    return None


def join_names(field):
    if isinstance(field, list):
        return ", ".join(
            d["name"] for d in field if isinstance(d, dict) and "name" in d
        )
    return ""


def join_cast_dict(cast_field):
    result = {"main": [], "support": [], "guest": []}
    if not isinstance(cast_field, dict):
        return result
    for role in result:
        people = cast_field.get(role, [])
        if isinstance(people, list):
            result[role] = ", ".join(
                d["name"] for d in people if isinstance(d, dict) and "name" in d
            )
    return result

In [212]:
media_df["title"] = media_df["titles"].apply(lambda x: x.get("english", None))
media_df["score"] = media_df["score"].apply(format_score)
media_df["network"] = media_df["network"].apply(
    lambda x: x.get("name") if isinstance(x, dict) else None
)
media_df["genres"] = media_df["genres"].apply(join_names)
media_df["tags"] = media_df["tags"].apply(join_names)
media_df["directors"] = media_df["directors"].apply(join_names)
media_df["screenwriters"] = media_df["screenwriters"].apply(join_names)
media_df["cast"] = media_df["cast"].apply(join_cast_dict)

media_df.drop(["creator", "titles"], axis=1, inplace=True)

In [213]:
media_df.head(1)

Unnamed: 0,id,url,cover,synopsis,score,country,type,date,episodes,duration,network,genres,tags,rating,directors,screenwriters,cast,title
0,735043,https://mydramalist.com/735043-life,https://i.mydramalist.com/5v8b2y_4c.jpg?v=1,It is a story that resembles a tribute to our ...,9.4/10 by 36138 users,South Korea,Drama,2025-03-07,16.0,1 hr. 2 min.,Netflix,"Romance, Life, Drama","Healthy Mains’ Relationship, Family Relationsh...",13+ - Teens 13 or older,Kim Won Suk,Im Sang Choon,"{'main': 'IU, Park Bo Gum, Moon So Ri, Park Ha...",When Life Gives You Tangerines


In [214]:
def create_soup(row):
    parts = []

    def add(label, val):
        if val and str(val).strip().lower() != "nan":
            parts.append(f"{label}: {val}")

    add("Title", row.get("title"))
    add("Synopsis", row.get("synopsis"))
    add("Genres", row.get("genres"))
    add("Tags", row.get("tags"))
    add("Score", row.get("score"))
    add("Rating", row.get("rating"))
    add("Directors", row.get("directors"))
    add("Screenwriters", row.get("screenwriters"))
    add("Country", row.get("country"))
    add("Type", row.get("type"))
    add("Date", row.get("date"))
    add("Duration", row.get("duration"))
    add("Network", row.get("network"))
    add("Episodes", row.get("episodes"))

    cast = row.get("cast", {})
    if isinstance(cast, dict):
        for role in ["main", "host"]:
            if cast.get(role):
                add(f"{role.capitalize()} Cast", " ".join(cast[role]))

    return " ".join(parts)

In [215]:
media_df["soup"] = media_df.apply(create_soup, axis=1)

In [219]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
embeddings = model.encode(
    media_df["soup"].tolist(),
    show_progress_bar=True,
    convert_to_tensor=True,
)

embeddings

Batches:   0%|          | 0/603 [00:00<?, ?it/s]

tensor([[ 0.0806,  0.0054,  0.0013,  ...,  0.0383,  0.0193, -0.0324],
        [ 0.0238,  0.0137, -0.0026,  ..., -0.0102,  0.0089, -0.0317],
        [ 0.0447,  0.0714,  0.0140,  ...,  0.0071, -0.0037, -0.0057],
        ...,
        [-0.0267,  0.0511,  0.0340,  ...,  0.0086,  0.0420, -0.0157],
        [ 0.0489,  0.0268, -0.0048,  ...,  0.0348,  0.0081,  0.0025],
        [ 0.0083,  0.0620, -0.0112,  ..., -0.0066,  0.0085, -0.0725]])

In [220]:
def recommend(title: str, top_k=5):
    # Find the index of the media by title
    match = media_df[media_df["title"].str.lower() == title.lower()]
    if match.empty:
        print("Title not found.")
        return []

    idx = match.index[0]
    query = embeddings[idx]

    # Compute cosine similarity with all media
    cos_sim = util.cos_sim(query, embeddings)[0]

    # Get top_k most similar (excluding self)
    top_k_scores, top_k_indices = torch.topk(cos_sim, k=top_k + 1)

    results = []
    for score, i in zip(top_k_scores[1:], top_k_indices[1:]):  # skip self-match
        media = media_df.iloc[i.item()]
        results.append((media["title"], score.item()))

    return results

In [229]:
recommendations = recommend("love next door", top_k=5)
for title, score in recommendations:
    print(f"{title} → Similarity: {score:.4f}")

My First First Love → Similarity: 0.7921
More Than Friends → Similarity: 0.7876
The Boy Next Door → Similarity: 0.7863
EXO Next Door → Similarity: 0.7843
Flower Boy Next Door → Similarity: 0.7811


In [232]:
import joblib

joblib.dump(media_df, "media_df.pkl")
joblib.dump(embeddings, "embeddings.jl")


['embeddings.jl']

In [236]:
media_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19274 entries, 0 to 19273
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   id             19274 non-null  int64         
 1   url            19274 non-null  object        
 2   cover          19215 non-null  object        
 3   synopsis       19274 non-null  object        
 4   score          16269 non-null  object        
 5   country        19274 non-null  object        
 6   type           19274 non-null  object        
 7   date           18826 non-null  datetime64[ns]
 8   episodes       14307 non-null  float64       
 9   duration       19274 non-null  object        
 10  network        12057 non-null  object        
 11  genres         19274 non-null  object        
 12  tags           19274 non-null  object        
 13  rating         8909 non-null   object        
 14  directors      19274 non-null  object        
 15  screenwriters  1927