In [4]:
import pandas as pd

# Load all metadata
drama_df = pd.read_json("data/dramas.jsonl", lines=True)
movie_df = pd.read_json("data/movies.jsonl", lines=True)
special_df = pd.read_json("data/specials.jsonl", lines=True)
tvshow_df = pd.read_json("data/tvshows.jsonl", lines=True)
reco_df = pd.read_json("data/recopairs.jsonl", lines=True)

# Combine all media metadata into one dataframe
media_df = pd.concat([drama_df, movie_df, special_df, tvshow_df], ignore_index=True)

In [5]:
def format_score(score_field):
    if isinstance(score_field, dict):
        value = score_field.get("value")
        votes = score_field.get("votes")
        if value and votes:
            return f"{value}/10 by {votes} users"
    return None


def join_names(field):
    if isinstance(field, list):
        return ", ".join(
            d["name"] for d in field if isinstance(d, dict) and "name" in d
        )
    return ""


def join_cast_dict(cast_field):
    result = {"main": [], "support": [], "guest": []}
    if not isinstance(cast_field, dict):
        return result
    for role in result:
        people = cast_field.get(role, [])
        if isinstance(people, list):
            result[role] = ", ".join(
                d["name"] for d in people if isinstance(d, dict) and "name" in d
            )
    return result

In [6]:
media_df["title"] = media_df["titles"].apply(lambda x: x.get("english", None))
media_df["score"] = media_df["score"].apply(format_score)
media_df["network"] = media_df["network"].apply(
    lambda x: x.get("name") if isinstance(x, dict) else None
)
media_df["genres"] = media_df["genres"].apply(join_names)
media_df["tags"] = media_df["tags"].apply(join_names)
media_df["directors"] = media_df["directors"].apply(join_names)
media_df["screenwriters"] = media_df["screenwriters"].apply(join_names)
media_df["cast"] = media_df["cast"].apply(join_cast_dict)

media_df.drop(["creator", "titles"], axis=1, inplace=True)

In [7]:
media_df.head(1)

Unnamed: 0,id,url,cover,synopsis,score,country,type,date,episodes,duration,network,genres,tags,rating,directors,screenwriters,cast,title
0,735043,https://mydramalist.com/735043-life,https://i.mydramalist.com/5v8b2y_4c.jpg?v=1,It is a story that resembles a tribute to our ...,9.4/10 by 36138 users,South Korea,Drama,2025-03-07,16.0,1 hr. 2 min.,Netflix,"Romance, Life, Drama","Healthy Mains’ Relationship, Family Relationsh...",13+ - Teens 13 or older,Kim Won Suk,Im Sang Choon,"{'main': 'IU, Park Bo Gum, Moon So Ri, Park Ha...",When Life Gives You Tangerines


In [8]:
def create_soup(row):
    parts = []

    def add(label, val):
        if val and str(val).strip().lower() != "nan":
            parts.append(f"{label}: {val}")

    add("Title", row.get("title"))
    add("Synopsis", row.get("synopsis"))
    add("Genres", row.get("genres"))
    add("Tags", row.get("tags"))
    add("Score", row.get("score"))
    add("Rating", row.get("rating"))
    add("Directors", row.get("directors"))
    add("Screenwriters", row.get("screenwriters"))
    add("Country", row.get("country"))
    add("Type", row.get("type"))

    cast = row.get("cast", {})
    if isinstance(cast, dict):
        if cast.get("main"):
            add("Main Cast", " ".join(cast["main"]))
        if cast.get("host"):
            add("Host", " ".join(cast["host"]))
        if not cast.get("main") and not cast.get("host") and cast.get("regular"):
            add("Regular Members", " ".join(cast["regular"]))

    add("Date", row.get("date"))
    add("Duration", row.get("duration"))
    add("Network", row.get("network"))
    add("Episodes", row.get("episodes"))

    return " ".join(parts)

In [9]:
media_df["soup"] = media_df.apply(create_soup, axis=1)

In [10]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
embeddings = model.encode(
    media_df["soup"].tolist(),
    show_progress_bar=True,
    convert_to_tensor=True,
)

embeddings

Batches:   0%|          | 0/603 [00:00<?, ?it/s]

tensor([[ 0.0799,  0.0082,  0.0028,  ...,  0.0376,  0.0175, -0.0329],
        [ 0.0275,  0.0098, -0.0025,  ..., -0.0098,  0.0085, -0.0351],
        [ 0.0472,  0.0686,  0.0164,  ...,  0.0070, -0.0042, -0.0081],
        ...,
        [-0.0267,  0.0511,  0.0340,  ...,  0.0086,  0.0420, -0.0157],
        [ 0.0489,  0.0268, -0.0048,  ...,  0.0348,  0.0081,  0.0025],
        [ 0.0083,  0.0620, -0.0112,  ..., -0.0066,  0.0085, -0.0725]])

In [13]:
import torch

def recommend(title: str, top_k=5):
    # Find the index of the media by title
    match = media_df[media_df["title"].str.lower() == title.lower()]
    if match.empty:
        print("Title not found.")
        return []

    idx = match.index[0]
    query = embeddings[idx]

    # Compute cosine similarity with all media
    cos_sim = util.cos_sim(query, embeddings)[0]

    # Get top_k most similar (excluding self)
    top_k_scores, top_k_indices = torch.topk(cos_sim, k=top_k + 1)

    results = []
    for score, i in zip(top_k_scores[1:], top_k_indices[1:]):  # skip self-match
        media = media_df.iloc[i.item()]
        results.append((media["title"], score.item()))

    return results

In [14]:
recommendations = recommend("love next door", top_k=5)
for title, score in recommendations:
    print(f"{title} → Similarity: {score:.4f}")

Romance in the House → Similarity: 0.7830
Flower Boy Next Door → Similarity: 0.7812
Architecture 101 → Similarity: 0.7768
Unintentional Love Story Special → Similarity: 0.7732
The Boy Next Door → Similarity: 0.7685


In [15]:
import joblib

joblib.dump(media_df, "media_df.pkl")
joblib.dump(embeddings, "embeddings.pkl")

['embeddings.pkl']