In [117]:
import pandas as pd
from pathlib import Path
import polars as pl
import joblib
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score
import numpy as np
from sklearn.metrics import confusion_matrix
from tqdm import tqdm
import gensim

assert gensim.models.word2vec.FAST_VERSION > -1
from gensim.models import Word2Vec
from datetime import datetime
import os

In [118]:
data_root = Path("../kkdata3/")
modelpath = Path("./Model")

In [119]:
train_source = pd.read_parquet(data_root / "label_train_source.parquet")
train_target = pd.read_parquet(data_root / "label_train_target.parquet")
test_source = pd.read_parquet(data_root / "label_test_source.parquet")
meta_song = pd.read_parquet(data_root / "meta_song.parquet")
meta_song_composer = pd.read_parquet(data_root / "meta_song_composer.parquet")
meta_song_genre = pd.read_parquet(data_root / "meta_song_genre.parquet")
meta_song_lyricist = pd.read_parquet(data_root / "meta_song_lyricist.parquet")
meta_song_producer = pd.read_parquet(data_root / "meta_song_producer.parquet")
meta_song_titletext = pd.read_parquet(data_root / "meta_song_titletext.parquet")

In [120]:
train_source.sort_values(["session_id", "listening_order"], inplace=True)
train_target.sort_values(["session_id", "listening_order"], inplace=True)
# test_source.sort_values(["session_id", "listening_order"], inplace=True)

In [121]:
# map song_id to song_index to save memory and speed up
meta_song["song_index"] = meta_song.index
train_source = train_source.merge(
    meta_song[["song_id", "song_index"]], on="song_id", how="left"
)
train_target = train_target.merge(
    meta_song[["song_id", "song_index"]], on="song_id", how="left"
)
test_source = test_source.merge(
    meta_song[["song_id", "song_index"]], on="song_id", how="left"
)

In [122]:
train_source["unix_played_at"] = pd.to_datetime(
    train_source["unix_played_at"], unit="s"
)
train_target["unix_played_at"] = pd.to_datetime(
    train_target["unix_played_at"], unit="s"
)
test_source["unix_played_at"] = pd.to_datetime(test_source["unix_played_at"], unit="s")

In [123]:
train_source.head()

Unnamed: 0,session_id,song_id,unix_played_at,play_status,login_type,listening_order,song_index
0,1,f6f06a71bb8bc38af6c0b7dae9cab00d,2022-08-09 02:35:05,0,7,1,354121
1,1,7b48a87effd31c9c07b68ed212062854,2022-08-09 02:38:50,0,7,2,1030664
2,1,61c46d6401aab1dde7c7de23dc55c037,2022-08-09 03:18:33,0,7,3,642780
3,1,7e54c9199aad70e35fe256d23701bad0,2022-08-09 03:21:29,0,7,4,280721
4,1,6178580fa01b62e9b52787902c0d8ae6,2022-08-09 03:30:41,0,7,5,90293


### Prepare training data


In [124]:
# concat train_source and test_source

source = pd.concat([train_source, test_source], axis=0, ignore_index=True)

# check if the concat is correct
source.shape[0] == (train_source.shape[0] + test_source.shape[0])

True

In [125]:
# prepare train data
# add artist & language column
def add_artist_language_playtime(df):
    _language = df.merge(
        meta_song[["song_index", "language_id"]], on="song_index", how="left"
    )
    _language_artist = _language.merge(
        meta_song[["song_id", "artist_id"]], on="song_id", how="left"
    )
    df_train_x = _language_artist

    # Calculate the play duration, set 20th song's play duration to NaN
    df_train_x["play_duration"] = df_train_x.groupby("session_id")[
        "unix_played_at"
    ].diff()
    df_train_x["play_duration"] = df_train_x["play_duration"].shift(-1)

    # check each session has 20 songs
    print(
        "check session has 20 songs:",
        (df_train_x["listening_order"] == 20).sum() == len(df_train_x) // 20,
    )
    df_train_x = df_train_x.drop(columns=["song_id", "login_type", "unix_played_at"])

    # step1 - data cleaning
    df_train_x["play_duration"] = df_train_x[
        "play_duration"
    ].dt.total_seconds()  # 將 play_duration 轉換為秒

    # handle nan
    df_train_x["artist_id"].fillna(-1, inplace=True)
    df_train_x["language_id"].fillna(-1, inplace=True)
    df_train_x["play_duration"].fillna(0, inplace=True)
    df_train_x

    return df_train_x


df_source = add_artist_language_playtime(source)
df_test = add_artist_language_playtime(test_source)

check session has 20 songs: True
check session has 20 songs: True


In [126]:
# pick top n songs by listening order


def pick_top_n_songs(dataframe, features=["song_index"], topN=20):
    df = dataframe.copy()

    # features = ["song_index"]  #["song_index", "artist_id", "play_duration"]
    new_df = pd.DataFrame()
    for i in range(1, topN + 1):  # 迴圈處理 20 首歌
        for feature in features:
            user_feature_name = f"{feature}_{i}"
            user_feature_values = df[df["listening_order"] == i][feature].values
            new_df[user_feature_name] = user_feature_values

    return new_df


features = ["song_index"]  # ["song_index", "artist_id", "play_duration"]

train = pick_top_n_songs(df_source, features=features, topN=20)
test = pick_top_n_songs(df_test, features=features, topN=20)

train.head()

Unnamed: 0,song_index_1,song_index_2,song_index_3,song_index_4,song_index_5,song_index_6,song_index_7,song_index_8,song_index_9,song_index_10,song_index_11,song_index_12,song_index_13,song_index_14,song_index_15,song_index_16,song_index_17,song_index_18,song_index_19,song_index_20
0,354121,1030664,642780,280721,90293,766515,769968,1007853,249587,615299,166527,973329,489807,789395,832194,962778,913263,910165,194387,722034
1,256188,144246,400761,625467,281059,880625,849960,40747,360309,123421,360309,123421,158192,534994,442771,659818,415453,307906,523383,672083
2,514974,300949,632246,353623,93125,165609,240446,380094,139121,316125,516472,389977,385637,673916,623997,332935,133500,606328,578482,261314
3,277607,219437,138361,622824,967426,700237,868800,490471,835071,628370,7085,238820,503008,238820,503008,503008,824980,267047,238820,709693
4,892347,509757,97064,66960,896666,74810,850960,422356,328649,328649,325229,672168,337713,14961,685694,7267,589816,482509,77515,246245


### Training

In [127]:
# build model
# Convert DataFrame to a 2D list
sentences = train.values.tolist()

model_w2v_sg = Word2Vec(
    sentences=sentences,
    epochs=50,  # epoch
    min_count=5,  # a movie has to appear more than 5 times to be keeped
    vector_size=250,  # size of the hidden layer
    workers=os.cpu_count(),
    sg=0,  # 0: CBOW, 1: skip-gram
    hs=0,
    negative=3,  # 2-5 for big dataset
    window=4,
)

In [128]:
model_w2v_sg.save(str(modelpath / "item2vec_window_5"))

# Inference


In [129]:
def songindex2songid(df):
    _df = df.copy()
    count = _df.shape[1]
    for i in range(1, 6):
        _df = _df.rename(columns={f"top{i}": "song_index"})
        _df = _df.merge(
            meta_song[["song_index", "song_id"]], on="song_index", how="left"
        )
        _df.rename(columns={"song_id": f"top{i}"}, inplace=True)
        _df.drop(columns=["song_index"], inplace=True)

    return _df

In [130]:
def recommend(model, sentences: list, topn: int = 5):
    try:
        sims = model.wv.most_similar(sentences, topn=topn)
        ids = [item[0] for item in sims]
    except:
        ids = [np.nan for i in range(topn)]
    return ids

In [131]:
# Load model
model = Word2Vec.load(str(modelpath / "item2vec_window_5"))
word_vectors = model.wv


# use last 5 songs to predict 1-5 songs
last5df = test[[f"song_index_{i}" for i in range(16, 21)]].copy()

# prepare test data
test_sentences = last5df.values.tolist()  # test.values.tolist()

In [132]:
# Inference
ids_list = []
for sentence in test_sentences:
    ids = recommend(model, sentence, topn=1)
    ids_list.append(ids)

In [133]:
# Transform 2d list to dataframe
columns = [f"top{i}" for i in range(1, 2)]

ids_result = pd.DataFrame(ids_list, columns=columns)
ids_result.head()
ids_result.to_csv("./results/lst5_tmp_t1_5.csv", index=False)

In [134]:
# ids_result = pd.read_csv("./results/tmp_t1_5.csv")

In [135]:
# get nan rates for each item
nan_rates = ids_result.isna().sum() / len(ids_result)
nan_rates

top1    0.151645
dtype: float64

In [136]:
# fill nan with empty cols
ids_result["top2"] = np.nan
ids_result["top3"] = np.nan
ids_result["top4"] = np.nan
ids_result["top5"] = np.nan

In [137]:
# song_index to song_id
ids_result = songindex2songid(ids_result)

# fill nan with best previous result
best = pd.read_csv("./results/best.csv")
res = ids_result.fillna(best)

In [138]:
res

Unnamed: 0,top1,top2,top3,top4,top5
0,aad06498fcd9699bee8bfb6bd9181943,aad06498fcd9699bee8bfb6bd9181943,36e0b2fd6046c0b9233ca5bb4a67b853,17676cf990988cbae50b816761c569d5,e8bbc37dee09b8e548f0e2137dd1c2a5
1,98724b33dd2903c11737c4bedff9b532,308f5f2b339b3ff74c9b60b49832d27f,1ac480c8add024f6febd936dac34bf82,1ac480c8add024f6febd936dac34bf82,1ac480c8add024f6febd936dac34bf82
2,f9bd80a0d031c2a1636178c0058b4377,d1492ba4a2fdf555510934e9740ea8c3,03ef6a308051b6a56e666f4bf549ba63,c8f890e2f65c70ecb903690f439ac36b,d3b103cacf16ac7c59f1b4d264def355
3,f1fea83942691e46c8bf0a9e76872e6d,4c19e9de0a90902ccd3b55374942fc5d,c625e7b519d15a6f055a3c8697fe90b6,c625e7b519d15a6f055a3c8697fe90b6,c625e7b519d15a6f055a3c8697fe90b6
4,b457a62ce7922f627f35f0f91ee13e21,23cb1d20ca90b3574d6f9537c9198525,1943cae8d1ded4ea75126e5da5b87916,c534b7ad0d186ee663f11d29642d692c,1ae353d3f12ec67cbf67ce184b017cd3
...,...,...,...,...,...
143059,b6ac1057e04709cdba6d4e73c4c56b50,d3b629c521230e2b730b80ac65a40b24,d3b629c521230e2b730b80ac65a40b24,d3b629c521230e2b730b80ac65a40b24,d3b629c521230e2b730b80ac65a40b24
143060,1f604ad500399bfeba17734d0881673c,1f604ad500399bfeba17734d0881673c,1f604ad500399bfeba17734d0881673c,1f604ad500399bfeba17734d0881673c,1f604ad500399bfeba17734d0881673c
143061,df8c6e6316a04ae2eb23725488d6034e,6694c67088568828cc3c10340dc9614e,4601c63be6a76e3f42cad84dd7bc77aa,4601c63be6a76e3f42cad84dd7bc77aa,4601c63be6a76e3f42cad84dd7bc77aa
143062,3a756f3040d7d3e6aadebc714bfcd29e,99cc48f94d8048f8124528e0df1f3129,8cd701995b27680b7e57f40bee09bdaa,0bbdb16d903891668947f716243b2935,b0ac0a1cccd72a43f51388919929f72f


In [139]:
# add session_id
res["session_id"] = test_source[test_source["listening_order"] == 1][
    "session_id"
].values

# change columns sequence
res = res[["session_id", "top1", "top2", "top3", "top4", "top5"]]
res.to_csv("./results/item2vec_result.csv", index=False)
res.head()

Unnamed: 0,session_id,top1,top2,top3,top4,top5
0,598,aad06498fcd9699bee8bfb6bd9181943,aad06498fcd9699bee8bfb6bd9181943,36e0b2fd6046c0b9233ca5bb4a67b853,17676cf990988cbae50b816761c569d5,e8bbc37dee09b8e548f0e2137dd1c2a5
1,1039,98724b33dd2903c11737c4bedff9b532,308f5f2b339b3ff74c9b60b49832d27f,1ac480c8add024f6febd936dac34bf82,1ac480c8add024f6febd936dac34bf82,1ac480c8add024f6febd936dac34bf82
2,1199,f9bd80a0d031c2a1636178c0058b4377,d1492ba4a2fdf555510934e9740ea8c3,03ef6a308051b6a56e666f4bf549ba63,c8f890e2f65c70ecb903690f439ac36b,d3b103cacf16ac7c59f1b4d264def355
3,1489,f1fea83942691e46c8bf0a9e76872e6d,4c19e9de0a90902ccd3b55374942fc5d,c625e7b519d15a6f055a3c8697fe90b6,c625e7b519d15a6f055a3c8697fe90b6,c625e7b519d15a6f055a3c8697fe90b6
4,1868,b457a62ce7922f627f35f0f91ee13e21,23cb1d20ca90b3574d6f9537c9198525,1943cae8d1ded4ea75126e5da5b87916,c534b7ad0d186ee663f11d29642d692c,1ae353d3f12ec67cbf67ce184b017cd3
