In [None]:
import pandas as pd
from pathlib import Path
import polars as pl
import joblib
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score
import numpy as np
from sklearn.metrics import confusion_matrix
from tqdm import tqdm


data_root = Path("../kkdata3/")


def _read_table(file_name):
    """Read one parquet file located under ``data_root`` into a DataFrame."""
    return pd.read_parquet(data_root / file_name)


# Listening logs: 20 observed plays per session (source) and the plays to
# predict (target, only available for the training sessions).
train_source = _read_table("label_train_source.parquet")
train_target = _read_table("label_train_target.parquet")
test_source = _read_table("label_test_source.parquet")

# Song metadata tables.
meta_song = _read_table("meta_song.parquet")
meta_song_composer = _read_table("meta_song_composer.parquet")
meta_song_genre = _read_table("meta_song_genre.parquet")
meta_song_lyricist = _read_table("meta_song_lyricist.parquet")
meta_song_producer = _read_table("meta_song_producer.parquet")
meta_song_titletext = _read_table("meta_song_titletext.parquet")


train_source.dtypes

In [None]:
# Order every frame by session and by position inside the session.
# The wide-format pivots further down pick rows with
# `df[df["listening_order"] == i][...].values` and rely on the sessions appearing
# in the same order for every i, so the test frame must be sorted as well
# (sorting an already sorted frame is a no-op).
sort_keys = ["session_id", "listening_order"]
train_source = train_source.sort_values(sort_keys)
train_target = train_target.sort_values(sort_keys)
test_source = test_source.sort_values(sort_keys)

In [None]:
# Replace the string song_id by a compact integer song_index (the row label of
# meta_song) to save memory and speed up later joins.
meta_song["song_index"] = meta_song.index
song_lookup = meta_song[["song_id", "song_index"]]

train_source = train_source.merge(song_lookup, on="song_id", how="left")
train_target = train_target.merge(song_lookup, on="song_id", how="left")
test_source = test_source.merge(song_lookup, on="song_id", how="left")

In [None]:
from datetime import datetime

# Turn the epoch-second play timestamps into pandas datetimes, in place on
# each of the three listening-log frames.
for _frame in (train_source, train_target, test_source):
    _frame["unix_played_at"] = pd.to_datetime(_frame["unix_played_at"], unit="s")

## Train a simple ML model: 20 songs predict 1

Following the random forest experiment, language is not an important feature.

In [None]:
# prepare train data
# add artist & language column
def add_artist_language_playtime(df, song_meta=None):
    """Attach language, artist and play duration to a listening-log frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Listening log with at least the columns ``session_id``, ``song_id``,
        ``song_index``, ``listening_order``, ``login_type`` and a datetime
        ``unix_played_at``.
    song_meta : pandas.DataFrame, optional
        Song metadata with ``song_id``, ``song_index``, ``language_id`` and
        ``artist_id``. Defaults to the notebook-level ``meta_song``.

    Returns
    -------
    pandas.DataFrame
        One row per input row (same row order) without ``song_id``,
        ``login_type`` and ``unix_played_at``, and with the extra columns
        ``language_id``, ``artist_id`` (missing values become -1) and
        ``play_duration`` in seconds (0 for the last play of a session).
    """
    if song_meta is None:
        song_meta = meta_song

    enriched = df.merge(
        song_meta[["song_index", "language_id"]], on="song_index", how="left"
    ).merge(song_meta[["song_id", "artist_id"]], on="song_id", how="left")

    # Play duration = start of the next play in the same session minus the start
    # of this play. The shift is done per session on an explicitly ordered view,
    # so the result no longer depends on the caller having sorted the rows; the
    # assignment aligns on the index and keeps the input row order.
    ordered = enriched.sort_values(["session_id", "listening_order"])
    next_played_at = ordered.groupby("session_id")["unix_played_at"].shift(-1)
    enriched["play_duration"] = (
        next_played_at - ordered["unix_played_at"]
    ).dt.total_seconds()

    # check each session has 20 songs
    print(
        "check session has 20 songs:",
        (enriched["listening_order"] == 20).sum() == len(enriched) // 20,
    )

    cleaned = enriched.drop(columns=["song_id", "login_type", "unix_played_at"])

    # Fill missing values through a reassigned frame: `df[col].fillna(...,
    # inplace=True)` acts on a temporary object and does nothing under pandas
    # Copy-on-Write.
    cleaned = cleaned.fillna({"artist_id": -1, "language_id": -1, "play_duration": 0})

    return cleaned


# Per-play training features: language_id, artist_id and play_duration are
# added to the training source log; song_id, login_type and unix_played_at are
# dropped by the helper.
df_train_x = add_artist_language_playtime(train_source)
df_train_x

In [None]:
# Reshape the long per-play frame into one wide row per session:
# <feature>_<listening position> for positions 1..20.

df = df_train_x.copy()

# step2 - feature engineering: one column per (feature, listening position)
features = ["song_index", "artist_id", "play_duration"]

wide_columns = {}
for position in range(1, 21):  # loop over the 20 observed songs
    plays_at_position = df[df["listening_order"] == position]
    for feature in features:
        wide_columns[f"{feature}_{position}"] = plays_at_position[feature].values

new_df = pd.DataFrame(wide_columns)

train_x = new_df
train_x.head(5)

In [None]:
# Reshape the target plays into one wide row per session:
# song_index_21 .. song_index_25.

df = train_target.copy()

# step2 - feature engineering: one column per (feature, listening position)
features = ["song_index"]

wide_columns = {}
for position in range(21, 26):  # loop over the 5 target songs (orders 21-25)
    plays_at_position = df[df["listening_order"] == position]
    for feature in features:
        wide_columns[f"{feature}_{position}"] = plays_at_position[feature].values

new_df = pd.DataFrame(wide_columns)

train_y_5 = new_df
print(train_y_5.shape)
train_y_5.head(5)

In [None]:
# Single-target variant: only the song played at listening position 21.
is_order_21 = train_target["listening_order"] == 21
mask_21 = train_target.loc[is_order_21].reset_index()
train_y = mask_21[["song_index"]]
train_y

### Use the top-10 songs by play duration to predict 1 -> random forest.


In [None]:
import pandas as pd

# For every session keep the song_index of the `topk` longest-played songs,
# ordered from longest to shortest play duration.
# Vectorised replacement for the former row-by-row `iterrows` loop (which also
# shadowed its own loop variable `idx`); ordering and tie-breaking are the same.

topk = 20
n_songs = 20  # listening positions per session (columns *_1 .. *_20)

positions = range(1, n_songs + 1)
durations = train_x[[f"play_duration_{i}" for i in positions]].to_numpy(dtype=float)
song_indices = train_x[[f"song_index_{i}" for i in positions]].to_numpy()

# A stable argsort on the negated durations gives descending order while ties
# keep their original listening order, exactly like `sorted(..., reverse=True)`.
ranking = np.argsort(-durations, axis=1, kind="stable")[:, :topk]
top_songs = np.take_along_axis(song_indices, ranking, axis=1).astype(int)

train_x_topk = pd.DataFrame(
    top_songs,
    columns=[f"song_index_{rank}" for rank in range(1, top_songs.shape[1] + 1)],
)

# Display the resulting DataFrame
train_x_topk

In [None]:
# Hold out 10% of the sessions for evaluation; the targets are the five next
# songs (train_y_5), so the classifier is fitted as a multi-output model.
X_train, X_test, y_train, y_test = train_test_split(
    train_x_topk,
    train_y_5,
    test_size=0.1,
    random_state=42,
)
# Single-target alternative that was tried earlier:
# X_train, X_test, y_train, y_test = train_test_split(
#     train_x_10, train_y.values.ravel(), test_size=0.1, random_state=42
# )

# training (fit returns the fitted estimator itself)
model = KNeighborsClassifier(n_neighbors=3, n_jobs=-1).fit(X_train, y_train)

# save model
joblib.dump(model, "./knn.joblib")
print("finish")

In [None]:
# k = 3 uniform
#     top 5 songid => predict top1 => 0.050
#     top 10 songid => predict top1 => precision: 0.055
#     top 15 songid => predict top1 => precision:0.057
#     top 20 songid => predict top1 => precision:0.06


#     top 5 songid => predict top1 ~ top5 => acc: 0.23
#     top 10 songid => predict top1 ~ top5 => acc: 0.267, dup:0.056
#     top 15 songid => predict top1 ~ top5 => acc: 0.28, dup:0.059
#     top 20 songid => predict top1 ~ top5 => acc: 0.284, dup:0.0592

#     top5 songid+ artist => predict top1 => precision:0.04
#     top5 songid + artist + play_dur => predict top1 => precision:0.049

# k = 3 distance
#     top 5 songid => predict top1 => 0.050

# k = 5 uniform
#     top 5 songid => predict top1 => 0.048

# k = 7 uniform
#     top 5 songid => predict top1 => 0.045

In [None]:
# Evaluate the saved model on the hold-out split.

X = X_test
y = y_test

print("Predicting")
# NOTE: joblib.load unpickles the file; only load model files you created yourself.
model = joblib.load("./knn.joblib")

predicted_labels = model.predict(X)

# A single-target model returns a 1-D array, for which `.shape[1]` would raise
# IndexError, so check the number of dimensions first.
if predicted_labels.ndim == 1 or predicted_labels.shape[1] == 1:
    # compute precision and recall
    y_true = np.asarray(y).ravel()
    y_pred = predicted_labels.ravel()
    precision = precision_score(y_true, y_pred, average="weighted")
    recall = recall_score(y_true, y_pred, average="weighted")
    print("Precision:", precision)
    print("Recall:", recall)
else:
    # Number of position-wise exact matches over all target columns, divided by
    # the number of sessions (not by the number of predicted labels).
    true_label_matches = (np.asarray(y) == predicted_labels).sum()
    print("accuracy:", true_label_matches / len(y))

In [None]:
# Share of predicted labels that repeat a song already predicted for the same
# session (predicted_labels is expected to be 2-D: sessions x targets).
n_rows = predicted_labels.shape[0]
n_labels = predicted_labels.shape[1]
duplicated_count = sum(
    n_labels - len(np.unique(predicted_labels[row, :])) for row in range(n_rows)
)
print(duplicated_count / (n_rows * n_labels))

# Inference


In [15]:
# Per-play features for the test sessions, built with the same helper as the
# training features. NOTE: this rebinds `X`, which the evaluation cell also uses.
X = add_artist_language_playtime(test_source)
X

check session has 20 songs: True


Unnamed: 0,session_id,play_status,listening_order,song_index,language_id,artist_id,play_duration
0,598,1,1,553537,62.0,49372.0,220.0
1,598,1,2,323208,62.0,29463.0,79.0
2,598,1,3,700158,52.0,395.0,2.0
3,598,1,4,524823,62.0,10939.0,3.0
4,598,1,5,903722,-1.0,31927.0,1.0
...,...,...,...,...,...,...,...
2861275,714498,0,16,408927,52.0,41290862.0,84.0
2861276,714498,0,17,125430,52.0,41290862.0,104.0
2861277,714498,0,18,490246,52.0,41290862.0,98.0
2861278,714498,0,19,482103,52.0,41290862.0,92.0


In [18]:
# Reshape the long per-play test frame into one wide row per session:
# <feature>_<listening position> for positions 1..20.

df = X.copy()

# step2 - feature engineering
features = ["song_index", "artist_id", "play_duration"]

wide_columns = {}
for position in range(1, 21):  # loop over the 20 observed songs
    plays_at_position = df[df["listening_order"] == position]
    for feature in features:
        wide_columns[f"{feature}_{position}"] = plays_at_position[feature].values

new_df = pd.DataFrame(wide_columns)

infer_X = new_df
infer_X.head(5)

Unnamed: 0,song_index_1,artist_id_1,play_duration_1,song_index_2,artist_id_2,play_duration_2,song_index_3,artist_id_3,play_duration_3,song_index_4,...,song_index_18,artist_id_18,play_duration_18,song_index_19,artist_id_19,play_duration_19,song_index_20,artist_id_20,play_duration_20,session_id
0,553537,49372.0,220.0,323208,29463.0,79.0,700158,395.0,2.0,524823,...,309466,153.0,1.0,487545,3404.0,311.0,320485,3404.0,0.0,598
1,433605,39.0,365.0,245577,152.0,224.0,973295,1589.0,42173.0,472361,...,810606,4156.0,27.0,388680,4156.0,81.0,82103,4156.0,0.0,1039
2,816505,28686876.0,67.0,247342,6142842.0,187.0,865843,-1.0,127.0,535597,...,524289,-1.0,141.0,187475,-1.0,92.0,82260,-1.0,0.0,1199
3,307437,9944357.0,5.0,196724,-1.0,199.0,307437,9944357.0,5.0,687353,...,674238,830153.0,1.0,672342,31883961.0,4.0,163516,-1.0,0.0,1489
4,578489,31573048.0,173.0,828195,695607.0,215.0,143937,5047441.0,213.0,882896,...,935208,4261.0,2.0,798178,4261.0,3.0,736058,4261.0,0.0,1868


In [17]:
import pandas as pd

# For every test session keep the song_index of the `topk` longest-played
# songs, ordered from longest to shortest play duration.
# Vectorised replacement for the former row-by-row `iterrows` loop (which also
# shadowed its own loop variable `idx`); ordering and tie-breaking are the same.

topk = 20
n_songs = 20  # listening positions per session (columns *_1 .. *_20)

# NOTE(review): `train_x` and `train_x_topk` are rebound to the inference data
# here (kept because the prediction cell reads `train_x_topk`); re-run the
# training cells from the top before re-training.
train_x = infer_X

positions = range(1, n_songs + 1)
durations = train_x[[f"play_duration_{i}" for i in positions]].to_numpy(dtype=float)
song_indices = train_x[[f"song_index_{i}" for i in positions]].to_numpy()

# A stable argsort on the negated durations gives descending order while ties
# keep their original listening order, exactly like `sorted(..., reverse=True)`.
ranking = np.argsort(-durations, axis=1, kind="stable")[:, :topk]
top_songs = np.take_along_axis(song_indices, ranking, axis=1).astype(int)

train_x_topk = pd.DataFrame(
    top_songs,
    columns=[f"song_index_{rank}" for rank in range(1, top_songs.shape[1] + 1)],
)

# Display the resulting DataFrame
train_x_topk

Unnamed: 0,song_index_1,song_index_2,song_index_3,song_index_4,song_index_5,song_index_6,song_index_7,song_index_8,song_index_9,song_index_10,song_index_11,song_index_12,song_index_13,song_index_14,song_index_15,song_index_16,song_index_17,song_index_18,song_index_19,song_index_20
0,834888,487545,553537,323208,592510,524823,479355,700158,100440,294174,442405,903722,443388,368778,553787,283768,226534,309466,178080,320485
1,973295,433605,245577,909907,674158,229992,828106,472361,388680,279508,252014,516075,851171,464527,231212,207583,810606,914065,340530,82103
2,976710,38301,247342,524289,865843,535597,187475,816505,662248,349590,419795,38301,865843,662248,662248,38301,976710,524289,865843,82260
3,687353,196724,387759,271372,506738,305915,642783,307437,307437,672342,406693,271116,325257,567752,181886,541741,847611,493730,674238,163516
4,882896,111485,215121,179257,111963,828195,143937,578489,42708,592426,586867,516472,742718,262260,856473,798178,783944,748582,935208,736058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143059,334221,334221,334221,334221,813028,813028,813028,813028,1028535,334221,334221,1028535,1028535,1028535,1028535,1028535,1028535,1028535,1028535,813028
143060,754327,649889,140463,439239,450010,496212,467984,508806,743089,813028,480544,567595,543644,813028,610999,981160,947579,766896,825451,352697
143061,260235,997018,539300,831678,22613,926234,105976,943003,211614,578489,278263,578489,578489,426731,592650,753442,578489,16604,22613,831678
143062,781078,735314,708757,791875,720557,728673,964762,1024784,731401,775028,1027138,744294,764214,980686,1001317,790777,740929,920855,925306,862975


In [20]:
# Reload the persisted KNN model and predict for the test sessions.
# `train_x_topk` holds the inference features at this point (it was rebuilt
# from infer_X in the previous top-k cell).
print("Predicting")
model = joblib.load("./knn.joblib")

predicted_labels = model.predict(train_x_topk)
predicted_labels

Predicting


array([[ 36801, 634127, 267538, 498265,  18194],
       [141011, 204454, 241200,  77130, 171685],
       [213566, 502592, 323156, 312764, 251740],
       ...,
       [428286, 465163,  27312, 315348, 181726],
       [964198, 851907, 867350, 974620, 957467],
       [490245, 324624, 164320, 349665,  72888]])

In [72]:
# Wrap the prediction matrix in a DataFrame with one column per rank: top1..top5
column_names = ["top" + str(rank) for rank in range(1, 6)]
df_y = pd.DataFrame(data=predicted_labels, columns=column_names)
df_y

Unnamed: 0,top1,top2,top3,top4,top5
0,36801,634127,267538,498265,18194
1,141011,204454,241200,77130,171685
2,213566,502592,323156,312764,251740
3,121263,299082,4358,213194,96005
4,71821,163401,480932,709930,388334
...,...,...,...,...,...
143059,430573,843898,996074,458318,625268
143060,498480,371904,6957,142582,155619
143061,428286,465163,27312,315348,181726
143062,964198,851907,867350,974620,957467


In [75]:
def songindex2songid(df, song_meta=None):
    """Translate predicted song_index values back to song_id strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose prediction columns are named ``top1``, ``top2``, ... and
        hold song_index values.
    song_meta : pandas.DataFrame, optional
        Lookup table with the columns ``song_index`` and ``song_id``
        (``song_index`` must be unique). Defaults to the notebook-level
        ``meta_song``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every ``top<i>`` column contains the matching
        song_id (NaN when the index is unknown). Other columns come first,
        followed by the ``top<i>`` columns in rank order; ``df`` is not modified.
    """
    if song_meta is None:
        song_meta = meta_song

    _df = df.copy()
    count = _df.shape[1]

    # Every top<i> column that is present, instead of a hard-coded top1..top5.
    top_columns = [
        name
        for name in (f"top{i}" for i in range(1, count + 1))
        if name in _df.columns
    ]

    # One index -> id lookup replaces the five rename/merge/drop rounds.
    index_to_id = song_meta.set_index("song_index")["song_id"]
    for name in top_columns:
        _df[name] = _df[name].map(index_to_id)

    other_columns = [name for name in _df.columns if name not in top_columns]
    return _df[other_columns + top_columns]

In [76]:
res = songindex2songid(df_y)

# add session_id: one prediction row per session, taken in the order in which
# the sessions appear in the inference feature frame X
res["session_id"] = X.loc[X["listening_order"] == 1, "session_id"].values

# change columns sequence
res = res[["session_id", "top1", "top2", "top3", "top4", "top5"]]

# Create the output directory first so a fresh checkout can run this cell.
result_path = Path("./results/result.csv")
result_path.parent.mkdir(parents=True, exist_ok=True)
res.to_csv(result_path, index=False)
res.head()

Unnamed: 0,session_id,top1,top2,top3,top4,top5
0,598,3fc59c057904d5a23afed7044463b20f,9f4125fa9d29789e516e6a2bf4de3370,cad5adfc97c2d8a517b528676f5de932,e751cabad7cd23895cd7c5cb059721ee,f1a05ca007d31b50567ffa9b030ff0fb
1,1039,7d088c33aad7ddf0af4e1f072e1ba930,89b32d58a60cd372997d2bb78d6f276b,9ce45ff3d60ad99226fcb40f50ce4e0e,6e3027a300b2d3cc7b53d4c896dd79a1,8e954819b7e4d063382bb7aedce4fdba
2,1199,e5bee091225de44fe5410bd6453fe360,8e81f5bdbeefdf92d48fc5c3581d8cfc,748f588c3884ddda4191923d6e897c66,a0c49cdd4c00d052b9c6ba428b6516b3,713e5d22dbb40bf7b9c50ffa9f4c0e95
3,1489,887265d6d8f970c8807ca770eea5cc84,4b5c007a335effef94afa4a9ef75cd4e,051987c27004448e041f1defb07b3788,de0f87fc06cb1ec179dea13e19aba174,14c2d1ff9f2b2bce27f827e249199482
4,1868,48359cbc06190557fcee71b068c2211c,85543d27c0726f2c4abb9cb74ea1c499,a72d5f1bfea2ced3fc51ba95b704d085,cd057497d574fdd3dd40d92b3c3123b7,045f7f241583ec76b6df84f06520aa33


0         False
1         False
2         False
3         False
4         False
          ...  
143059    False
143060    False
143061    False
143062    False
143063    False
Length: 143064, dtype: bool