In [None]:
import pandas as pd
from pathlib import Path
import polars as pl

data_root = Path("../kkdata3/")


def _read_table(filename):
    """Load one parquet file located under ``data_root`` as a pandas DataFrame."""
    return pd.read_parquet(data_root / filename)


# Label tables: source/target for training, source only for test.
train_source = _read_table("label_train_source.parquet")
train_target = _read_table("label_train_target.parquet")
test_source = _read_table("label_test_source.parquet")

# Song metadata tables.
meta_song = _read_table("meta_song.parquet")
meta_song_composer = _read_table("meta_song_composer.parquet")
meta_song_genre = _read_table("meta_song_genre.parquet")
meta_song_lyricist = _read_table("meta_song_lyricist.parquet")
meta_song_producer = _read_table("meta_song_producer.parquet")
meta_song_titletext = _read_table("meta_song_titletext.parquet")

# Display the column dtypes of the training source table.
train_source.dtypes

In [None]:
# Order both training tables by session, then by position inside the session.
# test_source is left in its original order (its sort is commented out below).
_sort_keys = ["session_id", "listening_order"]
train_source = train_source.sort_values(by=_sort_keys)
train_target = train_target.sort_values(by=_sort_keys)
# test_source.sort_values(["session_id", "listening_order"], inplace=True)

In [None]:
# Map song_id to song_index to save memory and speed up later processing.
# song_index is taken from meta_song's own index; songs missing from
# meta_song get NaN because the joins below are left joins.
meta_song["song_index"] = meta_song.index
_song_lookup = meta_song[["song_id", "song_index"]]

train_source = train_source.merge(_song_lookup, how="left", on="song_id")
train_target = train_target.merge(_song_lookup, how="left", on="song_id")
test_source = test_source.merge(_song_lookup, how="left", on="song_id")

In [None]:
from datetime import datetime

# Convert the play timestamps (seconds since the Unix epoch) of every label
# table into pandas datetimes, overwriting the column in place.
for _frame in (train_source, train_target, test_source):
    _frame["unix_played_at"] = pd.to_datetime(_frame["unix_played_at"], unit="s")

## Train a simple ML model: use 20 songs to predict 1

Based on the random forest, language is not an important feature.

In [None]:
# prepare train data
# Add language & artist columns. Both lookups join on song_index (the key this
# notebook introduced for joins) instead of mixing song_index and song_id.
_language = train_source.merge(
    meta_song[["song_index", "language_id"]], on="song_index", how="left"
)
_language_artist = _language.merge(
    meta_song[["song_index", "artist_id"]], on="song_index", how="left"
)
df_train_x = _language_artist

# Play duration = start time of the next song in the same session minus the
# start time of this song. The shift is done per session, so the last song of
# each session gets NaT even if sessions are not stored contiguously.
# Rows inside a session must still be in listening order (sorted earlier).
_next_played_at = df_train_x.groupby("session_id")["unix_played_at"].shift(-1)
df_train_x["play_duration"] = _next_played_at - df_train_x["unix_played_at"]

# Sanity check: the number of rows with listening_order == 20 should equal
# the row count divided by 20 (i.e. every session has 20 songs).
print(
    "check session has 20 songs:",
    (df_train_x["listening_order"] == 20).sum() == len(df_train_x) // 20,
)

# Drop the columns that are not used as model inputs.
df_train_x = df_train_x.drop(columns=["song_id", "login_type", "unix_played_at"])
df_train_x.head(25)

In [None]:
# handle nan
# Assign the filled column back rather than calling fillna(inplace=True) on a
# column selection: that chained in-place form emits a FutureWarning in
# pandas 2.x and leaves the frame unchanged when Copy-on-Write is enabled.
df_train_x["artist_id"] = df_train_x["artist_id"].fillna(0)
df_train_x["language_id"] = df_train_x["language_id"].fillna(0)
df_train_x

In [None]:
# Transform the data format to be model training friendly:
# one row per session, one column per (feature, listening position).

df = df_train_x.copy()

# step1 - data cleaning
# Convert play_duration from timedelta to seconds; missing durations become 0.
# The result is assigned back instead of using a chained fillna(inplace=True),
# which does not update the frame under pandas Copy-on-Write.
df["play_duration"] = df["play_duration"].dt.total_seconds().fillna(0)

# step2 - feature engineering
# Per-song features that are spread into one column per listening position.
features = ["song_index", "artist_id", "play_duration"]

# Collect every wide column first and build the frame once. Columns come out
# as <feature>_<position> for positions 1..20, in the same order as before.
_wide_columns = {}
for i in range(1, 21):  # loop over the 20 songs of a session
    # Rows at listening position i; computed once and reused for each feature.
    _rows_at_i = df[df["listening_order"] == i]
    for feature in features:
        _wide_columns[f"{feature}_{i}"] = _rows_at_i[feature].values

new_df = pd.DataFrame(_wide_columns)

train_x = new_df
train_x.head(5)

In [None]:
# Keep only the target rows at listening position 21. reset_index() renumbers
# the rows and keeps the previous index as an extra "index" column.
_is_order_21 = train_target["listening_order"] == 21
mask_21 = train_target.loc[_is_order_21].reset_index()

# Label frame: a single-column DataFrame holding the song_index to predict.
train_y = mask_21.loc[:, ["song_index"]]
train_y

### Use the top-10 songs by play duration to predict 1 -> random forest.


In [None]:
import pandas as pd

# Assuming train_x is a DataFrame with song_index_<i>, artist_id_<i> and
# play_duration_<i> columns for i = 1..20.

topk = 5
n_positions = 20  # listening positions available per row of train_x
train_x_k_rows = []

# For every session row keep only the topk songs with the longest play
# duration, renumbered 1..topk in descending order of duration.
for _, row in train_x.iterrows():
    playdurations = [row[f"play_duration_{i}"] for i in range(1, n_positions + 1)]

    # 0-based positions of the topk longest plays; sorted() is stable, so ties
    # keep their original listening order.
    topk_indices = sorted(
        range(len(playdurations)), key=lambda i: playdurations[i], reverse=True
    )[:topk]

    # Build the reduced row. The loop variables are named so that they do not
    # shadow the row iteration variable of the outer loop.
    new_row = {}
    for rank, position in enumerate(topk_indices, start=1):
        source = position + 1  # column suffixes are 1-based
        new_row[f"song_index_{rank}"] = row[f"song_index_{source}"]
        new_row[f"play_duration_{rank}"] = row[f"play_duration_{source}"]
        new_row[f"artist_id_{rank}"] = row[f"artist_id_{source}"]

    train_x_k_rows.append(new_row)

# Create a DataFrame from the list of rows.
# NOTE(review): the name says 10 but topk is 5; the name is kept for compatibility.
train_x_10 = pd.DataFrame(train_x_k_rows)

# Display the resulting DataFrame
train_x_10

In [None]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

# Hold out 10% of the sessions for evaluation; the split is seeded.
# NOTE(review): this trains on the full 20-song `train_x`, not on the top-k
# frame `train_x_10` - confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    train_x, train_y, test_size=0.1, random_state=42
)
# Seed the forest as well so that repeated runs build the same trees.
model = ensemble.RandomForestClassifier(n_jobs=-1, random_state=42)

In [15]:
# y_train is a one-column DataFrame; pass the 1-D song_index column so that
# scikit-learn does not have to reshape a column vector (DataConversionWarning).
model.fit(X_train, y_train["song_index"])

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score

# Predict on the training split. These are training-set metrics (the model
# has already seen these rows), not an estimate of held-out performance.
predict_01 = model.predict(X_train)

# Weighted precision and recall over all classes. Both metrics take the true
# labels (y_train) as first argument, not the feature matrix.
precision = precision_score(
    y_train, predict_01, average="weighted"
)  # 'weighted' for multiclass problems
recall = recall_score(y_train, predict_01, average="weighted")


print("Precision:", precision)
print("Recall:", recall)

In [None]:
# from matplotlib import pyplot as plt
# import seaborn as sns

# df_plot = pd.DataFrame(
#     {"features": train_x.columns[:], "importances": model.feature_importances_}
# )
# df_plot = df_plot.sort_values("importances", ascending=False)
# plt.figure(figsize=[11, 20])
# sns.barplot(x=df_plot.importances, y=df_plot.features)
# plt.title("Importances of Features Plot")
# plt.show()

In [None]:
# from catboost import CatBoostClassifier
# from sklearn.metrics import confusion_matrix
# from sklearn.model_selection import train_test_split


# X_train, X_test, y_train, y_test = train_test_split(
#     train_x, train_y, test_size=0.95, random_state=42
# )
# model_01 = CatBoostClassifier(
#     iterations=1,
# )
# model_01.fit(X_train, y_train["song_index"])


# predict_01 = model_01.predict(X_test)
# # Generate the confusion matrix
# cm1 = confusion_matrix(y_test, predict_01)

# # Print the confusion matrix
# print("Confusion Matrix:")
# print(cm1)