In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import polars as pl
from sklearn import ensemble
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Directory holding every parquet file read by this notebook.
data_root = Path("../kkdata3/")


def _read_table(file_name):
    """Read the parquet file `file_name` located under `data_root`."""
    return pd.read_parquet(data_root / file_name)


# Listening sessions (train source/target and test source).
train_source = _read_table("label_train_source.parquet")
train_target = _read_table("label_train_target.parquet")
test_source = _read_table("label_test_source.parquet")

# Song metadata tables.
meta_song = _read_table("meta_song.parquet")
meta_song_composer = _read_table("meta_song_composer.parquet")
meta_song_genre = _read_table("meta_song_genre.parquet")
meta_song_lyricist = _read_table("meta_song_lyricist.parquet")
meta_song_producer = _read_table("meta_song_producer.parquet")
meta_song_titletext = _read_table("meta_song_titletext.parquet")

# Show the column dtypes of the training source table.
train_source.dtypes

session_id          int64
song_id            object
unix_played_at      int64
play_status         int64
login_type          int64
listening_order     int64
dtype: object

In [2]:
# Order the plays by session and then by position inside the session.
# Later cells rely on this row order.
session_order = ["session_id", "listening_order"]
train_source = train_source.sort_values(session_order)
train_target = train_target.sort_values(session_order)
# test_source is not sorted here (the call is left disabled):
# test_source.sort_values(["session_id", "listening_order"], inplace=True)

In [3]:
# Map song_id to song_index (the row index of meta_song) to save memory
# and speed up later operations.
meta_song["song_index"] = meta_song.index
song_lookup = meta_song[["song_id", "song_index"]]

# Attach song_index to each session table with the same left merge.
train_source, train_target, test_source = (
    frame.merge(song_lookup, on="song_id", how="left")
    for frame in (train_source, train_target, test_source)
)

In [4]:
from datetime import datetime

# Convert the epoch-second play timestamps to pandas datetimes, in place,
# for each of the three session tables.
for _frame in (train_source, train_target, test_source):
    _frame["unix_played_at"] = pd.to_datetime(_frame["unix_played_at"], unit="s")

## Train a simple ML model - 20 songs predict 1

Based on the random forest, language is not important.

In [5]:
# Prepare the training data: one row per play, with song metadata attached.
# language_id and artist_id both live in meta_song, so a single left merge
# keyed on song_index brings in both columns.
df_train_x = train_source.merge(
    meta_song[["song_index", "language_id", "artist_id"]],
    on="song_index",
    how="left",
)

# Play duration = start time of the next play in the same session minus the
# start time of this play. The shift happens inside the groupby, so the value
# never leaks across a session boundary and the last play of each session
# gets NaT. Rows must already be ordered by listening_order within a session.
next_played_at = df_train_x.groupby("session_id")["unix_played_at"].shift(-1)
df_train_x["play_duration"] = next_played_at - df_train_x["unix_played_at"]

# check each session has 20 songs
print(
    "check session has 20 songs:",
    (df_train_x["listening_order"] == 20).sum() == len(df_train_x) // 20,
)
df_train_x = df_train_x.drop(columns=["song_id", "login_type", "unix_played_at"])
df_train_x.head(25)

check session has 20 songs: True


Unnamed: 0,session_id,play_status,listening_order,song_index,language_id,artist_id,play_duration
0,1,0,1,354121,3.0,2959458.0,0 days 00:03:45
1,1,0,2,1030664,,,0 days 00:39:43
2,1,0,3,642780,3.0,12719146.0,0 days 00:02:56
3,1,0,4,280721,3.0,40080.0,0 days 00:09:12
4,1,0,5,90293,3.0,55507.0,0 days 00:00:01
5,1,0,6,766515,,13429875.0,0 days 00:00:04
6,1,0,7,769968,,6651.0,0 days 00:00:00
7,1,0,8,1007853,,23054840.0,0 days 00:00:02
8,1,0,9,249587,3.0,2424531.0,0 days 00:00:02
9,1,0,10,615299,3.0,15412497.0,0 days 00:00:02


In [6]:
# Handle NaN: songs without metadata get the sentinel id -1.
# Calling fillna(..., inplace=True) on a selected column is chained
# assignment, which pandas deprecates and which does not update the frame
# under copy-on-write. Fill both columns on the frame and rebind instead.
df_train_x = df_train_x.fillna({"artist_id": -1, "language_id": -1})
df_train_x

Unnamed: 0,session_id,play_status,listening_order,song_index,language_id,artist_id,play_duration
0,1,0,1,354121,3.0,2959458.0,0 days 00:03:45
1,1,0,2,1030664,-1.0,-1.0,0 days 00:39:43
2,1,0,3,642780,3.0,12719146.0,0 days 00:02:56
3,1,0,4,280721,3.0,40080.0,0 days 00:09:12
4,1,0,5,90293,3.0,55507.0,0 days 00:00:01
...,...,...,...,...,...,...,...
11445175,715323,0,16,587574,62.0,17250041.0,0 days 00:02:55
11445176,715323,0,17,152114,52.0,9812237.0,0 days 00:04:24
11445177,715323,0,18,422873,62.0,64006.0,0 days 00:03:20
11445178,715323,0,19,935208,-1.0,4261.0,0 days 00:00:53


In [7]:
# Transform the data format to be model training friendly:
# one row per session, one column per (feature, listening position).

df = df_train_x.copy()

# step1 - data cleaning
# Convert play_duration to seconds; a missing duration (the last play of a
# session) becomes 0. Assigning the result avoids a chained in-place fillna.
df["play_duration"] = df["play_duration"].dt.total_seconds().fillna(0)

# step2 - feature engineering
# Create the features for each song position.
features = ["song_index", "artist_id", "play_duration"]

# Collect all columns first and build the frame once. Rows are matched by
# position, so every listening_order value must occur equally often
# (pandas raises ValueError otherwise).
wide_columns = {}
for i in range(1, 21):  # loop over the 20 songs of a session
    rows_at_position = df.loc[df["listening_order"] == i, features]
    for feature in features:
        wide_columns[f"{feature}_{i}"] = rows_at_position[feature].to_numpy()

new_df = pd.DataFrame(wide_columns)

train_x = new_df
train_x.head(5)

Unnamed: 0,song_index_1,artist_id_1,play_duration_1,song_index_2,artist_id_2,play_duration_2,song_index_3,artist_id_3,play_duration_3,song_index_4,...,play_duration_17,song_index_18,artist_id_18,play_duration_18,song_index_19,artist_id_19,play_duration_19,song_index_20,artist_id_20,play_duration_20
0,354121,2959458.0,225.0,1030664,-1.0,2383.0,642780,12719146.0,176.0,280721,...,1.0,910165,501333.0,238.0,194387,28653.0,209.0,722034,10888.0,0.0
1,256188,-1.0,168.0,144246,-1.0,36.0,400761,-1.0,19.0,625467,...,4.0,307906,14811026.0,2.0,523383,206093.0,2.0,672083,-1.0,0.0
2,514974,-1.0,202.0,300949,-1.0,225.0,632246,20194835.0,201.0,353623,...,540.0,606328,26534767.0,544.0,578482,8445240.0,195.0,261314,-1.0,0.0
3,277607,859.0,0.0,219437,346.0,2.0,138361,11.0,0.0,622824,...,284.0,267047,5301.0,96.0,238820,6340.0,119.0,709693,9475.0,0.0
4,892347,30960.0,321.0,509757,8280.0,298.0,97064,208141.0,5209.0,66960,...,8649.0,482509,-1.0,1.0,77515,3370992.0,1.0,246245,9836586.0,0.0


In [8]:
# The label is the song_index of the train_target rows at listening position 21.
is_position_21 = train_target["listening_order"].eq(21)
mask_21 = train_target.loc[is_position_21].reset_index()
train_y = mask_21.loc[:, ["song_index"]]
train_y

Unnamed: 0,song_index
0,624433
1,52476
2,203739
3,453065
4,186586
...,...
572254,615766
572255,784715
572256,681203
572257,953126


### Use the top-k songs by play duration to predict the next song -> random forest


In [9]:
# Number of songs (ranked by play duration) kept per session.
topk = 5


def select_topk_by_play_duration(wide, k, n_positions=20):
    """Keep, per row, the k songs with the longest play duration.

    Parameters
    ----------
    wide : pd.DataFrame
        Frame with columns ``song_index_<p>``, ``artist_id_<p>`` and
        ``play_duration_<p>`` for p = 1..n_positions.
    k : int
        Number of songs to keep per row.
    n_positions : int, default 20
        Number of listening positions present in ``wide``.

    Returns
    -------
    pd.DataFrame
        Integer columns ``song_index_<r>``, ``artist_id_<r>`` for r = 1..k,
        where r = 1 is the song with the longest play duration.
    """
    positions = range(1, n_positions + 1)
    durations = wide[[f"play_duration_{p}" for p in positions]].to_numpy()
    songs = wide[[f"song_index_{p}" for p in positions]].to_numpy()
    artists = wide[[f"artist_id_{p}" for p in positions]].to_numpy()

    # A stable sort on the negated durations ranks them in descending order
    # and keeps the earlier position first on ties, the same order as
    # sorted(..., reverse=True).
    top_positions = np.argsort(-durations, axis=1, kind="stable")[:, :k]
    top_songs = np.take_along_axis(songs, top_positions, axis=1).astype(int)
    top_artists = np.take_along_axis(artists, top_positions, axis=1).astype(int)

    columns = {}
    for rank in range(top_positions.shape[1]):
        columns[f"song_index_{rank + 1}"] = top_songs[:, rank]
        columns[f"artist_id_{rank + 1}"] = top_artists[:, rank]
    return pd.DataFrame(columns)


# The name is historical: the frame holds `topk` songs per session, not 10.
train_x_10 = select_topk_by_play_duration(train_x, topk)

# Display the resulting DataFrame
train_x_10

Unnamed: 0,song_index_1,artist_id_1,song_index_2,artist_id_2,song_index_3,artist_id_3,song_index_4,artist_id_4,song_index_5,artist_id_5
0,1030664,-1,280721,40080,910165,501333,354121,2959458,194387,28653
1,256188,-1,534994,2424531,360309,3194,144246,-1,400761,-1
2,139121,3372334,332935,671259,673916,-1,385637,-1,606328,26534767
3,700237,346,824980,-1,868800,3536873,622824,262,238820,6340
4,328649,18727775,589816,208141,97064,208141,892347,30960,74810,-1
...,...,...,...,...,...,...,...,...,...,...
572254,536976,-1,514985,-1,256899,-1,8113,-1,292762,39618587
572255,737589,16685032,899618,16685032,741894,16685032,829501,16685032,899618,16685032
572256,647388,50340,832786,2032,569657,24731635,953923,439,791040,183836
572257,37200,28742779,825005,-1,766052,36339003,766052,36339003,1030620,-1


Experiment notes (n_estimators = 150): test_size = 0.98 and 0.9 (7%) ran OK; test_size = 0.4 took about 86 min.

In [10]:
# Hold out 40% of the sessions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_x_10, train_y.values.ravel(), test_size=0.4, random_state=42
)
model = ensemble.RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    min_samples_leaf=5,
    n_jobs=-1,
    random_state=42,  # seed the forest so re-running gives the same model
)

model.fit(X_train, y_train)
# Persist the fitted model so it can be loaded again without retraining.
joblib.dump(model, "./random_forest.joblib")
print("finish")

finish


In [10]:
# Evaluate the saved random forest on the held-out split.
# Re-create the same split as the training cell (same seed and test_size).
X_train, X_test, y_train, y_test = train_test_split(
    train_x_10, train_y.values.ravel(), test_size=0.4, random_state=42
)
# Predict in batches.
batch_size = 2000  # set the batch size as needed
num_samples = len(X_test)
X = X_test
y = y_test

predicted_labels = []
print("Predicting")
model = joblib.load("./random_forest.joblib")

# The original called the misspelled `tgdm`, which raised NameError.
for i in tqdm(range(0, num_samples, batch_size)):
    # Slicing past the end is safe, so the last (short) batch needs no
    # special case.
    batch_X = X.iloc[i : i + batch_size]
    batch_predictions = model.predict(batch_X)
    predicted_labels.extend(batch_predictions)

# Convert the list to a NumPy array.
predicted_labels = np.array(predicted_labels)


# Compute precision and recall, weighted by class support.
# zero_division=0 scores classes with no predicted or true samples as 0
# (the same value as the default) without emitting one warning per call.
precision = precision_score(y, predicted_labels, average="weighted", zero_division=0)
recall = recall_score(y, predicted_labels, average="weighted", zero_division=0)
print("Precision:", precision)
print("Recall:", recall)

Predicting


KeyboardInterrupt: 

In [None]:
# To get the top-n most probable songs instead of a single prediction:

# p = model.predict_proba(X)
# n = 3
# top_n = model.classes_[np.argsort(p, axis=1)[:, : -n - 1 : -1]]

In [None]:
# These two imports are not used by the fit below; they are retained in case
# other cells rely on the names.
import statsmodels.api as sm
from sklearn.datasets import make_classification

# Fit model
# statsmodels has no RandomForest estimator, so the original
# sm.discrete.discrete_model.RandomForest(...) call raised AttributeError.
# Use scikit-learn's classifier with the same settings as the training cell.
rf = ensemble.RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    min_samples_leaf=5,
    n_jobs=-1,
    random_state=42,
)
# fit() returns the estimator itself, so `model` and `rf` are the same object.
model = rf.fit(X_train, y_train)

In [None]:
# Predict a label for every row of X_test with whichever `model` is currently bound.
predictions = model.predict(X_test)

隨機子抽樣（Bootstrap Aggregating）:

從大型數據集中隨機抽取子集（有放回地）以創建較小的訓練集。
對每個子集獨立地訓練一個隨機森林。
通過平均（回歸問題）或投票（分類問題）來合併結果。

In [None]:
# from matplotlib import pyplot as plt
# import seaborn as sns

# df_plot = pd.DataFrame(
#     {"features": train_x.columns[:], "importances": model.feature_importances_}
# )
# df_plot = df_plot.sort_values("importances", ascending=False)
# plt.figure(figsize=[11, 20])
# sns.barplot(x=df_plot.importances, y=df_plot.features)
# plt.title("Importances of Features Plot")
# plt.show()

In [None]:
# from catboost import CatBoostClassifier
# from sklearn.metrics import confusion_matrix
# from sklearn.model_selection import train_test_split


# X_train, X_test, y_train, y_test = train_test_split(
#     train_x, train_y, test_size=0.95, random_state=42
# )
# model_01 = CatBoostClassifier(
#     iterations=1,
# )
# model_01.fit(X_train, y_train["song_index"])


# predict_01 = model_01.predict(X_test)
# # Generate the confusion matrix
# cm1 = confusion_matrix(y_test, predict_01)

# # Print the confusion matrix
# print("Confusion Matrix:")
# print(cm1)