In [1]:
from pathlib import Path
import pandas as pd

data_root = Path("../kkdata3")
for x in data_root.glob("*"):
    print(x)

train_target = pd.read_parquet(data_root / "label_train_target.parquet")
test_source = pd.read_parquet(data_root / "label_test_source.parquet")
meta_song = pd.read_parquet(data_root / "meta_song.parquet")

../kkdata3/predict_result.parquet
../kkdata3/label_train_source.parquet
../kkdata3/label_train_target.parquet
../kkdata3/label_test_source.parquet
../kkdata3/meta_song_titletext.parquet
../kkdata3/sample.csv
../kkdata3/meta_song_lyricist.parquet
../kkdata3/meta_song_producer.parquet
../kkdata3/meta_song.parquet
../kkdata3/meta_song_genre.parquet
../kkdata3/meta_song_composer.parquet


In [2]:
train_source = pd.read_parquet(data_root / "label_train_source.parquet")
train_source.sort_values(["session_id", "listening_order"], inplace=True)
test_source.sort_values(["session_id", "listening_order"], inplace=True)

In [3]:
def getTrainData(df, n=2):
    df = df.copy()
    # gen n song id be the dataset
    for i in range(1, n + 1):
        df[f"next{i}_song_id"] = df["song_id"].shift(-i)

    # check if last song id is in the same session
    df[f"next{n}_session_id"] = df["session_id"].shift(-n)
    df = df.query(f"session_id == next{n}_session_id")

    # only get the song_id and next1_song_id, next2_song_id, next3_song_id... column
    df = df[["song_id"] + [f"next{i}_song_id" for i in range(1, n + 1)]]

    return df

In [4]:
# calculate ngram frequency.
# n=1 is the most frequent of song_id
# n=2 is the most frequent of [song_id and next1_song_id], get the result from next2_song_id column
def getFreq(df, n=2):
    df_train = getTrainData(df, n)
    # calculate ngram frequency
    df_freq = (
        df_train.groupby(["song_id"] + [f"next{i}_song_id" for i in range(1, n)])
        .value_counts(sort=True, normalize=True)
        .reset_index(name="freq")
    )

    # get the most frequent song_id, sort by freq
    df_freq = (
        df_freq.sort_values(
            ["song_id"] + [f"next{i}_song_id" for i in range(1, n)] + ["freq"],
            ascending=False,
        )
        .groupby(["song_id"] + [f"next{i}_song_id" for i in range(1, n)])
        .head(1)
    )
    return df_freq

In [5]:
# get testX dataset
def getTestX(df, n=2):
    trainX = pd.DataFrame()
    # if n == 3:, get listening_order : 18, 19, 20
    for i in range(n, 0, -1):
        _ = df.query(f"listening_order == {20-i+1}")[
            ["session_id", "song_id"]
        ].set_index("session_id")
        if trainX.empty:
            trainX = _
        else:
            trainX = trainX.join(_, lsuffix=f"_{n-i-1}")

    trainX = trainX.rename(columns={trainX.columns[-1]: f"song_id_{n-1}"})

    # trainX keep session_id
    trainX.reset_index(inplace=True)
    trainX.rename_axis("session_id", axis=1, inplace=True)
    return trainX


testX = getTestX(test_source, n=2)
testX.head(5)

session_id,session_id.1,song_id_0,song_id_1
0,8,7de0070eb49ecc7bf33bf0c20a4c17ce,d3fdb7035deadad395052b7a3bd44d9c
1,9,d1783056f15730a5ca9da967d9330a90,7653de936f26c9b71c728e88cdd29c1a
2,18,4cea5305fb9b1f4d2b3c2cc07eb7b8e7,72223dd61bc581d44e204b2c0df24c80
3,19,6c33e2032630dcafde3a4fd5ae196dc2,c274a9feefa025419288477eff4c8320
4,28,95bf93c6eab031c297effa69bc9324ed,20171b381e293bda502851778947ce57


In [6]:
# predict the next song id by ngram frequency table
def predict(df, freqTable, n=2):
    df = pd.merge(
        df,
        freqTable,
        how="left",
        left_on=[f"song_id_{i}" for i in range(n)],
        right_on=["song_id"] + [f"next{_}_song_id" for _ in range(1, n)],
    )
    for i in range(1, n):
        del df[f"next{i}_song_id"]
    del df["song_id"]
    df.rename(columns={f"next{n}_song_id": f"next_song_id"}, inplace=True)
    return df

In [7]:
# merge the predicted result and new way freq table
def merge2freq(df, freq_table):
    # offset=2 : session_id, freq, next_song_id
    df_offset = 3
    # freq_offset: next_song_id, freq
    freq_offset = 2

    # Delete the unneed column:freq
    if "freq" in df.columns:
        df = df.drop(columns=["freq"])
        df_offset -= 1
    if "freq" in freq_table.columns:
        freq_table = freq_table.drop(columns=["freq"])
        freq_offset -= 1

    fomr_count = df.shape[1] - df_offset  # 2gram : 5 -3 = 2
    latr_count = freq_table.shape[1] - freq_offset  # 1gram freq: 3 - 2 = 1

    left_on = [f"song_id_{i+(fomr_count-latr_count)}" for i in range(latr_count)]
    right_on = ["song_id"] + [f"next{i+1}_song_id" for i in range(latr_count - 1)]
    print("Left on:", left_on)
    print("Right on:", right_on)

    print("r:" + f"song_id_{i}" for i in range(latr_count))
    merged = pd.merge(
        df,
        freq_table,
        how="left",
        left_on=[f"song_id_{i+(fomr_count-latr_count)}" for i in range(latr_count)],
        right_on=["song_id"] + [f"next{i+1}_song_id" for i in range(latr_count - 1)],
    )

    freq_target_name = f"next{latr_count}_song_id"
    merged["next_song_id"].fillna(merged[freq_target_name], inplace=True)

    merged = merged.drop(columns=right_on + [freq_target_name])

    return merged

# experiments

In [8]:
trainX = pd.concat([train_source, test_source], axis=0, ignore_index=True)

In [9]:
n = 1
testX = getTestX(test_source, n)
freq1 = getFreq(trainX, n)
result1 = predict(testX, freq1, n)
result1.head()

Unnamed: 0,session_id,song_id_0,next_song_id,freq
0,8,d3fdb7035deadad395052b7a3bd44d9c,254700cdaf28bff608018648d4bcf4ed,0.040752
1,9,7653de936f26c9b71c728e88cdd29c1a,976fc825bfe7266c216fe34a85a3ac1b,0.053136
2,18,72223dd61bc581d44e204b2c0df24c80,72223dd61bc581d44e204b2c0df24c80,0.089706
3,19,c274a9feefa025419288477eff4c8320,391e076cbfe516de72c7ea9f6e9cb3b5,0.113636
4,28,20171b381e293bda502851778947ce57,20171b381e293bda502851778947ce57,0.045045


In [10]:
n = 2
testX = getTestX(test_source, n)
freq2 = getFreq(trainX, n)
result2 = predict(testX, freq2, n)

In [20]:
n = 2
testX = getTestX(test_source, n)
freq2 = getFreq(trainX, n)
result2 = predict(testX, freq2, n)
# result2.head()

# 2 1
result2_1 = merge2freq(result2, freq1)
result2_1.head()

Left on: ['song_id_1']
Right on: ['song_id']
<generator object merge2freq.<locals>.<genexpr> at 0x7f3a3ad1d2a0>


Unnamed: 0,session_id,song_id_0,song_id_1,next_song_id
0,8,7de0070eb49ecc7bf33bf0c20a4c17ce,d3fdb7035deadad395052b7a3bd44d9c,76723b980445c7c7b1350ca038348bbb
1,9,d1783056f15730a5ca9da967d9330a90,7653de936f26c9b71c728e88cdd29c1a,976fc825bfe7266c216fe34a85a3ac1b
2,18,4cea5305fb9b1f4d2b3c2cc07eb7b8e7,72223dd61bc581d44e204b2c0df24c80,9cb10772a82000f45de4a950883df945
3,19,6c33e2032630dcafde3a4fd5ae196dc2,c274a9feefa025419288477eff4c8320,d6813a67f316942de54f967fda79abb2
4,28,95bf93c6eab031c297effa69bc9324ed,20171b381e293bda502851778947ce57,20171b381e293bda502851778947ce57


In [12]:
n = 3
testX = getTestX(test_source, n)
freq3 = getFreq(trainX, n)
result3 = predict(testX, freq3, n)
# result3.head()


# 3 2
result3_2 = merge2freq(result3, freq2)
# [3,2] 1
result3_2_1 = merge2freq(result3_2, freq1)
# [3,2,1] 20
result3_2_1["next_song_id"].fillna(result3_2_1["song_id_2"], inplace=True)
result = result3_2_1
result.head()

Left on: ['song_id_1', 'song_id_2']
Right on: ['song_id', 'next1_song_id']
<generator object merge2freq.<locals>.<genexpr> at 0x7f3a3ad1cb30>
Left on: ['song_id_1']
Right on: ['song_id']
<generator object merge2freq.<locals>.<genexpr> at 0x7f3a3ad1cb30>


Unnamed: 0,session_id,song_id_0,song_id_1,song_id_2,next_song_id
0,8,cade012f15f78294211963d544c134af,7de0070eb49ecc7bf33bf0c20a4c17ce,d3fdb7035deadad395052b7a3bd44d9c,76723b980445c7c7b1350ca038348bbb
1,9,c8e195d50d1c408988b0eb230b643b94,d1783056f15730a5ca9da967d9330a90,7653de936f26c9b71c728e88cdd29c1a,d1783056f15730a5ca9da967d9330a90
2,18,8c28e58fc45a601217620786256639da,4cea5305fb9b1f4d2b3c2cc07eb7b8e7,72223dd61bc581d44e204b2c0df24c80,9cb10772a82000f45de4a950883df945
3,19,394eb63a48c8ca50e20b8c694b850697,6c33e2032630dcafde3a4fd5ae196dc2,c274a9feefa025419288477eff4c8320,d6813a67f316942de54f967fda79abb2
4,28,e895569422b4073e5099809115f41b2e,95bf93c6eab031c297effa69bc9324ed,20171b381e293bda502851778947ce57,95bf93c6eab031c297effa69bc9324ed


In [78]:
n = 5
testX = getTestX(test_source, n)
freq5 = getFreq(trainX, n)
result5 = predict(testX, freq5, n)

# 5 2
result5_2 = merge2freq(result5, freq2)

# [5,2] 20
result5_2["next_song_id"].fillna(result5_2["song_id_4"], inplace=True)
result = result5_2
result.head()

Left on: ['song_id_3', 'song_id_4']
Right on: ['song_id', 'next1_song_id']
<generator object merge2freq.<locals>.<genexpr> at 0x7f3982028820>


Unnamed: 0,session_id,song_id_0,song_id_1,song_id_2,song_id_3,song_id_4,next_song_id
0,8,8eded282bd2e1ee31318b9934371954f,5aba5f02d054be2c82483cd905eb79d2,cade012f15f78294211963d544c134af,7de0070eb49ecc7bf33bf0c20a4c17ce,d3fdb7035deadad395052b7a3bd44d9c,76723b980445c7c7b1350ca038348bbb
1,9,e78a27ecf0a1c4e2622a86e4c076899c,7f8e8658173de614cf0c3916c480ab8b,c8e195d50d1c408988b0eb230b643b94,d1783056f15730a5ca9da967d9330a90,7653de936f26c9b71c728e88cdd29c1a,7653de936f26c9b71c728e88cdd29c1a
2,18,43ff50171dcf94f2ad0e775ef1a6c707,10263606ed4efa0e3b7286d777bc9823,8c28e58fc45a601217620786256639da,4cea5305fb9b1f4d2b3c2cc07eb7b8e7,72223dd61bc581d44e204b2c0df24c80,9cb10772a82000f45de4a950883df945
3,19,6c33e2032630dcafde3a4fd5ae196dc2,82209b5d6bc192d7052ba3da4230aeed,394eb63a48c8ca50e20b8c694b850697,6c33e2032630dcafde3a4fd5ae196dc2,c274a9feefa025419288477eff4c8320,d6813a67f316942de54f967fda79abb2
4,28,2e8197b1f9339c4173846eb8ccd2e25b,b4d690a5e4ad23026128fdc5f0693f24,e895569422b4073e5099809115f41b2e,95bf93c6eab031c297effa69bc9324ed,20171b381e293bda502851778947ce57,20171b381e293bda502851778947ce57


In [14]:
# n = 6
# testX = getTestX(test_source, n)
# freq6 = getFreq(trainX, n)
# result = predict(testX, freq6, n)
# # result3.head()


# # 6 2
# result = merge2freq(result, freq2)
# # [6,2] 20
# result["next_song_id"].fillna(result["song_id_5"], inplace=True)
# result.head()

In [79]:
nan_count = result["next_song_id"].isna().sum()
print(nan_count / result.shape[0])

0.0


### Drop testX input

In [81]:
n = 5
drop_cols = [f"song_id_{i}" for i in range(n)]
result.drop(columns=drop_cols, inplace=True)

result.rename(columns={"next_song_id": "top1"}, inplace=True)

KeyError: "['song_id_0', 'song_id_1', 'song_id_2', 'song_id_3', 'song_id_4'] not found in axis"

### Duplicate Top1 to top2, top3, ... , top5

In [82]:
for i in range(2, 6):
    result[f"top{i}"] = result["top1"]
result.head()

Unnamed: 0,session_id,top1,top2,top3,top4,top5
0,8,76723b980445c7c7b1350ca038348bbb,76723b980445c7c7b1350ca038348bbb,76723b980445c7c7b1350ca038348bbb,76723b980445c7c7b1350ca038348bbb,76723b980445c7c7b1350ca038348bbb
1,9,7653de936f26c9b71c728e88cdd29c1a,7653de936f26c9b71c728e88cdd29c1a,7653de936f26c9b71c728e88cdd29c1a,7653de936f26c9b71c728e88cdd29c1a,7653de936f26c9b71c728e88cdd29c1a
2,18,9cb10772a82000f45de4a950883df945,9cb10772a82000f45de4a950883df945,9cb10772a82000f45de4a950883df945,9cb10772a82000f45de4a950883df945,9cb10772a82000f45de4a950883df945
3,19,d6813a67f316942de54f967fda79abb2,d6813a67f316942de54f967fda79abb2,d6813a67f316942de54f967fda79abb2,d6813a67f316942de54f967fda79abb2,d6813a67f316942de54f967fda79abb2
4,28,20171b381e293bda502851778947ce57,20171b381e293bda502851778947ce57,20171b381e293bda502851778947ce57,20171b381e293bda502851778947ce57,20171b381e293bda502851778947ce57


### Ramdom song id to top2~top5

0    13a0d7c29fb7ae959490de8aad06fc9b
1    5c23378f8f6afde160d9574ea032a2dc
2    c7a767dc58fd386101b0083c1f7bc436
3    7901ae39c95d050bf839814c473cdf74
4    0ec0be2fdcfbe6400ef7970d9dc86072
Name: song_id, dtype: object

### Ramdom pick 22, 23, 24, 25

In [90]:
res = result.copy()
for i in range(2, 6):
    _sample = meta_song.sample(result.shape[0], replace=True)["song_id"].reset_index(
        drop=True
    )
    res[f"top{i}"] = pd.Series(_sample)
res.head()

Unnamed: 0,session_id,top1,top2,top3,top4,top5
0,8,76723b980445c7c7b1350ca038348bbb,05dfa1b3292bb0c292064af485023f68,123c6c9b9254949992828a104a8356a0,65a0fd51010043eee34494dcf871b668,e66d7e189b8207ff8c34e387bc5f9097
1,9,7653de936f26c9b71c728e88cdd29c1a,39db873f6fb7bc03964713696dee3b62,5134620d5b0b0b857d088eb36ea101a5,0b25e94e4da09f038c7d909a3d6dba41,da4870c9985d3e793451155a02e390b7
2,18,9cb10772a82000f45de4a950883df945,67abab239b0b8ab7c8969fc03901ae89,026538db9d8b5b875557b4993141bd59,d520deb17aeb41694328bd73c6d981ce,cd0fe5b308b4145981bdbb642dc9de79
3,19,d6813a67f316942de54f967fda79abb2,c9f13b21529d72817a54995ca56c928c,e632bb7839490541e629b1b3ffa25bf7,f37039e9fefbef75d3e67dfb6076c2b3,2023aa2bb970bb384bb10d8c4668b4bb
4,28,20171b381e293bda502851778947ce57,b7c53fc1274769ef4c321ded67530233,9768b83ebd812f6032f2cf70b04fda07,1f194f3318893a6a22b145ffd78982b1,8748f172dcecab30b7ae204cb5e9f9ca


### Output to csv file

In [25]:
result.to_csv("results/result.csv", index=False)