In [410]:
from pathlib import Path
import pandas as pd

data_root = Path("../kkdata3")
for x in data_root.glob("*"):
    print(x)

train_target = pd.read_parquet(data_root / "label_train_target.parquet")
test_source = pd.read_parquet(data_root / "label_test_source.parquet")
meta_song = pd.read_parquet(data_root / "meta_song.parquet")

../kkdata3/predict_result.parquet
../kkdata3/label_train_source.parquet
../kkdata3/label_train_target.parquet
../kkdata3/label_test_source.parquet
../kkdata3/meta_song_titletext.parquet
../kkdata3/sample.csv
../kkdata3/meta_song_lyricist.parquet
../kkdata3/meta_song_producer.parquet
../kkdata3/meta_song.parquet
../kkdata3/meta_song_genre.parquet
../kkdata3/meta_song_composer.parquet


In [411]:
train_source = pd.read_parquet(data_root / "label_train_source.parquet")
train_source.sort_values(["session_id", "listening_order"], inplace=True)

In [412]:
def getTrainData(df, n=2):
    df = df.copy()
    # gen n song id be the dataset
    for i in range(1, n + 1):
        df[f"next{i}_song_id"] = df["song_id"].shift(-i)

    # check if last song id is in the same session
    df[f"next{n}_session_id"] = df["session_id"].shift(-n)
    df = df.query(f"session_id == next{n}_session_id")

    # only get the song_id and next1_song_id, next2_song_id, next3_song_id... column
    df = df[["song_id"] + [f"next{i}_song_id" for i in range(1, n + 1)]]

    return df

In [413]:
# calculate ngram frequency.
# n=1 is the most frequent of song_id
# n=2 is the most frequent of [song_id and next1_song_id], get the result from next2_song_id column
def getFreq(df, n=2):
    df_train = getTrainData(df, n)
    # calculate ngram frequency
    df_freq = (
        df_train.groupby(["song_id"] + [f"next{i}_song_id" for i in range(1, n)])
        .value_counts(sort=True, normalize=True)
        .reset_index(name="freq")
    )

    # get the most frequent song_id, sort by freq
    df_freq = (
        df_freq.sort_values(
            ["song_id"] + [f"next{i}_song_id" for i in range(1, n)] + ["freq"],
            ascending=False,
        )
        .groupby(["song_id"] + [f"next{i}_song_id" for i in range(1, n)])
        .head(1)
    )
    return df_freq

In [414]:
# get testX dataset
def getTestX(df, n=2):
    trainX = pd.DataFrame()
    # if n == 3:, get listening_order : 18, 19, 20
    for i in range(n, 0, -1):
        _ = df.query(f"listening_order == {20-i+1}")[
            ["session_id", "song_id"]
        ].set_index("session_id")
        if trainX.empty:
            trainX = _
        else:
            trainX = trainX.join(_, lsuffix=f"_{n-i-1}")

    trainX = trainX.rename(columns={trainX.columns[-1]: f"song_id_{n-1}"})

    # trainX keep session_id
    trainX.reset_index(inplace=True)
    trainX.rename_axis("session_id", axis=1, inplace=True)
    return trainX


testX = getTestX(test_source, n=2)
testX.head(5)

session_id,session_id.1,song_id_0,song_id_1
0,598,8cd8008ddffd9b2d12b6b7bf17474e9e,e8bbc37dee09b8e548f0e2137dd1c2a5
1,1039,1ac480c8add024f6febd936dac34bf82,308f5f2b339b3ff74c9b60b49832d27f
2,1199,3aa676e51e8958a78394f6b6a07c2cea,d1492ba4a2fdf555510934e9740ea8c3
3,1489,e960bf0e27715aeae5620e6c60dead5f,4c19e9de0a90902ccd3b55374942fc5d
4,1868,337f6da1c3f672d989537305209c5347,4a12c85cadde3aaee76be944624a3516


In [415]:
# predict the next song id by ngram frequency table
def predict(df, freqTable, n=2):
    df = pd.merge(
        df,
        freqTable,
        how="left",
        left_on=[f"song_id_{i}" for i in range(n)],
        right_on=["song_id"] + [f"next{_}_song_id" for _ in range(1, n)],
    )
    for i in range(1, n):
        del df[f"next{i}_song_id"]
    del df["song_id"]
    df.rename(columns={f"next{n}_song_id": f"next_song_id"}, inplace=True)
    return df

In [416]:
# merge the predicted result and new way freq table
def merge2freq(df, freq_table):
    # offset=2 : session_id, freq, next_song_id
    df_offset = 3
    # freq_offset: next_song_id, freq
    freq_offset = 2

    # Delete the unneed column:freq
    if "freq" in df.columns:
        df = df.drop(columns=["freq"])
        df_offset -= 1
    if "freq" in freq_table.columns:
        freq_table = freq_table.drop(columns=["freq"])
        freq_offset -= 1

    fomr_count = df.shape[1] - df_offset  # 2gram : 5 -3 = 2
    latr_count = freq_table.shape[1] - freq_offset  # 1gram freq: 3 - 2 = 1

    left_on = [f"song_id_{i+(fomr_count-latr_count)}" for i in range(latr_count)]
    right_on = ["song_id"] + [f"next{i+1}_song_id" for i in range(latr_count - 1)]
    print("Left on:", left_on)
    print("Right on:", right_on)

    print("r:" + f"song_id_{i}" for i in range(latr_count))
    merged = pd.merge(
        df,
        freq_table,
        how="left",
        left_on=[f"song_id_{i+(fomr_count-latr_count)}" for i in range(latr_count)],
        right_on=["song_id"] + [f"next{i+1}_song_id" for i in range(latr_count - 1)],
    )

    freq_target_name = f"next{latr_count}_song_id"
    merged["next_song_id"].fillna(merged[freq_target_name], inplace=True)

    merged = merged.drop(columns=right_on + [freq_target_name])

    return merged

# experiments

In [417]:
trainX = pd.concat([train_source, test_source], axis=0, ignore_index=True)

In [418]:
n = 1
testX = getTestX(test_source, n)
freq1 = getFreq(trainX, n)
result1 = predict(testX, freq1, n)
result1.head()

Unnamed: 0,session_id,song_id_0,next_song_id,freq
0,598,e8bbc37dee09b8e548f0e2137dd1c2a5,e8bbc37dee09b8e548f0e2137dd1c2a5,0.04993
1,1039,308f5f2b339b3ff74c9b60b49832d27f,ef587582918ea93602e08c29ffeb47c0,0.357143
2,1199,d1492ba4a2fdf555510934e9740ea8c3,d1492ba4a2fdf555510934e9740ea8c3,0.097738
3,1489,4c19e9de0a90902ccd3b55374942fc5d,6cfb851242760855d0c85c9ca089c8c5,0.5
4,1868,4a12c85cadde3aaee76be944624a3516,337f6da1c3f672d989537305209c5347,0.069952


In [449]:
n = 2
testX = getTestX(test_source, n)
freq2 = getFreq(trainX, n)
result2 = predict(testX, freq2, n)

In [450]:
n = 2
testX = getTestX(test_source, n)
freq2 = getFreq(trainX, n)
result2 = predict(testX, freq2, n)
# result2.head()

# 2 1
result2_1 = merge2freq(result2, freq1)
result2_1.head()

Left on: ['song_id_1']
Right on: ['song_id']
<generator object merge2freq.<locals>.<genexpr> at 0x7f3a5504aff0>


Unnamed: 0,session_id,song_id_0,song_id_1,next_song_id
0,598,8cd8008ddffd9b2d12b6b7bf17474e9e,e8bbc37dee09b8e548f0e2137dd1c2a5,aad06498fcd9699bee8bfb6bd9181943
1,1039,1ac480c8add024f6febd936dac34bf82,308f5f2b339b3ff74c9b60b49832d27f,ef587582918ea93602e08c29ffeb47c0
2,1199,3aa676e51e8958a78394f6b6a07c2cea,d1492ba4a2fdf555510934e9740ea8c3,d1492ba4a2fdf555510934e9740ea8c3
3,1489,e960bf0e27715aeae5620e6c60dead5f,4c19e9de0a90902ccd3b55374942fc5d,6cfb851242760855d0c85c9ca089c8c5
4,1868,337f6da1c3f672d989537305209c5347,4a12c85cadde3aaee76be944624a3516,269ef1b3c7933d8de1d8dcb834630a4c


In [451]:
n = 3
testX = getTestX(test_source, n)
freq3 = getFreq(trainX, n)
result3 = predict(testX, freq3, n)
# result3.head()


# 3 2
result3_2 = merge2freq(result3, freq2)
# [3,2] 1
result3_2_1 = merge2freq(result3_2, freq1)
# [3,2,1] 20
result3_2_1["next_song_id"].fillna(result3_2_1["song_id_2"], inplace=True)
result = result3_2_1
result.head()

Left on: ['song_id_1', 'song_id_2']
Right on: ['song_id', 'next1_song_id']
<generator object merge2freq.<locals>.<genexpr> at 0x7f3a55049bd0>
Left on: ['song_id_1']
Right on: ['song_id']
<generator object merge2freq.<locals>.<genexpr> at 0x7f3a55049bd0>


Unnamed: 0,session_id,song_id_0,song_id_1,song_id_2,next_song_id
0,598,ea91150ebda0a80c473732c71f6a524f,8cd8008ddffd9b2d12b6b7bf17474e9e,e8bbc37dee09b8e548f0e2137dd1c2a5,aad06498fcd9699bee8bfb6bd9181943
1,1039,c8752f1ed50be0798e275b8114081c89,1ac480c8add024f6febd936dac34bf82,308f5f2b339b3ff74c9b60b49832d27f,1ac480c8add024f6febd936dac34bf82
2,1199,19068da38b84d254013e84e30bf65354,3aa676e51e8958a78394f6b6a07c2cea,d1492ba4a2fdf555510934e9740ea8c3,4aafdef164ce16bd8f6cefec25d01ca3
3,1489,9ade5d5ff09e2c399bfc75237f0ed590,e960bf0e27715aeae5620e6c60dead5f,4c19e9de0a90902ccd3b55374942fc5d,4c19e9de0a90902ccd3b55374942fc5d
4,1868,9c6ce0c23159646a6acf5dd8e9f39a19,337f6da1c3f672d989537305209c5347,4a12c85cadde3aaee76be944624a3516,269ef1b3c7933d8de1d8dcb834630a4c


In [454]:
n = 5
testX = getTestX(test_source, n)
freq5 = getFreq(trainX, n)
result5 = predict(testX, freq5, n)
# result3.head()


# 5 2
result5_2 = merge2freq(result5, freq2)
# [5,2] 20
result5_2["next_song_id"].fillna(result5_2["song_id_4"], inplace=True)
result = result5_2
result.head()

In [452]:
nan_count = result["next_song_id"].isna().sum()
print(nan_count / result.shape[0])

0.0


### Drop testX input

In [453]:
n = 5
drop_cols = [f"song_id_{i}" for i in range(n)]
result.drop(columns=drop_cols, inplace=True)

result.rename(columns={"next_song_id": "top1"}, inplace=True)

KeyError: "['song_id_3', 'song_id_4'] not found in axis"

### Duplicate Top1 for top2, top3, ... , top5

In [None]:
for i in range(2, 6):
    result[f"top{i}"] = result["top1"]
result.head()

Unnamed: 0,session_id,top1,top2,top3,top4,top5
0,598,aad06498fcd9699bee8bfb6bd9181943,aad06498fcd9699bee8bfb6bd9181943,aad06498fcd9699bee8bfb6bd9181943,aad06498fcd9699bee8bfb6bd9181943,aad06498fcd9699bee8bfb6bd9181943
1,1039,1ac480c8add024f6febd936dac34bf82,1ac480c8add024f6febd936dac34bf82,1ac480c8add024f6febd936dac34bf82,1ac480c8add024f6febd936dac34bf82,1ac480c8add024f6febd936dac34bf82
2,1199,4aafdef164ce16bd8f6cefec25d01ca3,4aafdef164ce16bd8f6cefec25d01ca3,4aafdef164ce16bd8f6cefec25d01ca3,4aafdef164ce16bd8f6cefec25d01ca3,4aafdef164ce16bd8f6cefec25d01ca3
3,1489,4c19e9de0a90902ccd3b55374942fc5d,4c19e9de0a90902ccd3b55374942fc5d,4c19e9de0a90902ccd3b55374942fc5d,4c19e9de0a90902ccd3b55374942fc5d,4c19e9de0a90902ccd3b55374942fc5d
4,1868,269ef1b3c7933d8de1d8dcb834630a4c,269ef1b3c7933d8de1d8dcb834630a4c,269ef1b3c7933d8de1d8dcb834630a4c,269ef1b3c7933d8de1d8dcb834630a4c,269ef1b3c7933d8de1d8dcb834630a4c


### Output to csv file

In [None]:
result.to_csv("results/result.csv", index=False)