In [2]:
from pathlib import Path
import pandas as pd

data_root = Path("../kkdata3")
for x in data_root.glob("*"):
    print(x)

train_target = pd.read_parquet(data_root / "label_train_target.parquet")
test_source = pd.read_parquet(data_root / "label_test_source.parquet")
meta_song = pd.read_parquet(data_root / "meta_song.parquet")

../kkdata3/predict_result.parquet
../kkdata3/label_train_source.parquet
../kkdata3/label_train_target.parquet
../kkdata3/label_test_source.parquet
../kkdata3/meta_song_titletext.parquet
../kkdata3/sample.csv
../kkdata3/meta_song_lyricist.parquet
../kkdata3/meta_song_producer.parquet
../kkdata3/meta_song.parquet
../kkdata3/meta_song_genre.parquet
../kkdata3/meta_song_composer.parquet


In [3]:
train_source = pd.read_parquet(data_root / "label_train_source.parquet")
train_source.sort_values(["session_id", "listening_order"], inplace=True)
train_target.sort_values(["session_id", "listening_order"], inplace=True)
# test_source.sort_values(["session_id", "listening_order"], inplace=True)

In [4]:
# return n+1 column song id
def getTrainData(df, n=2):
    df = df.copy()
    # gen n song id be the dataset
    for i in range(1, n + 1):
        df[f"next{i}_song_id"] = df["song_id"].shift(-i)

    # check if last song id is in the same session
    df[f"next{n}_session_id"] = df["session_id"].shift(-n)
    df = df.query(f"session_id == next{n}_session_id")

    # only get the song_id and next1_song_id, next2_song_id, next3_song_id... column
    df = df[["song_id"] + [f"next{i}_song_id" for i in range(1, n + 1)]]

    return df

In [5]:
src = getTrainData(train_source, 19)
tgt = getTrainData(train_target, 4)

src.reset_index(drop=True, inplace=True)
tgt.reset_index(drop=True, inplace=True)

src.shape[0] == tgt.shape[0]

True

In [6]:
src[src.duplicated()].sort_values("song_id").sort_index()
tgt[src.duplicated()].sort_values("song_id").sort_index()

Unnamed: 0,song_id,next1_song_id,next2_song_id,next3_song_id,next4_song_id
217,20e137c89079bade9c596d92757afa59,dfcce1f1ae5fed1c038830ea5f4e559b,e1517d538da361e7dc58f77bf187d00c,0a7be36a7ea8d7bcb6dc705361fac0ea,916bf51f5bce48fecda79154c93e1ce5
590,34af8c382bee0cb49c80aa94a3d832d7,34af8c382bee0cb49c80aa94a3d832d7,34af8c382bee0cb49c80aa94a3d832d7,34af8c382bee0cb49c80aa94a3d832d7,34af8c382bee0cb49c80aa94a3d832d7
617,04cb62ee0fb4fb641e0e71d385f09a0d,04cb62ee0fb4fb641e0e71d385f09a0d,04cb62ee0fb4fb641e0e71d385f09a0d,04cb62ee0fb4fb641e0e71d385f09a0d,04cb62ee0fb4fb641e0e71d385f09a0d
618,04cb62ee0fb4fb641e0e71d385f09a0d,04cb62ee0fb4fb641e0e71d385f09a0d,04cb62ee0fb4fb641e0e71d385f09a0d,04cb62ee0fb4fb641e0e71d385f09a0d,04cb62ee0fb4fb641e0e71d385f09a0d
619,04cb62ee0fb4fb641e0e71d385f09a0d,04cb62ee0fb4fb641e0e71d385f09a0d,04cb62ee0fb4fb641e0e71d385f09a0d,04cb62ee0fb4fb641e0e71d385f09a0d,04cb62ee0fb4fb641e0e71d385f09a0d
...,...,...,...,...,...
572101,2575f3264b8562f85a068d8c1cb74632,2575f3264b8562f85a068d8c1cb74632,2575f3264b8562f85a068d8c1cb74632,2575f3264b8562f85a068d8c1cb74632,2575f3264b8562f85a068d8c1cb74632
572142,7b4ec012f3f5580e57add7139b1c6e02,8797a41daa53cab47d7aa0d75bd7dbb4,98e159f2fe90193c82209674b638ee1c,7b4ec012f3f5580e57add7139b1c6e02,66e6cbab82630d69acca2115157fbcba
572205,8fb82972d63dc711b8275b9330be82b6,8fb82972d63dc711b8275b9330be82b6,8fb82972d63dc711b8275b9330be82b6,8fb82972d63dc711b8275b9330be82b6,8fb82972d63dc711b8275b9330be82b6
572233,9ad9c63b533186746f04a612abca0860,7e0caaaa8bd33ad2c6e6000458e4b2b1,767730509063e5779182491d1668aac0,9fe2262287d3f0415f6e5c8999ee8a1c,fdef56717e5431c7b1c8325222f2406a


In [7]:
duplicates_mask = src.duplicated()

In [8]:
tgt_filtered = tgt[~duplicates_mask].sort_values("song_id").sort_index()
src_filtered = src[~duplicates_mask].sort_values("song_id").sort_index()
tgt_filtered.shape

(528072, 5)

In [9]:
# show duplicated index
src[src.duplicated(keep=False)].index

Index([   216,    217,    332,    589,    590,    616,    617,    618,    619,
          620,
       ...
       571868, 571869, 571870, 571872, 571971, 572101, 572142, 572205, 572233,
       572251],
      dtype='int64', length=50852)

In [10]:
# calculate ngram frequency.
# n=1 is the most frequent of song_id
# n=2 is the most frequent of [song_id and next1_song_id], get the result from next2_song_id column
def getFreq(df, n=2):
    df_train = getTrainData(df, n)
    # calculate ngram frequency
    df_freq = (
        df_train.groupby(["song_id"] + [f"next{i}_song_id" for i in range(1, n)])
        .value_counts(sort=True, normalize=True)
        .reset_index(name="freq")
    )

    # get the most frequent song_id, sort by freq
    df_freq = (
        df_freq.sort_values(
            ["song_id"] + [f"next{i}_song_id" for i in range(1, n)] + ["freq"],
            ascending=False,
        )
        .groupby(["song_id"] + [f"next{i}_song_id" for i in range(1, n)])
        .head(1)
    )
    return df_freq

In [11]:
# get testX dataset
def getTestX(df, n=2):
    trainX = pd.DataFrame()
    # if n == 3:, get listening_order : 18, 19, 20
    for i in range(n, 0, -1):
        _ = df.query(f"listening_order == {20-i+1}")[
            ["session_id", "song_id"]
        ].set_index("session_id")
        if trainX.empty:
            trainX = _
        else:
            trainX = trainX.join(_, lsuffix=f"_{n-i-1}")

    trainX = trainX.rename(columns={trainX.columns[-1]: f"song_id_{n-1}"})

    # trainX keep session_id
    trainX.reset_index(inplace=True)
    trainX.rename_axis("session_id", axis=1, inplace=True)
    return trainX


testX = getTestX(test_source, n=2)
testX.head(5)

session_id,session_id.1,song_id_0,song_id_1
0,598,8cd8008ddffd9b2d12b6b7bf17474e9e,e8bbc37dee09b8e548f0e2137dd1c2a5
1,1039,1ac480c8add024f6febd936dac34bf82,308f5f2b339b3ff74c9b60b49832d27f
2,1199,3aa676e51e8958a78394f6b6a07c2cea,d1492ba4a2fdf555510934e9740ea8c3
3,1489,e960bf0e27715aeae5620e6c60dead5f,4c19e9de0a90902ccd3b55374942fc5d
4,1868,337f6da1c3f672d989537305209c5347,4a12c85cadde3aaee76be944624a3516


In [12]:
# predict the next song id by ngram frequency table
def predict(df, freqTable, n=2):
    df = pd.merge(
        df,
        freqTable,
        how="left",
        left_on=[f"song_id_{i}" for i in range(n)],
        right_on=["song_id"] + [f"next{_}_song_id" for _ in range(1, n)],
    )
    for i in range(1, n):
        del df[f"next{i}_song_id"]
    del df["song_id"]
    df.rename(columns={f"next{n}_song_id": f"next_song_id"}, inplace=True)
    df.sort_values("session_id")
    return df

In [13]:
# merge the predicted result and new way freq table
def merge2freq(df, freq_table):
    # offset=2 : session_id, freq, next_song_id
    df_offset = 3
    # freq_offset: next_song_id, freq
    freq_offset = 2

    # Delete the unneed column:freq
    if "freq" in df.columns:
        df = df.drop(columns=["freq"])
        df_offset -= 1
    if "freq" in freq_table.columns:
        freq_table = freq_table.drop(columns=["freq"])
        freq_offset -= 1

    fomr_count = df.shape[1] - df_offset  # 2gram : 5 -3 = 2
    latr_count = freq_table.shape[1] - freq_offset  # 1gram freq: 3 - 2 = 1

    left_on = [f"song_id_{i+(fomr_count-latr_count)}" for i in range(latr_count)]
    right_on = ["song_id"] + [f"next{i+1}_song_id" for i in range(latr_count - 1)]
    print("Left on:", left_on)
    print("Right on:", right_on)

    print("r:" + f"song_id_{i}" for i in range(latr_count))
    merged = pd.merge(
        df,
        freq_table,
        how="left",
        left_on=[f"song_id_{i+(fomr_count-latr_count)}" for i in range(latr_count)],
        right_on=["song_id"] + [f"next{i+1}_song_id" for i in range(latr_count - 1)],
    )

    freq_target_name = f"next{latr_count}_song_id"
    merged["next_song_id"].fillna(merged[freq_target_name], inplace=True)

    merged = merged.drop(columns=right_on + [freq_target_name])

    return merged

# experiments

In [14]:
# map song_id to song_index and set song_idx to song to save memory and speed up
meta_song["song_index"] = meta_song.index
train_source = train_source.merge(
    meta_song[["song_id", "song_index"]], on="song_id", how="left"
)
train_target = train_target.merge(
    meta_song[["song_id", "song_index"]], on="song_id", how="left"
)
test_source = test_source.merge(
    meta_song[["song_id", "song_index"]], on="song_id", how="left"
)

train_source["song_id"] = train_source["song_index"]
train_target["song_id"] = train_target["song_index"]
test_source["song_id"] = test_source["song_index"]

del train_source["song_index"]
del train_target["song_index"]
del test_source["song_index"]

In [15]:
trainX = pd.concat([train_source, test_source], axis=0, ignore_index=True)

In [16]:
n = 1
testX = getTestX(test_source, n)
freq1 = getFreq(trainX, n)
freq1
result1 = predict(testX, freq1, n)
result1.head()

Unnamed: 0,session_id,song_id_0,next_song_id,freq
0,598,320485,320485.0,0.04993
1,1039,82103,245916.0,0.357143
2,1199,82260,82260.0,0.097738
3,1489,163516,58880.0,0.5
4,1868,736058,798178.0,0.069952


In [17]:
result1.sort_values(["song_id_0", "session_id"], inplace=True)
result1

Unnamed: 0,session_id,song_id_0,next_song_id,freq
64449,294268,21,21.0,0.049062
139062,301108,21,21.0,0.049062
117995,390969,21,21.0,0.049062
18743,541198,21,21.0,0.049062
25912,116369,39,270774.0,0.165517
...,...,...,...,...
53933,544689,1030708,736280.0,0.170886
135082,577739,1030708,736280.0,0.170886
72807,624199,1030708,736280.0,0.170886
112178,240878,1030709,915034.0,0.046632


In [18]:
n = 2
testX = getTestX(test_source, n)
freq2 = getFreq(trainX, n)
result2 = predict(testX, freq2, n)
freq2

Unnamed: 0,song_id,next1_song_id,next2_song_id,freq
10095227,1030711,967508.0,561432.0,1.0
10095226,1030711,967484.0,609494.0,1.0
10095225,1030711,946599.0,125355.0,1.0
10095224,1030711,936567.0,1023129.0,1.0
10095223,1030711,643112.0,6835.0,1.0
...,...,...,...,...
4,3,401307.0,318166.0,1.0
3,3,200709.0,1021838.0,1.0
2,3,87846.0,897082.0,1.0
1,1,1014814.0,799780.0,1.0


In [19]:
freq2.sort_values(["song_id", "next1_song_id"], inplace=True)
freq2

Unnamed: 0,song_id,next1_song_id,next2_song_id,freq
0,1,488417.0,697989.0,1.0
1,1,1014814.0,799780.0,1.0
2,3,87846.0,897082.0,1.0
3,3,200709.0,1021838.0,1.0
4,3,401307.0,318166.0,1.0
...,...,...,...,...
10095223,1030711,643112.0,6835.0,1.0
10095224,1030711,936567.0,1023129.0,1.0
10095225,1030711,946599.0,125355.0,1.0
10095226,1030711,967484.0,609494.0,1.0


In [20]:
n = 2
testX = getTestX(test_source, n)
freq2 = getFreq(trainX, n)
result2 = predict(testX, freq2, n)
# result2.head()

# 2 1
result2_1 = merge2freq(result2, freq1)
result2_1.head()

Left on: ['song_id_1']
Right on: ['song_id']
<generator object merge2freq.<locals>.<genexpr> at 0x7f2b8f60c820>


Unnamed: 0,session_id,song_id_0,song_id_1,next_song_id
0,598,487545,320485,801419.0
1,1039,388680,82103,245916.0
2,1199,187475,82260,82260.0
3,1489,672342,163516,58880.0
4,1868,798178,736058,1001782.0


In [21]:
n = 3
testX = getTestX(test_source, n)
freq3 = getFreq(trainX, n)
result3 = predict(testX, freq3, n)
# result3.head()


# 3 2
result3_2 = merge2freq(result3, freq2)
# [3,2] 1
result3_2_1 = merge2freq(result3_2, freq1)
# [3,2,1] 20
result3_2_1["next_song_id"].fillna(result3_2_1["song_id_2"], inplace=True)
result = result3_2_1
result.head()

Left on: ['song_id_1', 'song_id_2']
Right on: ['song_id', 'next1_song_id']
<generator object merge2freq.<locals>.<genexpr> at 0x7f2b8f60ceb0>
Left on: ['song_id_1']
Right on: ['song_id']
<generator object merge2freq.<locals>.<genexpr> at 0x7f2b8f60ceb0>


Unnamed: 0,session_id,song_id_0,song_id_1,song_id_2,next_song_id
0,598,309466,487545,320485,801419.0
1,1039,810606,388680,82103,388680.0
2,1199,524289,187475,82260,419795.0
3,1489,674238,672342,163516,705778.0
4,1868,935208,798178,736058,710126.0


In [22]:
n = 5
testX = getTestX(test_source, n)
freq5 = getFreq(trainX, n)
result5 = predict(testX, freq5, n)

# 5 2
result5_2 = merge2freq(result5, freq2)

# [5,2] 20
result5_2["next_song_id"].fillna(result5_2["song_id_4"], inplace=True)
result = result5_2
result.head()

Left on: ['song_id_3', 'song_id_4']
Right on: ['song_id', 'next1_song_id']
<generator object merge2freq.<locals>.<genexpr> at 0x7f2b8f60c9e0>


Unnamed: 0,session_id,song_id_0,song_id_1,song_id_2,song_id_3,song_id_4,next_song_id
0,598,442405,834888,309466,487545,320485,801419.0
1,1039,229992,252014,810606,388680,82103,82103.0
2,1199,524289,976710,524289,187475,82260,82260.0
3,1489,847611,642783,674238,672342,163516,163516.0
4,1868,742718,856473,935208,798178,736058,710126.0


In [84]:
# calculate ngram frequency.
# n=1 is the most frequent of song_id
# n=2 is the most frequent of [song_id and next1_song_id], get the result from next2_song_id column
def getFreq2(df, n=2, predict=1, threshold_value=0):
    # get df's n+predict numbers of song id
    df_train = getTrainData(df, n + predict - 1)
    # calculate ngram frequency
    df_freq = (
        df_train.groupby(["song_id"] + [f"next{i}_song_id" for i in range(1, n)])
        .value_counts(sort=True, normalize=True)
        .reset_index(name="freq")
    )

    # get the most frequent song_id, sort by freq
    df_freq = (
        df_freq.sort_values(
            ["song_id"] + [f"next{i}_song_id" for i in range(1, n)] + ["freq"],
            ascending=False,
        )
        .groupby(["song_id"] + [f"next{i}_song_id" for i in range(1, n)])
        .head(1)
    )

    df_freq = df_freq[df_freq["freq"] > threshold_value]
    return df_freq


# predict the next song id by ngram frequency table
def predict2(df, freqTable, n=2, target=2):
    df = pd.merge(
        df,
        freqTable,
        how="left",
        left_on=[f"song_id_{i}" for i in range(n)],
        right_on=["song_id"] + [f"next{_}_song_id" for _ in range(1, n)],
    )
    for i in range(1, n):
        del df[f"next{i}_song_id"]
    del df["song_id"]
    for i in range(target):
        df.rename(columns={f"next{n+i}_song_id": f"top{i+1}"}, inplace=True)
    df.sort_values("session_id")
    return df

In [85]:
# 5gen5
n = 5
target = 5
testX = getTestX(test_source, n)
freq5_5 = getFreq2(trainX, n, target, 0.7)
result5_5 = predict2(testX, freq5_5, n, target)

# 3gen5
n = 3
target = 5
freq3_5 = getFreq2(trainX, n, target, 0.7)
result3_5 = predict2(testX, freq3_5, n, target)

result_5 = result5_5.fillna(result3_5)


result_5.head()

Unnamed: 0,session_id,song_id_0,song_id_1,song_id_2,song_id_3,song_id_4,top1,top2,top3,top4,top5,freq
0,598,442405,834888,309466,487545,320485,,,,,,
1,1039,229992,252014,810606,388680,82103,,,,,,
2,1199,524289,976710,524289,187475,82260,,,,,,
3,1489,847611,642783,674238,672342,163516,,,,,,
4,1868,742718,856473,935208,798178,736058,,,,,,


In [86]:
n = 5
target = 4
freq = getFreq2(trainX, n, target, 0.7)
result = predict2(testX, freq, n, target)

result_5 = result_5.fillna(result)

n = 5
target = 3
freq = getFreq2(trainX, n, target, 0.7)
result = predict2(testX, freq, n, target)


result_5 = result_5.fillna(result)

In [87]:
n = 5
target = 2
freq = getFreq2(trainX, n, target, 0.7)
result = predict2(testX, freq, n, target)

result_5 = result_5.fillna(result)

In [88]:
n = 5
freq5 = getFreq(trainX, n)

# cal 5 2 1 20
testX = predict(testX, freq5, n)
testX = merge2freq(testX, freq2)
testX = merge2freq(testX, freq1)
testX["next_song_id"].fillna(testX["song_id_4"], inplace=True)

# combine
result_5["top1"] = result_5["top1"].fillna(testX["next_song_id"])

Left on: ['song_id_3', 'song_id_4']
Right on: ['song_id', 'next1_song_id']
<generator object merge2freq.<locals>.<genexpr> at 0x7f2ab5260200>
Left on: ['song_id_3']
Right on: ['song_id']
<generator object merge2freq.<locals>.<genexpr> at 0x7f2ab5260200>


In [89]:
col_name = [
    "session_id",
    "song_id_0",
    "song_id_1",
    "song_id_2",
    "song_id_3",
    "song_id_4",
]
testX = testX.drop(columns="song_id_0")
testX.columns = col_name

# cal 5 2 20
result5 = predict(testX, freq5, n)
testX = merge2freq(result5, freq2)
testX["next_song_id"].fillna(testX["song_id_4"], inplace=True)

# combine
result_5["top2"] = result_5["top2"].fillna(testX["next_song_id"])

Left on: ['song_id_3', 'song_id_4']
Right on: ['song_id', 'next1_song_id']
<generator object merge2freq.<locals>.<genexpr> at 0x7f299c8e8120>


In [90]:
col_name = [
    "session_id",
    "song_id_0",
    "song_id_1",
    "song_id_2",
    "song_id_3",
    "song_id_4",
]
testX = testX.drop(columns="song_id_0")
testX.columns = col_name

# cal 5 2 20
result5 = predict(testX, freq5, n)
testX = merge2freq(result5, freq2)
testX["next_song_id"].fillna(testX["song_id_4"], inplace=True)

# combine
result_5["top3"] = result_5["top3"].fillna(testX["next_song_id"])

Left on: ['song_id_3', 'song_id_4']
Right on: ['song_id', 'next1_song_id']
<generator object merge2freq.<locals>.<genexpr> at 0x7f299c8eaf10>


In [91]:
col_name = [
    "session_id",
    "song_id_0",
    "song_id_1",
    "song_id_2",
    "song_id_3",
    "song_id_4",
]
testX = testX.drop(columns="song_id_0")
testX.columns = col_name

# cal 5 2 20
result5 = predict(testX, freq5, n)
testX = merge2freq(result5, freq2)
testX["next_song_id"].fillna(testX["song_id_4"], inplace=True)

# combine
result_5["top4"] = result_5["top4"].fillna(testX["next_song_id"])

Left on: ['song_id_3', 'song_id_4']
Right on: ['song_id', 'next1_song_id']
<generator object merge2freq.<locals>.<genexpr> at 0x7f299c8eaea0>


In [92]:
col_name = [
    "session_id",
    "song_id_0",
    "song_id_1",
    "song_id_2",
    "song_id_3",
    "song_id_4",
]
testX = testX.drop(columns="song_id_0")
testX.columns = col_name

# cal 5 2 20
result5 = predict(testX, freq5, n)
testX = merge2freq(result5, freq2)
testX["next_song_id"].fillna(testX["song_id_4"], inplace=True)

# combine
result_5["top5"] = result_5["top5"].fillna(testX["next_song_id"])

Left on: ['song_id_3', 'song_id_4']
Right on: ['song_id', 'next1_song_id']
<generator object merge2freq.<locals>.<genexpr> at 0x7f299c8e9d20>


In [93]:
result = result_5[["session_id", "top1", "top2", "top3", "top4", "top5"]]
result.head()

Unnamed: 0,session_id,top1,top2,top3,top4,top5
0,598,801419.0,946515.0,660101.0,413130.0,320485.0
1,1039,388680.0,388680.0,388680.0,388680.0,388680.0
2,1199,419795.0,419795.0,402292.0,454265.0,790371.0
3,1489,705778.0,705778.0,705778.0,705778.0,705778.0
4,1868,710126.0,1026681.0,970487.0,838919.0,770570.0


### Output to csv file

In [94]:
def songindex2songid(df):
    _df = df.copy()
    count = _df.shape[1]
    for i in range(1, 6):
        _df = _df.rename(columns={f"top{i}": "song_index"})
        _df = _df.merge(
            meta_song[["song_index", "song_id"]], on="song_index", how="left"
        )
        _df.rename(columns={"song_id": f"top{i}"}, inplace=True)
        _df.drop(columns=["song_index"], inplace=True)

    return _df

In [95]:
res = songindex2songid(result)

In [96]:
res.to_csv("results/result.csv", index=False)