In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np 
from tqdm import tqdm

# Data Preprocessing

* Filtered Data

In [2]:
# NOTE(review): this notebook-level k is not read by any other cell shown here;
# merge_meta works with its own k and does not use this global.
k = 1

In [3]:
# Load the training source rows, ordered by session_id and then by listening_order,
# so that "first N rows per session" downstream follows the listening order.
train_raw_df = (
    pd.read_parquet("./datagame-2023/label_train_source.parquet")
    .sort_values(by=['session_id', 'listening_order'])
)

* We expand song IDs into columns

In [4]:

def convert(df, max_per_session=20):
    """Pivot a long (session_id, song_id) frame into one row per session.

    The first ``max_per_session`` rows of every session are kept, in the row
    order of ``df`` (callers sort beforehand if a particular order matters),
    and spread over columns ``top0``, ``top1``, ... by their position within
    the session.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``session_id`` and ``song_id``.
    max_per_session : int, default 20
        Maximum number of rows taken from each session.

    Returns
    -------
    pandas.DataFrame
        Columns ``session_id``, ``top0``, ``top1``, ...; positions a session
        does not reach are left missing.
    """
    leading_rows = df.groupby("session_id").head(max_per_session)
    # 0-based position of each kept row inside its own session.
    position = leading_rows.groupby("session_id").cumcount()
    new_df = leading_rows.pivot_table(values='song_id', index='session_id', columns=position, aggfunc='first')
    new_df.columns = [f'top{column}' for column in new_df.columns]
    return new_df.reset_index()

In [5]:
# Load the meta_song table.
meta_song = pd.read_parquet("./datagame-2023/meta_song.parquet")
# Left-join it onto every training row by song_id.
# NOTE(review): train_raw_meta is not referenced by any later cell shown in this notebook.
train_raw_meta = train_raw_df.merge(meta_song, how='left', on='song_id')

* For later use, metadata will be expanded into columns as well

In [6]:
def convert_meta(df, title, k, max_per_session=20):
    """Pivot one metadata column into per-session columns, keeping only frequent values.

    Rows whose ``title`` value is not among the ``k`` most frequent values of
    that column (counted over the whole of ``df``) are dropped first. Of the
    remaining rows, the first ``max_per_session`` per session are spread over
    columns ``{title}_top0``, ``{title}_top1``, ... in the row order of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``session_id`` and the column named by ``title``.
    title : str
        Name of the metadata column to pivot.
    k : int
        Number of most frequent values of ``title`` to keep.
    max_per_session : int, default 20
        Maximum number of surviving rows taken from each session.

    Returns
    -------
    pandas.DataFrame
        Columns ``session_id``, ``{title}_top0``, ...; sessions with no
        surviving row do not appear in the result.
    """
    # The k most frequent values of the column across the whole frame.
    top_values = df[title].value_counts().nlargest(k).index.tolist()

    # Keep only the rows carrying one of those values.
    df_top_k = df[df[title].isin(top_values)]

    # Same pivot as convert(): position within the session becomes the column.
    leading_rows = df_top_k.groupby("session_id").head(max_per_session)
    position = leading_rows.groupby("session_id").cumcount()
    new_df = leading_rows.pivot_table(values=title, index='session_id', columns=position, aggfunc='first')
    new_df.columns = [f'{title}_top{column}' for column in new_df.columns]

    return new_df.reset_index()

* The chosen features can be modified. Here, I pick the composer and artist IDs

In [7]:
class MetaData:
    """Namespace for the song metadata tables; each parquet file is read when this class body runs."""
    # One table per metadata file under ./datagame-2023/.
    composer_song = pd.read_parquet("./datagame-2023/meta_song_composer.parquet")
    # NOTE(review): meta_song.parquet is also loaded into a notebook-level meta_song in an earlier cell.
    meta_song = pd.read_parquet("./datagame-2023/meta_song.parquet")
    genre_song = pd.read_parquet("./datagame-2023/meta_song_genre.parquet")
    lyricist = pd.read_parquet("./datagame-2023/meta_song_lyricist.parquet")
    producer = pd.read_parquet("./datagame-2023/meta_song_producer.parquet")
    # Feature column name -> table that provides it; merge_meta iterates over this mapping.
    # genre_song, lyricist and producer are loaded above but are not part of the mapping.
    meta_map = {'composer_id': composer_song, 'artist_id': meta_song}

In [8]:

train_df = convert(train_raw_df)
# Number of distinct songs per session. nunique ignores missing values, whereas
# len(row.unique()) counted NaN as one more "song", so a session with only 19
# distinct songs plus an empty slot could reach 20 and slip through the filter.
train_df['variety'] = train_df.drop(columns="session_id").nunique(axis=1)
# Keep only sessions whose 20 slots (the cap applied by convert) are all different songs.
filtered_training_df = (
    train_df.loc[train_df['variety'] == 20]
    .drop(columns="variety")
    .reset_index(drop=True)
)

In [9]:
# CSV inputs read from the working directory.
excluded_data = pd.read_csv("./excluded_data.csv")
# NOTE(review): same_df is not referenced by any later cell shown in this notebook.
same_df = pd.read_csv("./same_df.csv")
# Test source rows, ordered the same way as the training source rows.
test_raw_df = (
    pd.read_parquet("./datagame-2023/label_test_source.parquet")
    .sort_values(by=['session_id', 'listening_order'])
)



In [10]:
def merge_meta(first, raw, k=2):
    """Add one column per entry of ``MetaData.meta_map`` to ``first`` and return it.

    For every ``key -> table`` pair, the raw rows are joined with the table on
    ``song_id``, pivoted with ``convert_meta`` and the first pivoted slot of
    each session is stored in ``first[key]``. Sessions without a value get a
    missing entry.

    Parameters
    ----------
    first : pandas.DataFrame
        One row per session with a ``session_id`` column. It is modified in
        place (pass a copy to keep the original untouched).
    raw : pandas.DataFrame
        Long-format rows with at least ``song_id`` and ``session_id``.
    k : int, default 2
        Number of most frequent metadata values kept by ``convert_meta``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``first``, with the new columns added.
    """
    for key, data in MetaData.meta_map.items():
        raw_with_meta = raw[['song_id', 'session_id']].merge(data, on='song_id', how='left')
        # session_id plus the first pivoted slot only.
        first_slot = convert_meta(raw_with_meta, key, k).iloc[:, :2]
        merged = first.merge(first_slot, on='session_id', how='left')
        # merge() returns a fresh 0..n-1 index; put first's labels back so the
        # assignment below pairs rows by position even when first has a
        # non-default index. This raises if the merge changed the row count.
        merged.index = first.index
        first[key] = merged.iloc[:, -1]
    return first
# merge_meta adds columns to the frame it receives, so pass a copy to leave
# filtered_training_df itself unchanged.
merged_df = merge_meta(filtered_training_df.copy(), train_raw_df)

In [11]:
# Same feature columns for the sessions in excluded_data, derived from the test source rows.
merged_test = merge_meta(excluded_data.copy(), test_raw_df)

# Model Building

* We use a TF-IDF vectorizer to convert the string data into numerical features

In [12]:
# TF-IDF vectorizer with default settings; it is fitted in the next cell.
v = TfidfVectorizer()

In [13]:
# One whitespace-joined text document per session, built from every column except session_id.
train_docs = merged_df.drop(columns="session_id").astype(str).apply(' '.join, axis=1).to_list()
test_docs = merged_test.drop(columns="session_id").astype(str).apply(' '.join, axis=1).to_list()
# Fit on training and test documents together; the training rows come first in the matrix.
matrix = v.fit_transform(train_docs + test_docs)


In [14]:
# Exact (brute-force) nearest-neighbour search under cosine distance,
# indexed over every row of the TF-IDF matrix.
knn = NearestNeighbors(algorithm='brute', metric='cosine')
knn.fit(matrix)

In [15]:
# The rows are joined with ' '.join in the next cell, which needs string items,
# so cast every column to str first and then restore session_id to int.
merged_test = merged_test.astype(str)
merged_test['session_id'] = merged_test['session_id'].astype(int)

In [16]:
# One whitespace-joined text document per query session (session_id left out).
processed_test_df = merged_test.drop(columns="session_id").apply(' '.join, axis=1)

* We split the prediction data into 100 segments

In [17]:

# Number of chunks the query documents are processed in.
N_SEGMENTS = 100
# Split the row positions and slice with iloc instead of handing the Series to
# np.array_split directly: on a pandas object array_split goes through
# swapaxes, which pandas has deprecated. The chunks are the same Series slices.
split_parts = [
    processed_test_df.iloc[positions]
    for positions in np.array_split(np.arange(len(processed_test_df)), N_SEGMENTS)
]

# Predicting

In [18]:


index_list = []

# The TF-IDF matrix was built from merged_df followed by merged_test, so fitted
# rows [0, n_train_rows) are training sessions and every later row is a query
# session.
n_train_rows = len(merged_df)

# split_parts is a list of pandas Series, one text document per query session.
for part in tqdm(split_parts, desc="Processing", unit="part"):
    if len(part) == 0:
        # Happens when there are fewer documents than chunks; nothing to look up.
        continue
    segment_matrix = v.transform(part)
    _, knn_indices = knn.kneighbors(segment_matrix, n_neighbors=10)
    for row_indices in knn_indices:
        # Neighbours come back nearest-first. Take the first one that is a
        # training row. The query's own row is a non-training row, so this
        # test already excludes it; dropping position 0 unconditionally could
        # discard the best training match when distances tie.
        nearest_train = next((index for index in row_indices if index < n_train_rows), None)
        index_list.append(nearest_train)


Processing:   0%|          | 0/100 [00:00<?, ?part/s]

Processing: 100%|██████████| 100/100 [06:38<00:00,  3.99s/part]


* Now that the predicted indices are collected, we look up their target segments

In [19]:
# Queries for which no training neighbour was found fall back to training row 0.
new_list = [0 if element is None else element for element in index_list]

In [20]:
# Load label_train_target.parquet (presumably the target songs of each training session — confirm).
# NOTE(review): the saved output of this cell is a FileNotFoundError for this path and the
# cells below carry no execution count, so make sure the file exists before running on.
target_df = pd.read_parquet('./datagame-2023/label_train_target.parquet')

FileNotFoundError: [Errno 2] No such file or directory: './datagame-2023/label_train_target.parquet'

* The target data is converted here

In [None]:
# Pivot the targets to one row per session (columns top0, top1, ...) with convert().
# NOTE(review): unlike train_raw_df, target_df is not sorted by session_id/listening_order
# before convert(), so the column order follows the file's row order — confirm that is intended.
converted_target = convert(target_df)

# Output and exporting

In [None]:
# sample.csv supplies the session_id list that the exported file is keyed on.
sample_df = pd.read_csv("./datagame-2023/sample.csv")
# included_data.csv is concatenated with the predictions built in this notebook before export.
processed_df = pd.read_csv('./included_data.csv')

In [None]:
# Session id of the training row matched to each query, in query order ...
matched_train_ids = filtered_training_df[['session_id']].iloc[new_list]
# ... with that training session's target columns attached.
final_df = matched_train_ids.merge(converted_target, how='left', on='session_id')

In [None]:

# Overwrite the matched training session ids with the session ids from excluded_data,
# the frame the queries were built from.
# The assignment pairs rows by index label, so it relies on final_df and excluded_data
# sharing the same labels in the same order — NOTE(review): confirm both are 0..n-1 here.
final_df['session_id'] = excluded_data['session_id']

In [None]:
# Only the session_id column of the sample file is needed; it is the left side of the final merge.
sample_id = sample_df[['session_id']]

In [None]:
# Rename the prediction columns to top1..top5 (convert() numbers them from 0).
final_df.columns = ['session_id', *(f'top{rank}' for rank in range(1, 6))]

In [None]:
# Stack the rows from included_data.csv on top of the predictions made here ...
all_predictions = pd.concat([processed_df, final_df], axis=0)
# ... line them up with the session ids of the sample file, and write the result.
submission = sample_id.merge(all_predictions, how='left', on='session_id')
submission.to_csv("data_with_some_features.csv", index=False)