In [35]:
import os
import sys
import json
import random
import time
from typing import Dict, List, Tuple, Iterable

# Put the parent of the current working directory on the import path so that
# modules living one level above the notebook can be imported. The membership
# check keeps re-runs of this cell from appending duplicates.
parent_dir = os.path.abspath(os.pardir)
if not any(entry == parent_dir for entry in sys.path):
    sys.path.append(parent_dir)

import joblib
import pandas as pd
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.tensorboard import SummaryWriter
import torch.optim as optim
from tqdm import tqdm
from pandas.core.groupby.generic import DataFrameGroupBy

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [36]:
# Load the raw tables.
# Paths are relative to the notebook's working directory; the CSVs are read
# in the order listed and bound to the four frames used by the cells below.
train_df, songs_df, song_extra_info_df, members_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/sampled_train.csv",
        "../data/songs.csv",
        "../data/song_extra_info.csv",
        "../data/members.csv",
    )
)




In [37]:
# Preprocessing, step 1: missing values in songs_df.

# Per-column share of NaN entries (the mean of a boolean mask is the fraction
# of True values), displayed rounded to two decimals.
nan_dist = songs_df.isna().mean()
nan_dist.round(2)

song_id        0.00
song_length    0.00
genre_ids      0.04
artist_name    0.00
composer       0.47
lyricist       0.85
language       0.00
dtype: float64

In [38]:
# Replace missing values in these songs_df columns with the literal 'Unknown'.
# NOTE(review): `language` ends up as an object column mixing floats with the
# string 'Unknown' (visible in the songs_df['language'].unique() output later
# in the notebook); `song_length` presumably behaves the same way if it ever
# contains NaN. Confirm downstream encoders can handle the mixed types.
unknown_fill_cols = ['song_length', 'genre_ids', 'artist_name', 'language']
for song_col in unknown_fill_cols:
    songs_df[song_col] = songs_df[song_col].fillna("Unknown")

In [39]:
# Re-check the share of NaN per songs_df column after the 'Unknown' fill.
nan_dist = songs_df.isnull().sum().div(len(songs_df))
nan_dist.round(2)

song_id        0.00
song_length    0.00
genre_ids      0.00
artist_name    0.00
composer       0.47
lyricist       0.85
language       0.00
dtype: float64

In [40]:
# Missing composer / lyricist credits fall back to the song's artist_name.
for credit_col in ('composer', 'lyricist'):
    songs_df[credit_col] = songs_df[credit_col].fillna(songs_df['artist_name'])



In [41]:
# Final check: fraction of NaN left in each songs_df column.
nan_dist = songs_df.isna().mean()
nan_dist

song_id        0.0
song_length    0.0
genre_ids      0.0
artist_name    0.0
composer       0.0
lyricist       0.0
language       0.0
dtype: float64

In [42]:
# Fraction of missing values in each train_df column.
nan_dist = train_df.isna().mean()
nan_dist

msno                  0.000000
song_id               0.000000
source_system_tab     0.003408
source_screen_name    0.055134
source_type           0.002839
target                0.000000
dtype: float64

In [43]:
# Share of each observed value in source_system_tab and source_screen_name.
# Counts are divided by the total row count (NaN rows included in the
# denominator), so the shares of a column with missing values sum to < 1.
train_row_count = len(train_df)
source_system_tab_dist = train_df['source_system_tab'].value_counts().div(train_row_count)
source_screen_name_dist = train_df['source_screen_name'].value_counts().div(train_row_count)
for value_share in (source_system_tab_dist, source_screen_name_dist):
    print(value_share)

source_system_tab
my library      0.521440
discover        0.281812
search          0.080646
radio           0.061452
listen with     0.027663
explore         0.022460
notification    0.000800
settings        0.000318
Name: count, dtype: float64
source_screen_name
Local playlist more     0.460405
Online playlist more    0.167155
Radio                   0.061031
Album more              0.054944
Search                  0.038576
Artist more             0.033034
Discover Feature        0.032545
Discover Chart          0.026347
Others profile more     0.026296
My library              0.010886
Discover Genre          0.010294
Explore                 0.009790
Unknown                 0.006986
Discover New            0.002139
Search Home             0.001782
Search Trends           0.001757
My library_Search       0.000864
Self profile more       0.000030
Payment                 0.000004
Concert                 0.000002
Name: count, dtype: float64


In [44]:
# Give missing listening-context fields in train_df an explicit 'Unknown' value.
context_cols = ['source_system_tab', 'source_screen_name', 'source_type']
for context_col in context_cols:
    train_df[context_col] = train_df[context_col].fillna("Unknown")

In [45]:
# Re-check the share of NaN per train_df column after the fill.
nan_dist = train_df.isnull().sum().div(len(train_df))
nan_dist

msno                  0.0
song_id               0.0
source_system_tab     0.0
source_screen_name    0.0
source_type           0.0
target                0.0
dtype: float64

In [46]:
# Fraction of missing values in each members_df column.
nan_dist = members_df.isna().mean()
nan_dist

msno                      0.000000
city                      0.000000
bd                        0.000000
gender                    0.578496
registered_via            0.000000
registration_init_time    0.000000
expiration_date           0.000000
dtype: float64

In [47]:
# (disabled) fill NaN gender in members_df with 'Unknown' and treat implausible ages (bd <= 0 or bd >= 75) as NaN
# members_df['gender'] = members_df['gender'].fillna("Unknown")

# members_df['bd'] = members_df['bd'].apply(lambda x: np.nan if x <= 0 or x >= 75 else x)
# members_df.isna().sum() / len(members_df)


In [48]:
# # check how often gender == 'Unknown' coincides with a NaN bd (as a share of all members)
# members_df[members_df['gender']=='Unknown']['bd'].isna().sum() / len(members_df)

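## Current data preprocessing
- Members: drop the `bd` and `gender` columns (planned; not applied in the cells above)
- Songs: fill NaN in `song_length`, `genre_ids`, `artist_name`, and `language` with 'Unknown'; fill NaN `composer` and `lyricist` with `artist_name`
- Train: fill NaN `source_system_tab`, `source_screen_name`, and `source_type` with 'Unknown'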

In [49]:
# Distinct source_screen_name values (order of first appearance), after the 'Unknown' fill.
pd.unique(train_df['source_screen_name'])

array(['Explore', 'Local playlist more', 'My library',
       'Online playlist more', 'Unknown', 'Discover Feature', 'Radio',
       'Discover Chart', 'Artist more', 'Album more', 'Search Trends',
       'Others profile more', 'Search', 'Discover Genre',
       'My library_Search', 'Discover New', 'Search Home',
       'Self profile more', 'Payment', 'Concert'], dtype=object)

In [50]:
# Distinct source_system_tab values (order of first appearance), after the 'Unknown' fill.
train_df.loc[:, 'source_system_tab'].unique()

array(['explore', 'my library', 'discover', 'radio', 'search', 'Unknown',
       'listen with', 'notification', 'settings'], dtype=object)

In [51]:
# Distinct source_type values (order of first appearance), after the 'Unknown' fill.
train_df.source_type.unique()

array(['online-playlist', 'local-playlist', 'local-library',
       'song-based-playlist', 'album', 'radio', 'top-hits-for-artist',
       'Unknown', 'song', 'artist', 'listen-with',
       'topic-article-playlist'], dtype=object)

In [52]:
# Distinct language codes in songs_df; after the 'Unknown' fill this includes
# the string 'Unknown' alongside the numeric codes.
pd.unique(songs_df['language'])

array([3.0, 31.0, 52.0, 17.0, 10.0, -1.0, 24.0, 59.0, 45.0, 38.0,
       'Unknown'], dtype=object)

In [54]:
# Train / validation split. With shuffle=False the row order is preserved:
# the leading 80% of train_df becomes tr_df and the trailing 20% val_df.
tr_df, val_df = train_test_split(train_df, shuffle=False, test_size=0.2)

# Attach song metadata to the training rows (inner join: rows whose song_id
# is absent from songs_df are dropped), then turn the '|'-delimited genre_ids
# string into a list of genre tokens.
tr_song_df = pd.merge(tr_df, songs_df, on='song_id', how='inner').assign(
    genre_ids=lambda merged: merged['genre_ids'].astype(str).str.split('|')
)

# Attach member attributes (inner join on msno).
tr_mem_song_df = pd.merge(tr_song_df, members_df, on='msno', how='inner')