In [1]:
!nvidia-smi

Sat Nov 27 15:49:38 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    26W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.0.3-cp37-none-manylinux1_x86_64.whl (76.3 MB)
[K     |████████████████████████████████| 76.3 MB 1.3 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.3


In [3]:
!mkdir data
%cd data
!wget https://voudy-data.s3.eu-north-1.amazonaws.com/wsdm_data.tar.gz
!tar xvzf wsdm_data.tar.gz
%cd ..
!mkdir src

/content/data
--2021-11-27 15:49:51--  https://voudy-data.s3.eu-north-1.amazonaws.com/wsdm_data.tar.gz
Resolving voudy-data.s3.eu-north-1.amazonaws.com (voudy-data.s3.eu-north-1.amazonaws.com)... 52.95.169.76
Connecting to voudy-data.s3.eu-north-1.amazonaws.com (voudy-data.s3.eu-north-1.amazonaws.com)|52.95.169.76|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 613394184 (585M) [application/x-gzip]
Saving to: ‘wsdm_data.tar.gz’


2021-11-27 15:50:49 (10.3 MB/s) - ‘wsdm_data.tar.gz’ saved [613394184/613394184]

._members.csv
members.csv
._song_extra_info.csv
song_extra_info.csv
._songs.csv
songs.csv
._train.csv
train.csv
/content


In [4]:
!touch src/main.py
%cd src

# Part 1. Catboost

In [20]:
import functools
import catboost as cb
import pandas as pd
import numpy as np
from catboost import CatBoostRanker
from sklearn.model_selection import train_test_split

First we read the data

In [22]:
data_path = "../data/"
train = pd.read_csv(
    data_path + "train.csv",
    dtype={
        "msno": "category",
        "source_system_tab": "category",
        "source_screen_name": "category",
        "source_type": "category",
        "target": np.uint8,
        "song_id": "category",
    },
)
songs = pd.read_csv(
    data_path + "songs.csv",
    dtype={
        "genre_ids": "category",
        "language": "category",
        "artist_name": "category",
        "composer": "category",
        "lyricist": "category",
        "song_id": "category",
    },
)
members = pd.read_csv(
    data_path + "members.csv",
    dtype={"city": "category", "bd": np.uint8, "gender": "category", "registered_via": "category"},
    parse_dates=["registration_init_time", "expiration_date"],
)
songs_extra = pd.read_csv(data_path + "song_extra_info.csv")

Then split the dataset into 2 part. CV is not really necessary since the dataset is big enough.

In [23]:
train, val = train_test_split(train, train_size=0.8, random_state=42)

Now we have to preprocess datasets with additional information before merge.

In [24]:
def preprocess_data(members, songs, songs_extra):
    members["membership_days"] = (
        members["expiration_date"].subtract(members["registration_init_time"]).dt.days.astype(int)
    )

    members["registration_year"] = members["registration_init_time"].dt.year
    members["registration_month"] = members["registration_init_time"].dt.month
    members["registration_date"] = members["registration_init_time"].dt.day

    members["expiration_year"] = members["expiration_date"].dt.year
    members["expiration_month"] = members["expiration_date"].dt.month
    members["expiration_date"] = members["expiration_date"].dt.day
    members = members.drop(["registration_init_time"], axis=1)

    def isrc_to_year(isrc):
        if type(isrc) == str:
            if int(isrc[5:7]) > 17:
                return 1900 + int(isrc[5:7])
            else:
                return 2000 + int(isrc[5:7])
        else:
            return np.nan

    songs_extra["song_year"] = songs_extra["isrc"].apply(isrc_to_year)
    songs_extra.drop(["isrc", "name"], axis=1, inplace=True)

    return members, songs, songs_extra

In [25]:
print("Preprocessing data...")
members, songs, songs_extra = preprocess_data(members, songs, songs_extra)

Preprocessing data...


Merging all the data into 1 dataset.

In [26]:
def create_ds(df, songs, members, songs_extra):
    df = df.merge(songs, on="song_id", how="left")
    df = df.merge(members, on="msno", how="left")
    df = df.merge(songs_extra, on="song_id", how="left")

    df.song_length.fillna(200000, inplace=True)
    df.song_length = df.song_length.astype(np.uint32)
    df.song_id = df.song_id.astype("category")

    return df

In [28]:
print("Merging data...")
train = create_ds(train, songs, members, songs_extra)
val = create_ds(val, songs, members, songs_extra)

Merging data...


Feature engineering time!

In [29]:
def lyricist_count(x):
    if x == "no_lyricist":
        return 0
    else:
        return sum(map(x.count, ["|", "/", "\\", ";"])) + 1


def composer_count(x):
    if x == "no_composer":
        return 0
    else:
        return sum(map(x.count, ["|", "/", "\\", ";"])) + 1


def is_featured(x):
    if "feat" in str(x):
        return 1
    return 0


def artist_count(x):
    if x == "no_artist":
        return 0
    else:
        return x.count("and") + x.count(",") + x.count("feat") + x.count("&")


# is song language 17 or 45.
def song_lang_boolean(x):
    if "17.0" in str(x) or "45.0" in str(x):
        return 1
    return 0


def smaller_song(x, mean_song_length):
    if x < mean_song_length:
        return 1
    return 0


def process_df(df):
    df_agg = df.groupby("msno").aggregate("count")
    df_agg = df_agg[df_agg["song_id"] < 1024]

    users = set(df_agg.index)

    df = df[df["msno"].isin(users)]

    df["lyricist"] = df["lyricist"].cat.add_categories(["no_lyricist"])
    df["lyricist"].fillna("no_lyricist", inplace=True)
    df["lyricists_count"] = df["lyricist"].apply(lyricist_count).astype(np.int8)

    df["composer"] = df["composer"].cat.add_categories(["no_composer"])
    df["composer"].fillna("no_composer", inplace=True)
    df["composer_count"] = df["composer"].apply(composer_count).astype(np.int8)

    df["artist_name"] = df["artist_name"].cat.add_categories(["no_artist"])
    df["artist_name"].fillna("no_artist", inplace=True)
    df["is_featured"] = df["artist_name"].apply(is_featured).astype(np.int8)

    df["artist_count"] = df["artist_name"].apply(artist_count).astype(np.int8)

    # if artist is same as composer
    df["artist_composer"] = (np.asarray(df["artist_name"]) == np.asarray(df["composer"])).astype(np.int8)

    # if artist, lyricist and composer are all three same
    df["artist_composer_lyricist"] = (
        (np.asarray(df["artist_name"]) == np.asarray(df["composer"]))
        & np.asarray((df["artist_name"]) == np.asarray(df["lyricist"]))
        & np.asarray((df["composer"]) == np.asarray(df["lyricist"]))
    ).astype(np.int8)

    df["song_lang_boolean"] = df["language"].apply(song_lang_boolean).fillna(0).astype(np.int8)

    mean_song_length = np.mean(df["song_length"])

    df["smaller_song"] = (
        df["song_length"].apply(functools.partial(smaller_song, mean_song_length=mean_song_length)).astype(np.int8)
    )

    for col in df.columns:
        if df[col].dtype == object:
            df[col] = df[col].fillna("nan").astype('category')

    df = df.sort_values(by="msno")

    y = df.target
    q = df.msno
    X = df.drop(["target", "msno"], axis=1)

    return X, y, q

In [30]:
print("Processing data...")
X_train, y_train, q_train = process_df(train)
X_val, y_val, q_val = process_df(val)


Processing data...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

Since we're going to be using CatBoost we have to define cat features explicitly.

In [31]:
cat_features = list(X_train.select_dtypes(["category"]))

arr = [col for col in cat_features if X_train[col].isnull().values.any()]

X_train[arr] = X_train[arr].astype(str).fillna("nan").astype("category")
X_val[arr] = X_val[arr].astype(str).fillna("nan").astype("category")

In [32]:
train = cb.Pool(
    data=X_train,
    label=y_train,
    group_id=q_train,
    cat_features=cat_features
)

val = cb.Pool(
    data=X_val,
    label=y_val,
    group_id=q_val,
    cat_features=cat_features
)

Finally we can start training.

In [33]:
def train_cb(train, val):
    default_parameters = {
        'custom_metric': ["NDCG", "QueryAUC", "AUC", 'AverageGain:top=10'],
        'random_seed': 42,
        "loss_function": "YetiRank",
        "train_dir": "YetiRank",
        "metric_period": 50
    }

    parameters = {
        "learning_rate": 0.1,
        "iterations": 200,
        "task_type": "GPU",
        **default_parameters
    }

    model = CatBoostRanker(**parameters)
    model.fit(train, eval_set=val, plot=True, verbose=1)

    metrics = model.eval_metrics(data=val, metrics=["NDCG", "QueryAUC", "AUC"])

    return model, metrics

In [34]:
print("Fitting model...")
model, metrics = train_cb(train, val)

Fitting model...


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Groupwise loss function. OneHotMaxSize set to 10


Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric QueryAUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric AverageGain:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7349509	best: 0.7349509 (0)	total: 3.48s	remaining: 11m 32s
50:	test: 0.7821481	best: 0.7821481 (50)	total: 1m 54s	remaining: 5m 34s
100:	test: 0.7845300	best: 0.7845300 (100)	total: 3m 53s	remaining: 3m 49s
150:	test: 0.7853413	best: 0.7853413 (150)	total: 5m 57s	remaining: 1m 55s
199:	test: 0.7857538	best: 0.7857538 (199)	total: 8m 1s	remaining: 0us
bestTest = 0.7857538027
bestIteration = 199


In [35]:
for name, vals in metrics.items():
    print(f"{name}: {vals[-1]: .4f}")

NDCG:type=Base:  0.8726
QueryAUC:  0.5432
AUC:  0.6909


# Part 2. W2V embeddings