# All necessary imports

In [None]:
import os

In [None]:
import sys

In [None]:
import warnings

In [None]:
import numpy as np

In [None]:
import pandas as pd

In [None]:
sys.path.append('..')

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import auc

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
warnings.filterwarnings("ignore")

In [None]:
from sklearn.metrics import roc_curve

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from source.code.models.svdbasedrecommender import SVDBasedRecommender

In [None]:
from source.code.models.songfrequencybasedrecommender import SongFrequencyBasedRecommender

In [None]:
from source.code.transformers.metafeaturesextractor import MetaFeaturesExtractor

# Read the data

In [None]:
data_directory = '../data/datasets/'

## Song extra info

In [None]:
song_extra_info = pd.read_csv(os.path.join(data_directory, 'song_extra_info.csv'))

In [None]:
song_extra_info.head()

In [None]:
song_extra_info.info(verbose=True, null_counts=True)

## Train

In [None]:
train = pd.read_csv(os.path.join(data_directory, 'train.csv'), engine='python')

In [None]:
train.head().T

In [None]:
train.info(verbose=True, null_counts=True)

In [None]:
train['data_from'] = 'train'

In [None]:
train.msno.nunique()

In [None]:
train.song_id.nunique()

In [None]:
train.source_system_tab.nunique()

In [None]:
train.source_system_tab.value_counts()

In [None]:
train.source_screen_name.nunique()

In [None]:
train.source_screen_name.value_counts()

In [None]:
train.source_type.nunique()

In [None]:
train.source_type.value_counts()

In [None]:
train.fillna('unknown', inplace=True)

In [None]:
train.info(verbose=True, null_counts=True)

In [None]:
train.target.value_counts()

## Test

In [None]:
test = pd.read_csv(os.path.join(data_directory, 'test.csv'), engine='python', index_col=0)

In [None]:
test.head().T

In [None]:
test.info(verbose=True, null_counts=True)

In [None]:
test['target'] = None

In [None]:
test['data_from'] = 'test'

In [None]:
test.msno.nunique()

In [None]:
test.song_id.nunique()

In [None]:
test.source_system_tab.nunique()

In [None]:
test.source_system_tab.value_counts()

In [None]:
test.source_screen_name.nunique()

In [None]:
test.source_screen_name.value_counts()

In [None]:
test.source_type.nunique()

In [None]:
test.source_type.value_counts()

In [None]:
test.fillna('unknown', inplace=True)

In [None]:
test.info(verbose=True, null_counts=True)

## Songs

In [None]:
songs = pd.read_csv(os.path.join(data_directory, 'songs.csv'))

In [None]:
songs.head(4).T

In [None]:
songs.info(verbose=True, null_counts=True)

In [None]:
# Replace missing lyricists with the 'unknown' placeholder.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute-accessed Series: the in-place form is
# chained assignment, which newer pandas versions warn about and which may
# leave `songs` unchanged when Copy-on-Write is enabled.
songs.lyricist = songs.lyricist.fillna('unknown')

In [None]:
# Replace missing composers with the 'unknown' placeholder.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute-accessed Series: the in-place form is
# chained assignment, which newer pandas versions warn about and which may
# leave `songs` unchanged when Copy-on-Write is enabled.
songs.composer = songs.composer.fillna('unknown')

In [None]:
# Replace missing genre ids with the 'unknown' placeholder.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute-accessed Series: the in-place form is
# chained assignment, which newer pandas versions warn about and which may
# leave `songs` unchanged when Copy-on-Write is enabled.
songs.genre_ids = songs.genre_ids.fillna('unknown')

In [None]:
# Replace missing language codes with the sentinel -1.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute-accessed Series: the in-place form is
# chained assignment, which newer pandas versions warn about and which may
# leave `songs` unchanged when Copy-on-Write is enabled. Any NaN left in the
# column would make a later integer cast of it fail.
songs.language = songs.language.fillna(-1)

In [None]:
songs.language = songs.language.astype(np.int64)

In [None]:
songs.head(4).T

In [None]:
songs.info(verbose=True, null_counts=True)

In [None]:
songs.song_length.nunique()

In [None]:
songs.genre_ids.nunique()

In [None]:
songs.artist_name.nunique()

In [None]:
songs.composer.nunique()

In [None]:
songs.lyricist.nunique()

In [None]:
songs.language.nunique()

## Members

In [None]:
members = pd.read_csv(os.path.join(data_directory, 'members.csv'))

In [None]:
members.head(4).T

In [None]:
members.info(verbose=True, null_counts=True)

In [None]:
members.fillna('unknown', inplace=True)

In [None]:
members.info(verbose=True, null_counts=True)

In [None]:
members.head(4).T

In [None]:
members.registration_init_time = pd.to_datetime(members.registration_init_time, format='%Y%m%d')

In [None]:
members.expiration_date = pd.to_datetime(members.expiration_date, format='%Y%m%d')

## Sample submission

In [None]:
sample_submission = pd.read_csv(os.path.join(data_directory, 'sample_submission.csv'))

In [None]:
sample_submission.head(10)

In [None]:
sample_submission.info(verbose=True, null_counts=True)

# Solutions

In [None]:
# Split `train` positionally: every column except the last two becomes the
# feature frame, and the second-to-last column becomes the label series.
# NOTE(review): this relies on 'data_from' having been appended as the last
# column and on the label being the final column of train.csv — confirm.
X = train[train.columns[:-2]]
y = train[train.columns[-2]]

In [None]:
X_for_submission = test[test.columns[:-2]]

In [None]:
X.head()

In [None]:
y.head()

In [None]:
X_for_submission.head()

In [None]:
# Hold out 30% of the rows for evaluation. Stratifying on y keeps the label
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.3
)

In [None]:
print(X_train.shape, y_train.shape)

In [None]:
print(type(X_train), type(y_train))

In [None]:
print(X_test.shape, y_test.shape)

In [None]:
print(type(X_test), type(y_test))

## Naive frequency approach

In [None]:
naive_frequency_recommender = SongFrequencyBasedRecommender()

In [None]:
print('\t', cross_val_score(naive_frequency_recommender, X_train, y_train, cv=5))

In [None]:
naive_frequency_recommender.fit(X_train, y_train)

In [None]:
y_pred = naive_frequency_recommender.predict(X_test)

In [None]:
fpr, tpr, _ = roc_curve(y_test, y_pred)

In [None]:
# ROC curve for the naive frequency recommender, drawn on a 10x10 inch figure.
plt.figure(figsize=(10, 10))
plt.plot(
    fpr,
    tpr,
    label='ROC curve (area = %0.2f)' % auc(fpr, tpr),
    color='darkorange',
    lw=2,
)
# Diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], linestyle='--', color='navy', lw=2)
plt.title('Receiver operating characteristic example (Naive frequency approach)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# The y-axis runs to 1.01, slightly past 1.0.
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.01)
plt.legend(loc="lower right")
plt.show()

## SVD

In [None]:
svd_recommender = SVDBasedRecommender()

In [None]:
print('\t', cross_val_score(svd_recommender, X_train, y_train, cv=5))

In [None]:
svd_recommender.fit(X_train, y_train)

In [None]:
y_pred = svd_recommender.predict(X_test)

In [None]:
fpr, tpr, _ = roc_curve(y_test, y_pred)

In [None]:
# ROC curve for the SVD recommender, drawn on a 10x10 inch figure.
plt.figure(figsize=(10, 10))
plt.plot(
    fpr,
    tpr,
    label='ROC curve (area = %0.2f)' % auc(fpr, tpr),
    color='darkorange',
    lw=2,
)
# Diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], linestyle='--', color='navy', lw=2)
plt.title('Receiver operating characteristic example (SVD)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# The y-axis runs to 1.01, slightly past 1.0.
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.01)
plt.legend(loc="lower right")
plt.show()

## FM

In [None]:
# Column names routed to the string-valued categorical branch of the preprocessor.
categorical_features = ['source_system_tab', 'source_screen_name', 'city', 'gender']

In [None]:
# 'language' gets its own categorical branch, kept separate from the
# string-valued categorical columns.
categorical_features_lang = ['language']

In [None]:
# Column names routed to the numeric (impute + discretize) branch of the preprocessor.
numerical_features = ['bd', 'song_length', 'days_registered']

In [None]:
# Numeric branch: fill NaNs with the column mean, then bucket each column into
# 4 bins emitted as dense one-hot indicator columns.
num_features_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean', missing_values=np.nan)),
    ('discretize', KBinsDiscretizer(encode='onehot-dense', n_bins=4)),
])

In [None]:
# Categorical branch: fill NaNs with the literal 'missing', then one-hot encode
# into a dense array. Categories unseen during fit are ignored at transform
# time rather than raising.
cat_features_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(fill_value='missing', strategy='constant', missing_values=np.nan)),
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

In [None]:
# Language branch: same shape as the other categorical pipeline, but NaNs are
# filled with the numeric sentinel -1 before dense one-hot encoding.
cat_features_pipeline_lang = Pipeline(steps=[
    ('impute', SimpleImputer(fill_value=-1, strategy='constant', missing_values=np.nan)),
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

In [None]:
# Route each column group through its own pipeline; each entry is
# (name, pipeline, columns).
preprocessor = ColumnTransformer([
    ('num', num_features_pipeline, numerical_features),
    ('cat', cat_features_pipeline, categorical_features),
    ('cat_lang', cat_features_pipeline_lang, categorical_features_lang),
])

In [None]:
# End-to-end feature pipeline: the meta-features step receives the `members`
# and `songs` frames, and its output is passed to the column-wise preprocessor.
# NOTE(review): presumably MetaFeaturesExtractor joins user/song metadata onto
# each interaction row — confirm against its implementation.
unified_pipeline = Pipeline([
    ('add_meta_info', MetaFeaturesExtractor(item_meta=songs, user_meta=members)),
    ('preprocessing', preprocessor),
])

In [None]:
X_train = unified_pipeline.fit_transform(X_train, y_train)

In [None]:
X_train.shape

In [None]:
from tffm import TFFMClassifier

In [None]:
import tensorflow as tf

In [None]:
# Factorization machine classifier fed with dense input and optimized with
# Adam at a learning rate of 0.001 for 100 epochs.
# NOTE(review): batch_size=-1 presumably means full-batch training in tffm — confirm.
model = TFFMClassifier(
    input_type='dense',
    order=6,
    rank=10,
    init_std=0.001,
    n_epochs=100,
    batch_size=-1,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
)

In [None]:
model.fit(X_train, y_train.values, show_progress=True)

In [None]:
X_test = unified_pipeline.transform(X_test)

In [None]:
X_test.shape

In [None]:
y_pred = model.predict_proba(X_test)

In [None]:
y_pred.shape

In [None]:
fpr, tpr, _ = roc_curve(y_test, y_pred[:, 1])

In [None]:
# ROC curve for the factorization machine (FM) model, drawn on a 10x10 inch figure.
plt.figure(figsize=(10, 10))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
# Diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.01])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# The title names the FM model; it previously said "(SVD)", copied over from
# the SVD section, which mislabeled this figure.
plt.title('Receiver operating characteristic example (FM)')
plt.legend(loc="lower right")
plt.show()