# All necessary imports

In [None]:
import sys

In [None]:
sys.path.append('..')

In [None]:
from source.code.models.songfrequencybasedrecommender import SongFrequencyBasedRecommender

In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# Read the data

## Song extra info

In [None]:
data_directory = '../data/datasets/'

In [None]:
song_extra_info = pd.read_csv(os.path.join(data_directory, 'song_extra_info.csv'))

In [None]:
song_extra_info.head()

In [None]:
song_extra_info.info(verbose=True, null_counts=True)

## Train

In [None]:
train = pd.read_csv(os.path.join(data_directory, 'train.csv'), engine='python')

In [None]:
train.head()

In [None]:
train.info(verbose=True, null_counts=True)

In [None]:
train['data_from'] = 'train'

In [None]:
train.msno.nunique()

In [None]:
train.song_id.nunique()

In [None]:
train.source_system_tab.nunique()

In [None]:
train.source_system_tab.value_counts()

In [None]:
train.source_screen_name.nunique()

In [None]:
train.source_screen_name.value_counts()

In [None]:
train.source_type.nunique()

In [None]:
train.source_type.value_counts()

In [None]:
train.fillna('unknown', inplace=True)

In [None]:
train.info(verbose=True, null_counts=True)

## Test

In [None]:
test = pd.read_csv(os.path.join(data_directory, 'test.csv'), engine='python', index_col=0)

In [None]:
test.head()

In [None]:
test.info(verbose=True, null_counts=True)

In [None]:
test['target'] = None

In [None]:
test['data_from'] = 'test'

In [None]:
test.msno.nunique()

In [None]:
test.song_id.nunique()

In [None]:
test.source_system_tab.nunique()

In [None]:
test.source_system_tab.value_counts()

In [None]:
test.source_screen_name.nunique()

In [None]:
test.source_screen_name.value_counts()

In [None]:
test.source_type.nunique()

In [None]:
test.source_type.value_counts()

In [None]:
test.fillna('unknown', inplace=True)

In [None]:
test.info(verbose=True, null_counts=True)

## Songs

In [None]:
songs = pd.read_csv(os.path.join(data_directory, 'songs.csv'))

In [None]:
songs.head()

In [None]:
songs.info(verbose=True, null_counts=True)

In [None]:
songs.lyricist.fillna('unknown', inplace=True)

In [None]:
songs.composer.fillna('unknown', inplace=True)

In [None]:
songs.language.fillna(-1, inplace=True)

In [None]:
songs.language = songs.language.astype(np.int64)

In [None]:
songs.head()

In [None]:
songs.info(verbose=True, null_counts=True)

In [None]:
songs.song_length.nunique()

In [None]:
songs.genre_ids.nunique()

In [None]:
songs.artist_name.nunique()

In [None]:
songs.composer.nunique()

In [None]:
songs.lyricist.nunique()

In [None]:
songs.language.nunique()

## Members

In [None]:
members = pd.read_csv(os.path.join(data_directory, 'members.csv'))

In [None]:
members.head()

In [None]:
members.info(verbose=True, null_counts=True)

In [None]:
members.fillna('unknown', inplace=True)

In [None]:
members.info(verbose=True, null_counts=True)

In [None]:
members.head()

## Sample submission

In [None]:
sample_submission = pd.read_csv(os.path.join(data_directory, 'sample_submission.csv'))

In [None]:
sample_submission.head(10)

In [None]:
sample_submission.info(verbose=True, null_counts=True)

# Solutions

In [None]:
X, y = train[train.columns[:-2]], train[train.columns[-2]]

In [None]:
X_for_submission = test[test.columns[:-2]]

In [None]:
X.head()

In [None]:
y.head()

In [None]:
X_for_submission.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y
)

## Naive frequency approach

In [None]:
naive_frequency_recommender = SongFrequencyBasedRecommender()

In [None]:
print('\t', cross_val_score(naive_frequency_recommender, X_train, y_train, cv=5))

In [None]:
naive_frequency_recommender.fit(X_train, y_train)

In [None]:
y_pred = naive_frequency_recommender.predict(X_test)

In [None]:
fpr, tpr, _ = roc_curve(y_test, y_pred)

In [None]:
plt.figure(figsize=(10, 10))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

## FP-GROWTH

In [None]:
user2index = dict(zip(X_train.msno.unique(), range(len(X_train.msno.unique()))))

In [None]:
song2index = dict(zip(X_train.song_id.unique(), range(len(X_train.song_id.unique()))))

In [None]:
index2user = dict(zip(range(len(X_train.msno.unique())), X_train.msno.unique()))

In [None]:
index2song = dict(zip(range(len(X_train.song_id.unique())), X_train.song_id.unique()))

In [None]:
baskets = pd.DataFrame({'msno': X_train.msno.map(user2index), 'song_id': X_train.song_id.map(song2index)})

In [None]:
baskets.head()

In [None]:
baskets = baskets.groupby('msno')['song_id'].aggregate(list).reset_index()

In [None]:
baskets.head()

In [None]:
import pyfpgrowth
import sys
sys.setrecursionlimit(100000)

In [None]:
patterns = pyfpgrowth.find_frequent_patterns(baskets['song_id'].values, 2)

In [None]:
rules = pyfpgrowth.generate_association_rules(patterns, 0.1)

## SVD