In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score, make_scorer

In [4]:
train = pd.read_csv('../data/train.csv',
                    dtype={
                        'genre_id': 'category',
                        'media_id': 'category',
                        'context_type': 'category',
                        'platform_name': 'category',
                        'platform_family': 'category',
                        'listen_type': 'category',
                        'user_gender': np.bool,
                        'user_id': 'category',
                        'artist_id': 'category',
                        'is_listened': np.bool
                    })

In [5]:
sample = train.sample(frac=1)

In [6]:
X = sample.drop('is_listened', axis=1).as_matrix()

In [7]:
Y = sample.is_listened.as_matrix()

In [8]:
clf = GaussianNB()
pipe = Pipeline([('reduce_dim', PCA()), ('clf', clf)])
scores = cross_val_score(pipe, X, Y, cv=5, scoring=make_scorer(roc_auc_score))
scores

array([ 0.50591372,  0.50515413,  0.50764642,  0.50543652,  0.50566428])

In [9]:
pipe.fit(X, Y)

Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', GaussianNB(priors=None))])

In [10]:
train = pd.read_csv('../data/test.csv',
                    dtype={
                        'genre_id': 'category',
                        'media_id': 'category',
                        'context_type': 'category',
                        'platform_name': 'category',
                        'platform_family': 'category',
                        'listen_type': 'category',
                        'user_gender': np.bool,
                        'user_id': 'category',
                        'artist_id': 'category'
                    })

In [29]:
is_listened = pipe.predict_proba(train.drop('sample_id', axis=1).as_matrix())[:, 1]

In [32]:
output = pd.DataFrame({'sample_id': train.sample_id, 'is_listened': is_listened}, columns=['sample_id', 'is_listened'])

In [34]:
output.to_csv('submission.csv', index=False)