# Preprocessing

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Columns of members.csv that are parsed into datetimes while reading.
date_columns = ['expiration_date', 'registration_init_time']

# Side tables, merged into the interaction log later on song_id / msno.
item_data = pd.read_csv('../data/songs.csv')
user_data = pd.read_csv(
    '../data/members.csv',
    parse_dates=date_columns,
)

# Interaction logs; the first column of the test file becomes its index.
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv', index_col=0)

  mask |= (ar1 == a)


In [3]:
# Stack train on top of test, then left-join the song and member attributes
# so that every interaction row is kept even when a lookup has no match.
all_data = (
    pd.concat([train_data, test_data])
    .merge(item_data, on='song_id', how='left')
    .merge(user_data, on='msno', how='left')
)

In [4]:
enc = LabelEncoder()

# Each group of columns is label-encoded after replacing missing values with
# the placeholder listed next to it: the string 'nan' for the first group,
# the number -2 for the second.
for columns, missing in [
    (
        [
            'msno', 'song_id', 'source_screen_name',
            'source_system_tab', 'source_type', 'genre_ids',
            'artist_name', 'composer', 'lyricist', 'gender',
        ],
        'nan',
    ),
    (['language', 'city', 'registered_via'], -2),
]:
    for col in columns:
        filled = all_data[col].fillna(missing)
        all_data[col] = enc.fit_transform(filled)

In [6]:
# Sanity check: show the dtype of every column after encoding.
print(all_data.dtypes)

msno                               int64
song_id                            int64
source_screen_name                 int64
source_system_tab                  int64
source_type                        int64
target                           float64
song_length                      float64
genre_ids                          int64
artist_name                        int64
composer                           int64
lyricist                           int64
language                           int64
city                               int64
bd                                 int64
gender                             int64
registered_via                     int64
registration_init_time    datetime64[ns]
expiration_date           datetime64[ns]
dtype: object


In [8]:
# 'time' is the row's index value divided by the total number of rows.
total_rows = len(all_data)
all_data['time'] = all_data.index / total_rows

# The first n rows are the original training rows, the rest the test rows.
n = len(train_data)
train_data, test_data = all_data[:n], all_data[n:]

for frame, path in (
    (train_data, '../data/train_data.hdf'),
    (test_data, '../data/test_data.hdf'),
):
    frame.to_hdf(path, key='wsdm')

# Create features

In [10]:
from itertools import combinations

import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy.sparse import coo_matrix
from lightfm import LightFM
from sklearn.preprocessing import LabelEncoder



In [11]:
# Names of the two date columns (kept defined for any later use).
date_cols = ['expiration_date', 'registration_init_time']

# pd.read_hdf has no `parse_dates` option (that belongs to read_csv), so it is
# no longer passed. The frames were written with datetime64 columns, which the
# HDF store is expected to return unchanged. The key is given explicitly and
# matches the one used when the files were written.
train_data = pd.read_hdf('../data/train_data.hdf', key='wsdm')
test_data = pd.read_hdf('../data/test_data.hdf', key='wsdm')

In [12]:
# Train rows followed by test rows; used for statistics over all rows.
all_data = pd.concat([train_data, test_data], axis=0)

In [19]:
# Test rows are featurised against the whole training set as history.
df_test = test_data
df_history_test = train_data

# Training windows and the history that precedes each of them.
df_final_trains = []
df_final_history_trains = []

n = len(test_data)
shift = int(0.05*len(train_data))

# Window k has the same length as the test set and ends k * shift rows before
# the end of the training data; everything before the window is its history.
for fold in range(2):
    offset = fold * shift
    start = -(n + offset)
    print (-offset, shift, fold, start, n)
    # A zero offset means "up to the end", which a slice spells as None.
    stop = -offset if offset else None
    df_final_trains.append(train_data[start:stop])
    df_final_history_trains.append(train_data[:start])
    print (len(df_final_trains[-1]))
    print (len(df_final_history_trains[-1]))

0 368870 0 -2556790 2556790
2556790
4820628
-368870 368870 1 -2925660 2556790
2556790
4451758


In [20]:
# Columns that are not treated as categorical keys.
not_categorical_columns = [
    'target', 'song_length',
    'registration_init_time', 'expiration_date',
    'time', 'bd',
]
categorical_columns = all_data.columns.difference(not_categorical_columns)

# For every categorical column: a power of ten computed from the column's
# maximum value. get_group uses it as the multiplier when it folds several
# columns into a single key.
orders = {
    col: 10 ** (int(np.log(all_data[col].max() + 1) / np.log(10)) + 1)
    for col in categorical_columns
}

In [21]:
# Show the multiplier chosen for each categorical column.
for col, order in orders.items():
    print (col, order)

artist_name 100000
city 100
song_id 1000000
gender 10
lyricist 100000
source_type 100
composer 100000
registered_via 10
genre_ids 1000
language 100
msno 100000
source_screen_name 100
source_system_tab 100


In [22]:
def get_group(df, cols):
    """Fold the columns ``cols`` of ``df`` into one numeric key per row.

    The key starts as a copy of the first column; every further column is
    folded in as ``key * orders[col] + df[col]``, where ``orders`` is the
    module-level table of per-column multipliers.

    Returns the combined key, indexed like ``df``; ``df`` is not modified.
    """
    key = df[cols[0]].copy()
    for position in range(1, len(cols)):
        name = cols[position]
        key = key * orders[name] + df[name]

    return key


def mean(df_history, df, cols):
    """Mean historical ``target`` of each row's group.

    Groups are the keys produced by ``get_group`` over ``cols``. The mean of
    ``target`` is computed per group on ``df_history`` and looked up for every
    row of ``df``; rows without a mapped value get -1.
    """
    history_key = get_group(df_history, cols)
    target_by_key = df_history.groupby(history_key).target.mean()

    row_key = get_group(df, cols)
    looked_up = row_key.map(target_by_key)

    return looked_up.fillna(-1)


def count(df_history, df, cols):
    """Number of rows of the global ``all_data`` that share each row's group.

    ``df_history`` is accepted for signature parity with the other feature
    functions but is not used. Rows whose key has no count get 0.
    """
    occurrences = get_group(all_data, cols).value_counts()
    row_key = get_group(df, cols)

    return row_key.map(occurrences).fillna(0)


def regression(df_history, df, cols):
    """Per-group linear trend of ``target`` over ``time``, evaluated on ``df``.

    For every group key (``get_group`` over ``cols``) with at least two rows
    in ``df_history``, a line ``target ~ a * time + b`` is fitted through the
    normal equations. Each row of ``df`` receives the fitted value at its own
    ``time``; rows whose group has fewer than two history rows get 0.5.

    Returns a plain list aligned with the rows of ``df``.
    """
    row_key = get_group(df, cols)
    history_key = get_group(df_history, cols)

    # Gather the (time, target) observations of every group in history order.
    times = {}
    targets = {}
    observations = df_history[['target', 'time']].values
    for (y, t), u in zip(observations, history_key):
        targets.setdefault(u, []).append(y)
        times.setdefault(u, []).append(t)

    # Solve (A^T A)^-1 A^T y with design matrix A = [time, 1] per group.
    line_by_key = {}
    for u, group_times in times.items():
        if len(group_times) > 1:
            A = np.vstack([group_times, np.ones(len(group_times))]).T
            line_by_key[u] = np.linalg.inv(A.T.dot(A)).dot(A.T).dot(targets[u])

    result = []
    for t, u in zip(df['time'], row_key):
        # Only groups with two or more history rows have a fitted line.
        if u in line_by_key:
            result.append(line_by_key[u].dot([t, 1]))
        else:
            result.append(0.5)

    return result


def time_from_prev_heard(df_history, df, cols):
    """Time elapsed since the row's group was last seen.

    The "last seen" time of each group starts as the last ``time`` recorded
    for it in ``df_history`` and is updated while walking ``df`` in order.
    Each row gets its ``time`` minus that value, or -1 if the group has not
    been seen before.

    Returns a plain list aligned with the rows of ``df``.
    """
    history_key = get_group(df_history, cols)
    last_heard = df_history.groupby(history_key).time.last().to_dict()

    row_key = get_group(df, cols)

    result = []
    for t, g in zip(df.time, row_key):
        result.append(t - last_heard[g] if g in last_heard else -1)
        last_heard[g] = t

    return result


def time_to_next_heard(df_history, df, cols):
    """Time difference to the next row of ``df`` with the same group key.

    ``df`` is walked in descending index order while remembering the time of
    the most recently visited row of each group. Each row gets its own
    ``time`` minus that remembered time, or -1 when no later row shares its
    key. ``df_history`` is not used.

    Returns a plain list in the original (ascending index) row order.
    """
    df_reverse = df.sort_index(ascending=False)
    reversed_key = get_group(df_reverse, cols)

    next_heard = {}
    reversed_result = []
    for g, t in zip(reversed_key, df_reverse.time):
        reversed_result.append(t - next_heard[g] if g in next_heard else -1)
        next_heard[g] = t

    # Values were collected back to front; restore the forward order.
    return reversed_result[::-1]


def count_from_future(df_history, df, cols):
    """Number of later rows of ``df`` that share the row's group key.

    ``df`` is traversed in descending index order, so the running tally of a
    key at the moment a row is visited equals the number of rows after it
    with the same key. ``df_history`` is not used.

    Returns a plain list in the original (ascending index) row order.
    """
    df_reverse = df.sort_index(ascending=False)
    reversed_key = get_group(df_reverse, cols)

    seen_later = {}
    reversed_result = []
    for g in reversed_key.values:
        already = seen_later.get(g, 0)
        reversed_result.append(already)
        seen_later[g] = already + 1

    return reversed_result[::-1]


def count_from_past(df_history, df, cols):
    """Number of earlier rows of ``df`` that share the row's group key.

    The first occurrence of a key gets 0, the second 1, and so on.
    ``df_history`` is not used. Returns a plain list aligned with ``df``.
    """
    row_key = get_group(df, cols)

    seen_before = {}
    result = []
    for g in row_key.values:
        # -1 for an unseen key makes its first occurrence count as 0.
        occurrences = seen_before.get(g, -1) + 1
        seen_before[g] = occurrences
        result.append(occurrences)

    return result


def last_time_diff(df_history, df, cols):
    """Gap between the row's ``time`` and the last ``time`` of its group.

    The last ``time`` of every group key is taken within ``df`` itself and
    the row's own ``time`` is subtracted from it. ``df_history`` is not used.
    """
    row_key = get_group(df, cols)
    final_time = df.groupby(row_key)['time'].last()

    group_final_time = row_key.map(final_time)
    return group_final_time - df.time


def part_of_unique_song(df):
    """Share of an artist's distinct songs that appear for the row's user.

    Both counts come from the global ``all_data``: distinct ``song_id`` per
    ``artist_name`` (denominator) and distinct ``song_id`` per
    (``msno``, ``artist_name``) pair (numerator). The ratio is looked up for
    every row of ``df``.
    """
    pair_cols = ['msno', 'artist_name']

    songs_per_artist = all_data.groupby('artist_name').song_id.nunique()
    songs_per_pair = all_data.groupby(get_group(all_data, pair_cols)).song_id.nunique()

    numerator = get_group(df, pair_cols).map(songs_per_pair)
    denominator = df.artist_name.map(songs_per_artist)

    return numerator / denominator


def matrix_factorization(df, df_history, no_components=50, learning_rate=0.1,
                         epochs=2, num_threads=50, random_state=None):
    """Score the rows of ``df`` with a LightFM model fitted on ``df_history``.

    "Users" of the model are the (``msno``, ``source_type``) groups produced
    by ``get_group`` and label-encoded over both frames; items are
    ``song_id`` values. The interaction matrix holds ``df_history.target`` at
    (user, song). User features are an identity block (one indicator per
    user group) stacked with an indicator of the group's ``msno``.

    Parameters
    ----------
    df : DataFrame
        Rows to score; needs ``msno``, ``source_type`` and ``song_id``.
    df_history : DataFrame
        Rows to fit on; additionally needs ``target``.
    no_components, learning_rate : passed to ``LightFM``.
    epochs, num_threads : passed to ``LightFM.fit``.
    random_state : passed to ``LightFM``; ``None`` keeps the library default.

    Returns
    -------
    The array returned by ``model.predict`` for the rows of ``df``.

    Neither input frame is modified: the encoded user ids are kept in local
    arrays instead of being written back as a ``user_id`` column, which used
    to alter the callers' frames (and assign into slices of them).
    """
    cols = ['msno', 'source_type']
    group = get_group(df, cols)
    group_history = get_group(df_history, cols)

    encoder = LabelEncoder()
    encoder.fit(pd.concat([group, group_history]))

    user_id = encoder.transform(group)
    user_id_history = encoder.transform(group_history)

    # The encoder was fitted on both frames, so the ids are 0..len(classes_)-1.
    num_users = len(encoder.classes_)
    num_items = max(df.song_id.max(), df_history.song_id.max()) + 1
    num_msno = max(df.msno.max(), df_history.msno.max()) + 1

    M = coo_matrix(
        (df_history.target, (user_id_history, df_history.song_id)),
        shape=(num_users, num_items)
    )

    # One row per distinct (msno, user group) pair across both frames.
    user_pairs = pd.DataFrame({
        'msno': np.concatenate([df.msno.values, df_history.msno.values]),
        'user_id': np.concatenate([user_id, user_id_history]),
    }).drop_duplicates()

    msno_indicator = coo_matrix(
        (np.ones(len(user_pairs)), (user_pairs.user_id, user_pairs.msno)),
        shape=(num_users, num_msno)
    )

    user_features = sp.hstack([sp.eye(num_users), msno_indicator])

    model = LightFM(
        no_components=no_components,
        learning_rate=learning_rate,
        random_state=random_state,
    )

    model.fit(
        M,
        epochs=epochs,
        num_threads=num_threads,
        user_features=user_features,
    )
    result = model.predict(
        user_id,
        df.song_id.values,
        user_features=user_features,
    )

    return result

In [23]:
def col_name(cols, func):
    """Feature column name: the names in ``cols`` and ``func.__name__`` joined by '_'."""
    return '{}_{}'.format('_'.join(cols), func.__name__)


def create_features(df, df_history):
    """Build the feature frame for the rows of ``df``.

    ``df_history`` supplies the past rows used by the history-based features.
    Columns are added in this order:

    1. six statistics for every single categorical column and every pair;
    2. mean/count for every triple, plus three sequence statistics when the
       triple contains ``msno``;
    3. per-group regression and time-from-previous features for fixed groups;
    4. raw numeric columns and the ordinal of the two date columns;
    5. a few derived columns and the matrix-factorization score;
    6. group means restricted to the last 500000 / 2000000 history rows.

    Returns the assembled DataFrame.
    """
    # Groups used both for the regression features and the windowed means.
    trend_groups = [
        ['msno'],
        ['msno', 'source_type'],
        ['msno', 'genre_ids'],
        ['msno', 'artist_name'],
        ['msno', 'composer'],
        ['msno', 'language'],
        ['song_id'],
    ]
    prev_heard_groups = [
        ['msno'],
        ['msno', 'genre_ids'],
        ['msno', 'composer'],
        ['msno', 'language'],
        ['msno', 'artist_name'],
    ]
    pair_funcs = [
        mean,
        count,
        time_to_next_heard,
        count_from_future,
        last_time_diff,
        count_from_past,
    ]
    msno_triple_funcs = [time_to_next_heard, last_time_diff, count_from_past]

    X = pd.DataFrame()

    # 1. Singles and pairs of categorical columns.
    for size in (1, 2):
        for combo in combinations(categorical_columns, size):
            for func in pair_funcs:
                X[col_name(combo, func)] = func(df_history, df, list(combo))

    # 2. Triples: the sequence statistics only for triples involving msno.
    for combo in combinations(categorical_columns, 3):
        funcs = [mean, count]
        if 'msno' in combo:
            funcs = funcs + msno_triple_funcs
        for func in funcs:
            X[col_name(combo, func)] = func(df_history, df, list(combo))

    # 3. Trend and recency features for the hand-picked groups.
    for group in trend_groups:
        X[col_name(group, regression)] = regression(df_history, df, group)

    for group in prev_heard_groups:
        X[col_name(group, time_from_prev_heard)] = \
            time_from_prev_heard(df_history, df, group)

    # 4. Raw numeric columns, then dates converted to ordinals.
    X['song_length'] = df['song_length']
    X['bd'] = df['bd']

    for date_col in ('expiration_date', 'registration_init_time'):
        X[date_col] = df[date_col].apply(lambda stamp: stamp.toordinal())

    # 5. Derived columns.
    X['part_song_listened'] = df['song_length'] / X['msno_time_to_next_heard']
    X['time_from_test_period'] = np.arange(len(df))
    X['part_of_unique_song'] = part_of_unique_song(df)

    X['matrix_factorization'] = matrix_factorization(df, df_history)

    # 6. Group means computed on only the most recent history rows.
    for window in (500000, 2000000):
        recent_history = df_history[-window:]
        for group in trend_groups:
            X[col_name(group, mean) + str(window)] = mean(recent_history, df, group)

    return X

In [None]:
print X.dtpyes

In [None]:
Xtest = create_features(df_test, df_history_test)
Xtrain0 = create_features(df_trains[0], df_history_trains[0])
Xtrain1 = create_features(df_trains[1], df_history_trains[1])

In [None]:
# Persist the feature frames for the fitting step.
Xtest.to_hdf('data/Xtest.hdf', key='abc')
Xtrain0.to_hdf('data/Xtrain0.hdf', key='abc')
Xtrain1.to_hdf('data/Xtrain1.hdf', key='abc')

# Targets of the two training windows. The windows live in df_final_trains
# (there is no `df_trains` variable).
df_final_trains[0].target.to_hdf('data/ytrain0.hdf', key='abc')
df_final_trains[1].target.to_hdf('data/ytrain1.hdf', key='abc')

# Fitting

In [None]:
import joblib
import xgboost
import catboost
import numpy as np
import pandas as pd

In [3]:
# Reload the stored feature frames and targets, in the order listed.
Xtrain0, ytrain0, Xtrain1, ytrain1, Xtest = map(pd.read_hdf, (
    'data/Xtrain0.hdf',
    'data/ytrain0.hdf',
    'data/Xtrain1.hdf',
    'data/ytrain1.hdf',
    'data/Xtest.hdf',
))

In [4]:
# Gradient-boosted tree classifiers; both use depth 7, learning rate 0.03,
# 50 threads and seed 1, and differ in the number of boosting rounds.
model_xgb = xgboost.XGBClassifier(
    n_estimators=750,
    max_depth=7,
    learning_rate=0.03,
    seed=1,
    nthread=50,
)
model_cb = catboost.CatBoostClassifier(
    loss_function='Logloss',
    iterations=2000,
    depth=7,
    learning_rate=0.03,
    random_seed=1,
    thread_count=50,
)

In [None]:
# For each training window: fit and predict once with all features and once
# without the matrix-factorization column, saving each prediction vector.
for X_fit, y_fit, with_mf_path, without_mf_path in [
    (Xtrain0, ytrain0, 'p0_xgb_mf', 'p0_xgb'),
    (Xtrain1, ytrain1, 'p1_xgb_mf', 'p1_xgb'),
]:
    model_xgb.fit(X_fit, y_fit)
    p = model_xgb.predict_proba(Xtest)[:,1]
    joblib.dump(p, with_mf_path)

    model_xgb.fit(X_fit.drop('matrix_factorization', axis=1), y_fit)
    p = model_xgb.predict_proba(Xtest.drop('matrix_factorization', axis=1))[:,1]
    joblib.dump(p, without_mf_path)

In [None]:
# CatBoost runs for each training window, with and without the
# matrix-factorization column. Predictions are saved under the p*_cb* names
# that the blending step loads; writing them to the p*_xgb* files overwrote
# the XGBoost predictions and left the CatBoost files missing.
for X_fit, y_fit, with_mf_path, without_mf_path in [
    (Xtrain0, ytrain0, 'p0_cb_mf', 'p0_cb'),
    (Xtrain1, ytrain1, 'p1_cb_mf', 'p1_cb'),
]:
    model_cb.fit(X_fit, y_fit)
    p = model_cb.predict_proba(Xtest)[:,1]
    joblib.dump(p, with_mf_path)

    model_cb.fit(X_fit.drop('matrix_factorization', axis=1), y_fit)
    p = model_cb.predict_proba(Xtest.drop('matrix_factorization', axis=1))[:,1]
    joblib.dump(p, without_mf_path)

# Blending

In [None]:
import pandas as pd
import numpy as np
import joblib

In [None]:
# Stored prediction vectors: window 0/1, with and without the
# matrix-factorization feature, for each of the two model families.
p0_xgb_mf, p0_xgb, p1_xgb_mf, p1_xgb = map(
    joblib.load, ('p0_xgb_mf', 'p0_xgb', 'p1_xgb_mf', 'p1_xgb')
)

p0_cb_mf, p0_cb, p1_cb_mf, p1_cb = map(
    joblib.load, ('p0_cb_mf', 'p0_cb', 'p1_cb_mf', 'p1_cb')
)

In [37]:
# Blend the two training windows of every model variant:
# weight 0.6 for window 0 and 0.4 for window 1.
w0, w1 = 0.6, 0.4

p_xgb_mf = w0 * p0_xgb_mf + w1 * p1_xgb_mf
p_xgb = w0 * p0_xgb + w1 * p1_xgb
p_cb_mf = w0 * p0_cb_mf + w1 * p1_cb_mf
p_cb = w0 * p0_cb + w1 * p1_cb

In [13]:
# Within each model family: 0.6 for the variant with the
# matrix-factorization feature, 0.4 for the variant without it.
p_x = 0.6 * p_xgb_mf + 0.4 * p_xgb
p_c = 0.6 * p_cb_mf + 0.4 * p_cb

# Across families: 0.6 CatBoost, 0.4 XGBoost.
p = 0.4 * p_x + 0.6 * p_c

In [41]:
# Two-column submission: 'id' is the row position, 'target' the blended score.
sub = pd.DataFrame({'target': p}).rename_axis('id').reset_index()
sub.to_csv('submission.csv', index=False)