In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import igraph
from sklearn.neighbors import LSHForest
from sklearn.preprocessing import normalize
import random
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import euclidean_distances
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import math
from sklearn.metrics.pairwise import manhattan_distances
from scipy.stats import entropy
import os
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model.logistic import LogisticRegression

%matplotlib notebook



### Parameters

In [17]:
n_latent = 100 # number of latent variables in LDA
n_components = 200 # number of components in SVD
rare_thresh = 1 # if song occurs less than rare_thresh times in train set we replace its score by its genre score
pair_type = 'genre' # media or genre

In [3]:
def get_bow_matrix(data, pair_type):
# return matrix of size n_users*n_medias with cf_mat[i,j]==n iff user i listened to media j n times
# and mappings user_id -> row of matrix, media_id -> column of matrix
    positive_data = data.loc[data['is_listened']==1]
    user_ids = dict([(j,i) for  i,j in enumerate(positive_data['user_id'].unique())])
    media_ids = dict([(j,i) for  i,j in enumerate(positive_data[pair_type+'_id'].unique())])
    n_users = len(user_ids)
    n_medias = len(media_ids)
    bow = sparse.dok_matrix((n_users,n_medias))
    for r in tqdm(positive_data.iterrows()):
        if r[1]['is_listened']:
            bow[user_ids[r[1]['user_id']],media_ids[r[1][pair_type+'_id']]] += 1
    return user_ids, media_ids, bow

In [4]:
def get_tfidf_matrix(bow_matrix):
# return reweighted bow matrix
    # how many users have listened to each song
    media_occurences = np.log(1.+np.array((bow_matrix>0).sum(axis=0))).squeeze()
    # to how many songs has listened each user
    user_occurences = np.array((bow_matrix).sum(axis=1))
    tfidf_matrix = sparse.dok_matrix(bow_matrix.shape)
    for idx, value in tqdm(bow_matrix.items()):
        tfidf_matrix[idx[0],idx[1]] = value / user_occurences[idx[0]]
        tfidf_matrix[idx[0],idx[1]] /= media_occurences[idx[1]]
    return tfidf_matrix

In [5]:
def add_ids(ids, vectors):
# add ids from dictionary 'ids' to matrix 'vectors' as a column
    vectors_ext = np.zeros([vectors.shape[0], vectors.shape[1]+1])
    vectors_ext[:vectors.shape[0],:vectors.shape[1]] = vectors
    for j,i in ids.iteritems():
        vectors_ext[i,vectors.shape[1]] = j
    return vectors_ext

def extract_ids(vectors_ext):
# extract ids to dictionary 'ids' from last column of matrix 'vectors_ext'
    vectors = vectors_ext[:vectors_ext.shape[0],:-1]
    ids = {}
    for i in range(vectors_ext.shape[0]):
        ids[vectors_ext[i,-1]] = i
    return ids, vectors

In [6]:
def histogram_intersection(v1,v2):
    return np.sum(np.min(np.array([v1.squeeze(),v2.squeeze()]), axis=0))
def hellinger_distance(v1,v2):
    return euclidean_distances(np.sqrt(v1),np.sqrt(v2))/math.sqrt(2)

## Loading data

In [7]:
path = os.path.join(os.curdir,'new_user_'+pair_type+'_features')
if 'new_user_'+pair_type+'_features' not in os.listdir(os.curdir):
    os.mkdir(path)

In [8]:
train_data = pd.read_csv('data/train.csv',sep=',', usecols=[pair_type+'_id','user_id','is_listened'])
test_data = pd.read_csv('data/test.csv',sep=',', usecols=['sample_id',pair_type+'_id','user_id'], index_col=0)

## Calculating matrices

In [10]:
user_ids, media_ids, bow_matrix = get_bow_matrix(train_data, pair_type)

5170492it [17:58, 4795.80it/s]


In [11]:
tfidf_matrix = get_tfidf_matrix(bow_matrix)

100%|██████████| 350605/350605 [01:01<00:00, 5746.99it/s]


## Calculating LDA features

In [12]:
lda = LatentDirichletAllocation(n_topics=n_latent, learning_method='batch').fit(bow_matrix)
user_lda_vectors = lda.transform(bow_matrix)
media_lda_vectors = lda.components_.T
user_lda_vectors = normalize(user_lda_vectors, norm='l1')
media_lda_vectors = normalize(media_lda_vectors, norm='l1')

In [13]:
user_lda_vectors_ext = add_ids(user_ids, user_lda_vectors)
with open(os.path.join(path,'user_lda_vectors.npy'),'w') as f:
    np.save(f, user_lda_vectors_ext)
media_lda_vectors_ext = add_ids(media_ids, media_lda_vectors)
with open(os.path.join(path,pair_type+'_lda_vectors.npy'),'w') as f:
    np.save(f, media_lda_vectors_ext)

## Calculating LSI features

In [14]:
user_lsi_vectors,s,vt = sparse.linalg.svds(tfidf_matrix, k=n_components)
media_lsi_vectors = vt.T

In [15]:
user_lsi_vectors_ext = add_ids(user_ids, user_lsi_vectors)
with open(os.path.join(path,'user_lsi_vectors.npy'),'w') as f:
    np.save(f, user_lsi_vectors_ext)
media_lsi_vectors_ext = add_ids(media_ids, media_lsi_vectors)
with open(os.path.join(path,pair_type+'_lsi_vectors.npy'),'w') as f:
    np.save(f, media_lsi_vectors_ext)

## Calculating LDA-based similarity

In [16]:
with open(os.path.join(path,'user_lda_vectors.npy'),'r') as f:
    user_lda_vectors_ext = np.load(f)
user_ids, user_lda_vectors = extract_ids(user_lda_vectors_ext)
with open(os.path.join(path,pair_type+'_lda_vectors.npy'),'r') as f:
    media_lda_vectors_ext = np.load(f)
media_ids, media_lda_vectors = extract_ids(media_lda_vectors_ext)

In [17]:
lda_counter = 0
def lda_similarity(user_id,media_id):
    # return Hellinger similarity between users and medias
    # return 0 for unknown users and medias
    global lda_counter
    lda_counter += 1
    if lda_counter%300000==0:
        print lda_counter
    if user_id in user_ids and media_id in media_ids:
        i = user_ids[user_id]
        j = media_ids[media_id]
        return float(1-hellinger_distance(media_lda_vectors[j,:].reshape([1,-1]),user_lda_vectors[i,:].reshape([1,-1])))
    return 0.

In [18]:
train_data['lda_similarity'] = train_data[['user_id',pair_type+'_id']].apply(lambda rec : lda_similarity(rec['user_id'],rec[pair_type+'_id']), axis=1)

300000
600000
900000
1200000
1500000
1800000
2100000
2400000
2700000
3000000
3300000
3600000
3900000
4200000
4500000
4800000
5100000
5400000
5700000
6000000
6300000
6600000
6900000
7200000
7500000


In [19]:
test_data['lda_similarity'] = test_data[['user_id',pair_type+'_id']].apply(lambda rec : lda_similarity(rec['user_id'],rec[pair_type+'_id']), axis=1)

In [20]:
train_data[['lda_similarity']].to_csv(os.path.join(path,'./train_lda_similarity.csv'))
test_data[['lda_similarity']].to_csv(os.path.join(path,'./test_lda_similarity.csv'))

## Calculating LSI-based similarity

In [21]:
with open(os.path.join(path,'user_lsi_vectors.npy'),'r') as f:
    user_lsi_vectors_ext = np.load(f)
user_ids, user_lsi_vectors = extract_ids(user_lsi_vectors_ext)
with open(os.path.join(path,pair_type+'_lsi_vectors.npy'),'r') as f:
    media_lsi_vectors_ext = np.load(f)
media_ids, media_lsi_vectors = extract_ids(media_lsi_vectors_ext)

In [22]:
lsi_counter = 0
def lsi_similarity(user_id,media_id):
    # return cosine similarity between users and medias
    # return 0 for unknown users and medias
    global lsi_counter
    lsi_counter += 1
    if lsi_counter%300000==0:
        print lsi_counter
    if user_id in user_ids and media_id in media_ids:
        i = user_ids[user_id]
        j = media_ids[media_id]
        return float(cosine_similarity(user_lsi_vectors[i,:].reshape([1,-1]),media_lsi_vectors[j,:].reshape([1,-1])))
    return 0.

In [23]:
train_data['lsi_similarity'] = train_data[['user_id',pair_type+'_id']].apply(lambda rec : lsi_similarity(rec['user_id'],rec[pair_type+'_id']), axis=1)

300000
600000
900000
1200000
1500000
1800000
2100000
2400000
2700000
3000000
3300000
3600000
3900000
4200000
4500000
4800000
5100000
5400000
5700000
6000000
6300000
6600000
6900000
7200000
7500000


In [24]:
test_data['lsi_similarity'] = test_data[['user_id',pair_type+'_id']].apply(lambda rec : lsi_similarity(rec['user_id'],rec[pair_type+'_id']), axis=1)

In [25]:
train_data[['lsi_similarity']].to_csv(os.path.join(path,'./train_lsi_similarity.csv'))
test_data[['lsi_similarity']].to_csv(os.path.join(path,'./test_lsi_similarity.csv'))

## Combining both similarity measures

In [9]:
test_data['lsi_similarity'] = pd.read_csv(os.path.join(path,'./test_lsi_similarity.csv'), index_col=0)
train_data['lsi_similarity'] = pd.read_csv(os.path.join(path,'./train_lsi_similarity.csv'), index_col=0)
test_data['lda_similarity'] = pd.read_csv(os.path.join(path,'./test_lda_similarity.csv'), index_col=0)
train_data['lda_similarity'] = pd.read_csv(os.path.join(path,'./train_lda_similarity.csv'), index_col=0)

In [10]:
# Normalizing scores
lsi_scaler = StandardScaler().fit(train_data['lsi_similarity'].values)
train_data['lsi_similarity'] = lsi_scaler.transform(train_data['lsi_similarity'].values)
test_data['lsi_similarity'] = lsi_scaler.transform(test_data['lsi_similarity'].values)
lda_scaler = StandardScaler().fit(train_data['lda_similarity'].values)
train_data['lda_similarity'] = lda_scaler.transform(train_data['lda_similarity'].values)
test_data['lda_similarity'] = lda_scaler.transform(test_data['lda_similarity'].values)



In [11]:
X_train = train_data[['lda_similarity','lsi_similarity']].values
X_test = test_data[['lda_similarity','lsi_similarity']].values
y_train = train_data['is_listened'].values

In [12]:
clf = LogisticRegression().fit(X_train,y_train)
y_train_pred = clf.predict_proba(X_train)
y_test_pred = clf.predict_proba(X_test)

In [14]:
train_data['mixed_similarity'] = y_train_pred[:,1]
test_data['mixed_similarity'] = y_test_pred[:,1]

In [15]:
train_data[['mixed_similarity']].to_csv(os.path.join(path,'./train_mixed_similarity.csv'))
test_data[['mixed_similarity']].to_csv(os.path.join(path,'./test_mixed_similarity.csv'))

## Replacing user-media similarities of rare songs by user-genre similarities

In [18]:
train_data = pd.read_csv('data/train.csv',sep=',', usecols=['media_id','genre_id','user_id','is_listened'])
test_data = pd.read_csv('data/test.csv',sep=',', usecols=['sample_id','media_id','genre_id','user_id'], index_col=0)
train_data['genre_similarity'] = pd.read_csv('new_user_genre_features/train_mixed_similarity.csv', index_col=0)
test_data['genre_similarity'] = pd.read_csv('new_user_genre_features/test_mixed_similarity.csv', index_col=0)
train_data['media_similarity'] = pd.read_csv('new_user_media_features/train_mixed_similarity.csv', index_col=0)
test_data['media_similarity'] = pd.read_csv('new_user_media_features/test_mixed_similarity.csv', index_col=0)

In [19]:
# how many times each song was listened
media_occurences = train_data[['media_id','is_listened']].groupby('media_id').count().reset_index().rename(columns={'is_listened':'media_occurences'})
# how many times each genre was listened
genre_occurences = train_data[['genre_id','is_listened']].groupby('genre_id').count().reset_index().rename(columns={'is_listened':'genre_occurences'})

In [20]:
test_data = pd.merge(test_data, media_occurences, on='media_id', how='left')
test_data = pd.merge(test_data, genre_occurences, on='genre_id', how='left')
test_data = test_data.fillna(0)
test_data['mixed_similarity'] = test_data['media_similarity'].values

In [21]:
test_data.ix[test_data['media_occurences']<rare_thresh, 'mixed_similarity'] = test_data.ix[test_data['media_occurences']<rare_thresh, 'genre_similarity']

In [22]:
test_data[['mixed_similarity']].to_csv(os.path.join('./test_mixed_media_genre_similarity.csv'))

In [23]:
train_data = pd.merge(train_data, media_occurences, on='media_id', how='left')
train_data = pd.merge(train_data, genre_occurences, on='genre_id', how='left')
train_data = test_data.fillna(0)
train_data['mixed_similarity'] = train_data['media_similarity'].values

In [24]:
train_data.ix[train_data['media_occurences']<rare_thresh, 'mixed_similarity'] = train_data.ix[train_data['media_occurences']<rare_thresh, 'genre_similarity']

In [None]:
train_data[['mixed_similarity']].to_csv(os.path.join('./train_mixed_media_genre_similarity.csv'))