## Read in Data

In [1]:
import numpy as np
import csv

# Predict via the median number of plays.

train_file = 'train.csv'
test_file  = 'test.csv'
soln_file  = 'global_median.csv'

artist_file = "artists.csv"
profile_file = "profiles.csv"

# Load the training data.
train_data = {}
with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)
    for row in train_csv:
        user   = row[0]
        artist = row[1]
        plays  = int(row[2])
    
        if not user in train_data:
            train_data[user] = {}
        
        train_data[user][artist] = plays


artist_data = {}        
with open(artist_file, 'r') as artist_fh:
    artist_csv = csv.reader(artist_fh, delimiter=',', quotechar='"')
    next(artist_csv, None)
    for row in artist_csv:
        artist   = row[0]
        name = row[1]
          
        artist_data[artist] = name

## Convert training data to user by artist sparse matrix

In [2]:
user_dict = {user : i for i, user in enumerate(set(train_data.keys()))}
artist_dict = {artist : i for i, artist in enumerate(set(artist_data.keys()))}

In [3]:
len(user_dict)

233286

In [4]:
from scipy.sparse import csr_matrix
data, row, col = [], [] , []

for user, artists in train_data.iteritems():
    
    for artist,plays in artists.iteritems():
        row.append(user_dict[user])
        col.append(artist_dict[artist])
        data.append(plays)
        
play_sp = csr_matrix((data, (row, col)), shape=(len(user_dict), len(artist_dict)))      

In [5]:
play_sp

<233286x2000 sparse matrix of type '<type 'numpy.int64'>'
	with 4154804 stored elements in Compressed Sparse Row format>

In [6]:
user_total = play_sp.sum(axis=1)
user_total_sp=csr_matrix(user_total)
user_total_sp.shape

(233286, 1)

In [8]:
user_total_2000 = np.tile(user_total,(1,2000))
user_total_2000.shape

(233286, 2000)

In [9]:
#np.divide(play_sp, user_total_sp)

In [10]:
user_total=np.squeeze(np.asarray(user_total))
user_total_dict= dict(zip(user_dict.keys(),user_total))

In [11]:
#Truncated SVD -- reduce dimension of the play matrix 
from sklearn.decomposition import TruncatedSVD

svd=TruncatedSVD(n_components=20)
play_svd = svd.fit_transform(play_sp)
print np.sum(svd.explained_variance_ratio_)
print svd.explained_variance_ratio_

0.340944090875
[ 0.07628592  0.0390799   0.02300729  0.01844086  0.01611143  0.01477306
  0.014048    0.01367924  0.0133884   0.01222386  0.01084338  0.01093678
  0.01080966  0.01058332  0.01000298  0.00981957  0.00961562  0.00921103
  0.00914507  0.00893871]


In [12]:
play_svd_sp=csr_matrix(play_svd)

## Convert profile data to sparse matrix

In [13]:
import pandas as pd
profile_data = pd.read_csv("profiles.csv")

In [14]:
profile_data.head(10)

Unnamed: 0,user,sex,age,country
0,fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,f,25.0,Sweden
1,5909125332c108365a26ccf0ee62636eee08215c,m,29.0,Iceland
2,d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,m,30.0,United States
3,63268cce0d68127729890c1691f62d5be5abd87c,m,21.0,Germany
4,02871cd952d607ba69b64e2e107773012c708113,m,24.0,Netherlands
5,0938eb3d1b449b480c4e2431c457f6ead7063a34,m,22.0,United States
6,e4c6b36e65db3d48474dd538fe74d2dbb5a2e79e,f,,United States
7,b97479f9a563a5c43b423a976f51fd509e1ec5ba,f,,Poland
8,3bb020df0ff376dfdded4d5e63e2d35a50b3c535,m,,United States
9,f3fb86c0f024f640cae3fb479f3a27e0dd499891,,16.0,Ukraine


In [15]:
profile_data.describe()

Unnamed: 0,age
count,188444.0
mean,24.5174
std,21.853296
min,-1337.0
25%,20.0
50%,23.0
75%,27.0
max,1002.0


In [16]:
country_dict = {'Afghanistan': 'Asia',
 'Albania': 'Europe',
 'Algeria': 'Africa',
 'Andorra': 'Europe',
 'Angola': 'Africa',
 'Antigua and Barbuda': 'North America',
 'Argentina': 'South America',
 'Armenia': 'Asia',
 'Australia': 'Oceania',
 'Austria': 'Europe',
 'Azerbaijan': 'Asia',
 'Bahamas': 'North America',
 'Bahrain': 'Asia',
 'Bangladesh': 'Asia',
 'Barbados': 'North America',
 'Belarus': 'Europe',
 'Belgium': 'Europe',
 'Belize': 'North America',
 'Benin': 'Africa',
 'Bhutan': 'Asia',
 'Bolivia': 'South America',
 'Bosnia and Herzegovina': 'Europe',
 'Botswana': 'Africa',
 'Brazil': 'South America',
 'Brunei Darussalam': 'Asia',
 'Bulgaria': 'Europe',
 'Burkina Faso': 'Africa',
 'Burundi': 'Africa',
 'Cambodia': 'Asia',
 'Cameroon': 'Africa',
 'Canada': 'North America',
 'Cape Verde': 'Africa',
 'Central African Republic': 'Africa',
 'Chad': 'Africa',
 'Chile': 'South America',
 'Colombia': 'South America',
 'Comoros': 'Africa',
 'Costa Rica': 'North America',
 'Croatia': 'Europe',
 'Cuba': 'North America',
 'Cyprus': 'Asia',
 'Czech Republic': 'Europe',
 "C\xc3\xb4te d'Ivoire": 'Africa',
 'Democratic Republic of the Congo': 'Africa',
 'Denmark': 'Europe',
 'Djibouti': 'Africa',
 'Dominica': 'North America',
 'Dominican Republic': 'North America',
 'East Timor': 'Asia',
 'Ecuador': 'South America',
 'Egypt': 'Africa',
 'El Salvador': 'North America',
 'Equatorial Guinea': 'Africa',
 'Eritrea': 'Africa',
 'Estonia': 'Europe',
 'Ethiopia': 'Africa',
 'Federated States of Micronesia': 'Oceania',
 'Fiji': 'Oceania',
 'Finland': 'Europe',
 'France': 'Europe',
 'Gabon': 'Africa',
 'Georgia': 'Asia',
 'Germany': 'Europe',
 'Ghana': 'Africa',
 'Greece': 'Europe',
 'Grenada': 'North America',
 'Guatemala': 'North America',
 'Guinea': 'Africa',
 'Guinea-Bissau': 'Africa',
 'Guyana': 'South America',
 'Haiti': 'North America',
 'Honduras': 'North America',
 'Hungary': 'Europe',
 'Iceland': 'Europe',
 'India': 'Asia',
 'Indonesia': 'Asia',
 'Iran': 'Asia',
 'Iraq': 'Asia',
 'Israel': 'Asia',
 'Italy': 'Europe',
 'Jamaica': 'North America',
 'Japan': 'Asia',
 'Jordan': 'Asia',
 'Kazakhstan': 'Asia',
 'Kenya': 'Africa',
 'Kingdom of the Netherlands': 'Europe',
 'Kiribati': 'Oceania',
 'Kuwait': 'Asia',
 'Kyrgyzstan': 'Asia',
 'Laos': 'Asia',
 'Latvia': 'Europe',
 'Lebanon': 'Asia',
 'Lesotho': 'Africa',
 'Liberia': 'Africa',
 'Libya': 'Africa',
 'Liechtenstein': 'Europe',
 'Lithuania': 'Europe',
 'Luxembourg': 'Europe',
 'Macedonia': 'Europe',
 'Madagascar': 'Africa',
 'Malawi': 'Africa',
 'Malaysia': 'Asia',
 'Maldives': 'Asia',
 'Mali': 'Africa',
 'Malta': 'Europe',
 'Marshall Islands': 'Oceania',
 'Mauritania': 'Africa',
 'Mauritius': 'Africa',
 'Mexico': 'North America',
 'Moldova': 'Europe',
 'Monaco': 'Europe',
 'Mongolia': 'Asia',
 'Montenegro': 'Europe',
 'Morocco': 'Africa',
 'Mozambique': 'Africa',
 'Myanmar': 'Asia',
 'Namibia': 'Africa',
 'Nauru': 'Oceania',
 'Nepal': 'Asia',
 'New Zealand': 'Oceania',
 'Nicaragua': 'North America',
 'Niger': 'Africa',
 'Nigeria': 'Africa',
 'North Korea': 'Asia',
 'Norway': 'Europe',
 'Oman': 'Asia',
 'Pakistan': 'Asia',
 'Palau': 'Oceania',
 'Panama': 'North America',
 'Papua New Guinea': 'Oceania',
 'Paraguay': 'South America',
 "People's Republic of China": 'Asia',
 'Peru': 'South America',
 'Philippines': 'Asia',
 'Poland': 'Europe',
 'Portugal': 'Europe',
 'Qatar': 'Asia',
 'Republic of Ireland': 'Europe',
 'Republic of the Congo': 'Africa',
 'Romania': 'Europe',
 'Russia': 'Europe',
 'Rwanda': 'Africa',
 'Saint Kitts and Nevis': 'North America',
 'Saint Lucia': 'North America',
 'Saint Vincent and the Grenadines': 'North America',
 'Samoa': 'Oceania',
 'San Marino': 'Europe',
 'Saudi Arabia': 'Asia',
 'Senegal': 'Africa',
 'Serbia': 'Europe',
 'Seychelles': 'Africa',
 'Sierra Leone': 'Africa',
 'Singapore': 'Asia',
 'Slovakia': 'Europe',
 'Slovenia': 'Europe',
 'Solomon Islands': 'Oceania',
 'Somalia': 'Africa',
 'South Africa': 'Africa',
 'South Korea': 'Asia',
 'Spain': 'Europe',
 'Sri Lanka': 'Asia',
 'Sudan': 'Africa',
 'Suriname': 'South America',
 'Swaziland': 'Africa',
 'Sweden': 'Europe',
 'Switzerland': 'Europe',
 'Syria': 'Asia',
 'S\xc3\xa3o Tom\xc3\xa9 and Pr\xc3\xadncipe': 'Africa',
 'Tajikistan': 'Asia',
 'Tanzania': 'Africa',
 'Thailand': 'Asia',
 'The Gambia': 'Africa',
 'Togo': 'Africa',
 'Tonga': 'Oceania',
 'Trinidad and Tobago': 'North America',
 'Tunisia': 'Africa',
 'Turkey': 'Asia',
 'Turkmenistan': 'Asia',
 'Tuvalu': 'Oceania',
 'Uganda': 'Africa',
 'Ukraine': 'Europe',
 'United Arab Emirates': 'Asia',
 'United Kingdom': 'Europe',
 'United States': 'North America',
 'Uruguay': 'South America',
 'Uzbekistan': 'Asia',
 'Vanuatu': 'Oceania',
 'Vatican City': 'Europe',
 'Venezuela': 'South America',
 'Vietnam': 'Asia',
 'Yemen': 'Asia',
 'Zambia': 'Africa',
 'Zimbabwe': 'Africa'}

In [17]:
continent = []
for i in profile_data['country']:
    if i in country_dict.keys():
        continent.append(country_dict[i])
    else:
        continent.append(i)


In [18]:
profile_data['continent'] = continent

In [19]:
print "The number of females:" , np.sum(profile_data.sex=="f")
print "The number of males:" , np.sum(profile_data.sex=="m")
probf=np.sum(profile_data.sex=="f")/np.float((np.sum(profile_data.sex=="m")+np.sum(profile_data.sex=="f")))
print "Proportional of females:", probf

The number of females: 59391
The number of males: 154360
Proportional of females: 0.27785133169


In [20]:
profile_data.isnull().sum()

user             0
sex          19535
age          44842
country          0
continent        0
dtype: int64

In [21]:
profile_data.shape

(233286, 5)

In [22]:
profile_data['sex']=profile_data['sex'].fillna("missing")

In [23]:
def imputegender(row):
   
    if row == "missing":
        return np.random.choice([0,1],p=[1-probf,probf])
    elif row == "f":
        return 1
    elif row == "m":
        return 0
    
profile_data['female']= profile_data['sex'].apply(lambda x: imputegender(x))
profile_data.head(10)

Unnamed: 0,user,sex,age,country,continent,female
0,fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,f,25.0,Sweden,Europe,1
1,5909125332c108365a26ccf0ee62636eee08215c,m,29.0,Iceland,Europe,0
2,d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,m,30.0,United States,North America,0
3,63268cce0d68127729890c1691f62d5be5abd87c,m,21.0,Germany,Europe,0
4,02871cd952d607ba69b64e2e107773012c708113,m,24.0,Netherlands,Netherlands,0
5,0938eb3d1b449b480c4e2431c457f6ead7063a34,m,22.0,United States,North America,0
6,e4c6b36e65db3d48474dd538fe74d2dbb5a2e79e,f,,United States,North America,1
7,b97479f9a563a5c43b423a976f51fd509e1ec5ba,f,,Poland,Europe,1
8,3bb020df0ff376dfdded4d5e63e2d35a50b3c535,m,,United States,North America,0
9,f3fb86c0f024f640cae3fb479f3a27e0dd499891,missing,16.0,Ukraine,Europe,0


In [24]:
female_age = round(profile_data[profile_data.female==1].age.mean())
male_age = round(profile_data[profile_data.female==0].age.mean())

print "Females avg. age:" , round(female_age)
print "Males avg. age:" , round(male_age)
# probf=np.sum(profile_data.sex=="f")/np.float((np.sum(profile_data.sex=="m")+np.sum(profile_data.sex=="f")))
# print "Proportional of females:", probf

Females avg. age: 23.0
Males avg. age: 25.0


In [25]:

def imputeage(row):
   
    if row['age'] == "missing":
        if row['female']==1:
            return female_age
        else:
            return male_age
    else:
        return row['age']

profile_data['age']=profile_data['age'].fillna("missing")
profile_data['age2']= profile_data.apply(lambda x: imputeage(x),axis=1)
profile_data.head(10)

Unnamed: 0,user,sex,age,country,continent,female,age2
0,fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,f,25,Sweden,Europe,1,25
1,5909125332c108365a26ccf0ee62636eee08215c,m,29,Iceland,Europe,0,29
2,d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,m,30,United States,North America,0,30
3,63268cce0d68127729890c1691f62d5be5abd87c,m,21,Germany,Europe,0,21
4,02871cd952d607ba69b64e2e107773012c708113,m,24,Netherlands,Netherlands,0,24
5,0938eb3d1b449b480c4e2431c457f6ead7063a34,m,22,United States,North America,0,22
6,e4c6b36e65db3d48474dd538fe74d2dbb5a2e79e,f,missing,United States,North America,1,23
7,b97479f9a563a5c43b423a976f51fd509e1ec5ba,f,missing,Poland,Europe,1,23
8,3bb020df0ff376dfdded4d5e63e2d35a50b3c535,m,missing,United States,North America,0,25
9,f3fb86c0f024f640cae3fb479f3a27e0dd499891,missing,16,Ukraine,Europe,0,16


In [26]:
continent_dict ={}
i=0
for continent in profile_data.continent.unique():
    continent_dict[continent]=i
    i=i+1
    
profile_data['continentidx'] = profile_data['continent'].apply(lambda x: continent_dict[x])
profile_data.head()

Unnamed: 0,user,sex,age,country,continent,female,age2,continentidx
0,fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,f,25,Sweden,Europe,1,25,0
1,5909125332c108365a26ccf0ee62636eee08215c,m,29,Iceland,Europe,0,29,0
2,d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,m,30,United States,North America,0,30,1
3,63268cce0d68127729890c1691f62d5be5abd87c,m,21,Germany,Europe,0,21,0
4,02871cd952d607ba69b64e2e107773012c708113,m,24,Netherlands,Netherlands,0,24,2


In [27]:
countrydict ={}
i=0
for country in profile_data.country.unique():
    countrydict[country]=i
    i=i+1
    
profile_data['countryidx'] = profile_data['country'].apply(lambda x: countrydict[x])
profile_data.head()

Unnamed: 0,user,sex,age,country,continent,female,age2,continentidx,countryidx
0,fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,f,25,Sweden,Europe,1,25,0,0
1,5909125332c108365a26ccf0ee62636eee08215c,m,29,Iceland,Europe,0,29,0,1
2,d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,m,30,United States,North America,0,30,1,2
3,63268cce0d68127729890c1691f62d5be5abd87c,m,21,Germany,Europe,0,21,0,3
4,02871cd952d607ba69b64e2e107773012c708113,m,24,Netherlands,Netherlands,0,24,2,4


In [28]:
len(countrydict)

239

In [29]:
data, row, col = [], [] , []

for idx, dfrow in profile_data.iterrows():
    row.append(user_dict[dfrow['user']])
    #col.append(countrydict[dfrow['country']])
    col.append(continent_dict[dfrow['continent']])
    data.append(1)
        
#country_sp = csr_matrix((data, (row, col)), shape=(len(user_dict), len(countrydict)))      
continent_sp = csr_matrix((data, (row, col)), shape=(len(user_dict), len(continent_dict)))      

In [30]:
import scipy.sparse as sc
other_sp=csr_matrix(profile_data[['age2','female']].values)
#profile_sp=sc.hstack([country_sp,other_sp])
profile_sp=sc.hstack([continent_sp,other_sp])
profile_sp

<233286x75 sparse matrix of type '<type 'numpy.float64'>'
	with 531396 stored elements in COOrdinate format>

In [31]:
main_sp = sc.hstack([play_svd_sp,profile_sp,user_total_sp])

## Model fitting

In [374]:
from sklearn.cluster import KMeans
random_state=123
class_pred_100 = KMeans(n_clusters=100, random_state=random_state).fit_predict(main_sp)

## Prediction

In [164]:
data, row, col = [], [] , []

for user, artists in train_data.iteritems():
    
    for artist,plays in artists.iteritems():
        row.append(user_dict[user])
        col.append(artist_dict[artist])
        data.append(plays)
        
play_sp_col = sc.csc_matrix((data, (row, col)), shape=(len(user_dict), len(artist_dict)))      

In [235]:
data, row, col = [], [] , []

for user, artists in train_data.iteritems():
    
    for artist,plays in artists.iteritems():
        row.append(user_dict[user])
        col.append(artist_dict[artist])
        data.append(1)
        
play_sp_ind = sc.csr_matrix((data, (row, col)), shape=(len(user_dict), len(artist_dict)))      

In [264]:
artist_sum =np.squeeze(np.asarray(play_sp_col.sum(axis=0)))
artist_n = np.squeeze(np.asarray(play_sp_ind.tocsc().sum(axis=0))).astype(float)
artist_global = np.divide(artist_sum,np.squeeze(np.asarray(artist_n)))
#np.median(user_total)
#artist_prop 
artist_global

array([ 217.89846517,  326.67612524,  207.06736527, ...,  259.93282636,
        206.13963039,  173.58415842])

In [51]:
plays_array  = []
user_medians = {}
for user, user_data in train_data.iteritems():
    user_plays = []
    for artist, plays in user_data.iteritems():
        plays_array.append(plays)
        user_plays.append(plays)

    user_medians[user] = np.median(np.array(user_plays))
global_median = np.median(np.array(plays_array))

In [385]:
artist_dist = np.zeros((10,2000))
artist_mean = np.zeros((10,2000))
for i in np.arange(10):
    indices = np.where(class_pred_100==i)[0]
    slice_sp = play_sp[indices,:].tocsc()
    slice_sp_ind =play_sp_ind[indices,:].tocsc()
    artist_sum =np.squeeze(np.asarray(slice_sp.sum(axis=0)))
    artist_n = np.squeeze(np.asarray(slice_sp_ind.sum(axis=0))).astype(float)
    artist_mean[i] = np.divide(artist_sum,np.squeeze(np.asarray(artist_n)))
    artist_dist[i] = artist_sum/np.float(np.sum(artist_sum))

In [386]:
user_class={}
users=user_dict.keys()
for idx,c in enumerate(class_pred):
    user_class[users[idx]]=c

In [32]:
#
test_data =pd.read_csv('test.csv')

In [276]:
def computeplay(row):
    c = user_class[row['user']]
    total = user_total_dict[row['user']]
    a=artist_dict[row['artist']]
    return artist_mean[c,a]

test_data['plays'] = test_data.apply(lambda r:computeplay(r),axis=1)

In [288]:
user_total_dict['fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d']

1723

In [387]:
def computeplay2(row):
    c = user_class[row['user']]
    total = user_total_dict[row['user']]
    prob = artist_dist[c,a]
    return total*prob

test_data['plays'] = test_data.apply(lambda r:computeplay2(r),axis=1)

In [379]:
test_data.head()

Unnamed: 0,Id,user,artist,plays,play
0,1,306e19cce2522fa2d39ff5dfc870992100ec22d2,4ac4e32b-bd18-402e-adad-ae00e72f8d85,4.760336,0
1,2,9450d351278df4938bdea4ed86aec940a4e927ac,1f574ab1-a46d-4586-9331-f0ded23e0411,25.873998,0
2,3,801909d6955f59033c88595d3d7f8a6a5dcd53cc,3eb72791-6322-466b-87d3-24d74901eb2d,9.117049,0
3,4,e3ed47445c127fbeff47fb58f6bbf2f3b4535d82,61604b45-8a91-4e33-a1b6-45d7b1fec4e5,0.792873,261
4,5,a73f46652103f3a5f7429159310f6928f79644aa,5dfdca28-9ddc-4853-933c-8bc97d87beec,4.136711,147


In [388]:
test_data[['Id','plays']].to_csv('k-means_100.csv',index=False)

In [373]:
train_df = pd.read_csv("train.csv")
train_df['plays2'] = train_df.apply(lambda r:computeplay2(r),axis=1)

In [381]:
diff = train_df['plays2'].values - train_df['plays'].values
np.sum(np.absolute(diff))

1007584291.0903144

In [380]:
train_df['play2_100'] = train_df.apply(lambda r:computeplay2(r),axis=1)

In [384]:
diff = train_df['play2_100'].values - train_df['plays'].values
np.sum(np.absolute(diff))

1019015325.8450176

In [389]:
train_df['play_100'] = train_df.apply(lambda r:computeplay2(r),axis=1)
diff = train_df['play_100'].values - train_df['plays'].values
np.sum(np.absolute(diff))

1028051122.8202177

In [214]:

# Write out test solutions.
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh,
                              delimiter=',',
                              quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])

        for row in test_csv:
            id     = row[0]
            user   = row[1]
            artist = row[2]

            if user in user_medians:
                c = user_class[user]
                total = user_total_dict[user]
                a=artist_dict[artist]
                prob = artist_dist[c,a]
                soln_csv.writerow([id, total*prob])
            else:
                print "User", id, "not in training data."
                soln_csv.writerow([id, global_median])
                


## Based on similarity 

In [33]:
play_sp_t = play_sp.transpose()
artist_similarity = play_sp_t.dot(play_sp).toarray()
artist_similarity

array([[114837328,     13860,    158391, ...,         0,    104093,
           185979],
       [    13860, 402207029,    355887, ...,     28851,   1150422,
           762187],
       [   158391,    355887, 108129153, ...,     81120,    924809,
           844173],
       ..., 
       [        0,     28851,     81120, ..., 285648317,     47400,
            53330],
       [   104093,   1150422,    924809, ...,     47400,  70696840,
           128657],
       [   185979,    762187,    844173, ...,     53330,    128657,
        280860028]])

In [34]:
# squared magnitude of preference vectors (number of occurrences)
square_mag = np.diag(artist_similarity)

# inverse squared magnitude
inv_square_mag = 1 / square_mag.astype(float)

# if it doesn't occur, set it's inverse magnitude to zero (instead of inf)
inv_square_mag[np.isinf(inv_square_mag)] = 0

# inverse of the magnitude
inv_mag = np.sqrt(inv_square_mag)

# cosine similarity (elementwise multiply by inverse magnitudes)
cosine = artist_similarity * inv_mag
cosine = cosine.T * inv_mag

In [35]:
artist_similarity_df = pd.DataFrame(cosine, columns=(artist_dict.keys()), index=(artist_dict.keys()))
artist_similarity_df.head()

Unnamed: 0,03098741-08b3-4dd7-b3f6-1b0bfa2c879c,69c4cc43-8163-41c5-ac81-30946d27bb69,7a2e6b55-f149-4e74-be6a-30a1b1a387bb,7002bf88-1269-4965-a772-4ba1e7a91eaa,dbf7c761-e332-467b-b4d9-aafe06bbcf8f,a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432,8b0f05ce-354e-4121-9e0b-8b4732ea844f,8363f94f-fd86-41b8-a56b-26eacb34f499,2e41ae9c-afd2-4f20-8f1e-17281ce9b472,c17f08f4-2542-46fb-97f3-3202d60c225a,...,8067c102-4996-42bc-9980-06ce2e644eae,ac9a487a-d9d2-4f27-bb23-0f4686488345,69b39eab-6577-46a4-a9f5-817839092033,45be8e20-3c3c-4e78-98f0-b5bdecf2f703,e0953daa-860f-4dc8-9f1a-b12587cdaf17,5a8688d2-696f-4060-9dd0-f772886ff95b,82a5b152-ee60-4447-939a-dd5a91cd7c38,8974da95-e631-45aa-8fd7-aa0c2795f997,ab7ebf8c-059f-4071-93b1-dd3ae80d60b2,39c2a93d-9afa-4a22-9bba-c087ab056e1c
03098741-08b3-4dd7-b3f6-1b0bfa2c879c,1.0,6.4e-05,0.001421,1e-06,1e-05,0.000173,1.1e-05,0.0,0.0,0.000602,...,0.0,0.001706,0.000685,0.001466,0.013526,0.0,0.004038,0.0,0.001155,0.001036
69c4cc43-8163-41c5-ac81-30946d27bb69,6.4e-05,1.0,0.001707,0.0,0.000337,0.001222,0.000176,0.0,0.000175,0.000604,...,0.000154,0.006361,0.000747,0.001111,0.000425,1.291785e-07,0.000424,8.5e-05,0.006822,0.002268
7a2e6b55-f149-4e74-be6a-30a1b1a387bb,0.001421,0.001707,1.0,0.000377,0.000577,0.000534,0.0,3.8e-05,0.000107,0.000355,...,0.0,0.000358,0.005412,0.005083,0.002435,0.0,0.001725,0.000462,0.010577,0.004844
7002bf88-1269-4965-a772-4ba1e7a91eaa,1e-06,0.0,0.000377,1.0,5.3e-05,0.005327,0.001776,2.6e-05,7.8e-05,0.0,...,0.0,5e-05,9.3e-05,0.001525,0.003673,0.0002847327,0.001545,5.3e-05,0.000144,0.000628
dbf7c761-e332-467b-b4d9-aafe06bbcf8f,1e-05,0.000337,0.000577,5.3e-05,1.0,0.003229,0.000305,0.0,0.000207,0.0,...,5.4e-05,0.000287,0.00082,0.005902,0.000305,0.0,0.001361,0.0,0.002912,0.001757


In [36]:
artist_similarity_df.index.is_unique # check if there is no repeated artists

True

In [70]:
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=30)
neigh.fit(artist_similarity_df) # Fit the data

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=30, p=2, radius=1.0)

In [71]:
model = pd.DataFrame(neigh.kneighbors(artist_similarity_df, return_distance=False))
model.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0,265,1371,652,1744,1259,1030,1144,660,1201,...,390,269,947,1152,260,1130,767,1167,569,1488
1,1,1570,1910,1532,1942,1066,1810,989,1206,654,...,1149,514,1192,33,1860,1246,1128,155,1544,1989
2,2,1213,37,676,764,301,1761,745,635,1190,...,17,1082,1766,209,1180,1694,1921,158,770,432
3,3,1544,1232,666,133,1927,1284,844,1107,105,...,1228,519,217,250,1299,1586,796,161,1805,747
4,4,1926,532,112,1769,1190,1929,1251,406,287,...,951,666,164,460,547,809,1324,808,1942,732


In [72]:
final_model = pd.DataFrame(artist_similarity_df.columns[model], index=artist_similarity_df.index)#give name with respect to artist
final_model.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
03098741-08b3-4dd7-b3f6-1b0bfa2c879c,03098741-08b3-4dd7-b3f6-1b0bfa2c879c,a4f54716-24d9-4c6b-8ba7-1d7e7a5173d0,cfbc0924-0035-4d6c-8197-f024653af823,fc61dd75-880b-44ba-9ba9-c7b643d33413,55821c25-bd47-4f67-baf8-17ccd2cf1c86,4c730f5d-a27c-49fc-b505-23c2c61a6840,ac1dd50d-9f23-4c9a-8fc7-f2d6b03a5c5d,bbd80354-597e-4d53-94e4-92b3a7cb8f2c,6aa40207-fec8-43a7-991d-b872a42def05,8e05a404-3f8d-4b0a-9fc2-b7ab821b75f0,...,c4caa34e-57b5-432b-ad9d-7e6543944d96,321531fc-db73-4ffa-a959-61a61a2908c1,f75150f0-d80d-4c34-903b-6acfb47b63ee,847e8284-8582-4b0e-9c26-b042a4f49e57,1946a82a-f927-40c2-8235-38d64f50d043,265f242e-cf4e-4fbe-a3fe-43112387172f,a16371b9-7d36-497a-a9d4-42b0a0440c5e,a3a92047-be1c-4f3e-8960-c4f8570984df,183105b5-3e68-4748-9086-2c1c11bf7a3d,bbc5b66b-d037-4f26-aecf-0b129e7f876a
69c4cc43-8163-41c5-ac81-30946d27bb69,69c4cc43-8163-41c5-ac81-30946d27bb69,c7732cf8-e9a1-46f1-9133-84d930f3187a,ef4db186-ff43-4708-a713-3ce1e05657a1,3630fff3-52fc-4e97-ab01-d68fd88e4135,9b21f670-8359-4e11-be1d-bf75b649a719,592a3b6d-c42b-4567-99c9-ecf63bd66499,cbcbb22c-3a8d-46af-b4ba-09c98f0d7931,149326c2-b8a3-48e3-b3d2-9b5b9593127f,b5da400c-9a62-4686-b6fe-91518e57ce5d,03172286-f7ed-4864-a4db-459cd5ca9790,...,5c2717ab-3b1e-4043-8874-e88ecf391cc9,ed114045-8462-4b49-89ae-527fa2544a2b,1b15e90d-910c-4be4-99cb-463772a6430f,80b3cf5e-18fe-4c59-98c7-e5bb87210710,a47c3aa2-7d87-475c-a2c7-1e2047dafb09,9854d99f-d954-4c0f-9ae5-58a0cdd885f5,f82f3a3e-29c2-42ca-b589-bc5dc210fa9e,f0e820ab-f31a-400d-9c5e-8be1c8c38726,5fee3020-513b-48c2-b1f7-4681b01db0c6,349fd7a7-183b-4139-a4ee-1d9b7146f8a1
7a2e6b55-f149-4e74-be6a-30a1b1a387bb,7a2e6b55-f149-4e74-be6a-30a1b1a387bb,8eed05a5-e9a1-4dda-8b33-e354c4ecc8b6,4cbbf0d6-6c78-4e3d-ab25-91b13603cf7c,2339bc21-aa92-4850-86f0-4bb9433910c8,38c7a3b2-17fb-4166-b964-4e445d69efdd,4f8b7186-b2a2-40db-97ae-6e1cd46d57b1,1c5efd53-d6b6-4d63-9d22-a15025cf5f07,19516266-e5d9-4774-b749-812bb76a6559,7928481f-848e-4551-b658-472c0aaf0c85,6e0c7c0e-cba5-4c2c-a652-38f71ef5785d,...,756cf672-d4ae-4470-a3af-a43d776a211d,328d146c-79f1-4eb6-9e40-8ee5710c14e5,8f7bf43e-2a13-431a-a274-19e28965fe07,85226286-349a-4046-8dcf-f75b32baa6b1,d3b2711f-2baa-441a-be95-14945ca7e6ea,2f569e60-0a1b-4fb9-95a4-3dc1525d1aad,0cdb0359-5698-487d-9aae-a25fb4dcdc4d,0741b30d-e15b-4a8c-b2e5-8834a03d6116,d8661c02-f423-4d72-8044-40ff05daf7a1,78f797e3-4913-4026-aad0-1cd858bd735b
7002bf88-1269-4965-a772-4ba1e7a91eaa,7002bf88-1269-4965-a772-4ba1e7a91eaa,5fee3020-513b-48c2-b1f7-4681b01db0c6,d700b3f5-45af-4d02-95ed-57d301bda93e,fa927f59-d443-418a-b741-e557208aaf09,6cbe1e63-5895-4168-ac7e-f0d2836ba0c1,061c4920-3ea6-4835-98f6-02f3b82f5e3a,020bfbb4-05c3-4c86-b372-17825c262094,95e1ead9-4d31-4808-a7ac-32c3614c116b,298909e4-ebcb-47b8-95e9-cc53b087fc65,b9472588-93f3-4922-a1a2-74082cdf9ce8,...,c3477250-bc5f-44e9-aa0a-144dd2d7a935,9047fd95-770f-4a8a-a6e3-3b071a882e4d,d3b2bec4-b70e-460e-b433-a865ceac2de8,a4a3048f-3968-4848-9f53-94e3d4f88b53,a10f94a9-2a3c-4166-b9fa-a7bed0bf7d96,d2a79d20-1304-47fd-a998-b4fe1ec78373,23b58ea2-1cc3-468a-9e76-5027e196e851,b88ca659-0393-4a62-abd8-f290e6c7a7e2,1eeaf46d-4991-4e95-8068-927627ccfc18,6aa55ee1-4c46-4f7c-9a6d-5051dfe2c06d
dbf7c761-e332-467b-b4d9-aafe06bbcf8f,dbf7c761-e332-467b-b4d9-aafe06bbcf8f,7113aab7-628f-4050-ae49-dbecac110ca8,04cd0cfd-bfd1-4c36-bc38-95c35e2c045f,d4ad0149-d8ae-4105-8009-0221fce9ff35,215c6ab2-7888-4061-bd56-9fb650328106,6e0c7c0e-cba5-4c2c-a652-38f71ef5785d,e01646f2-2a04-450d-8bf2-0d993082e058,9beb62b2-88db-4cea-801e-162cd344ee53,8925a01a-8608-411d-a90b-f3a52d061208,42edf21b-2dda-4382-aa6b-49b6483bcf73,...,aaf09f31-bb5c-43e5-9f54-bb6554c33a71,fa927f59-d443-418a-b741-e557208aaf09,23228f18-01d5-493e-94ce-cfcde82a8db2,cbfb9bcd-c5a0-4d7c-865f-2c641c171e1c,906bddec-bc73-49f8-ac1e-eaee691c6cf9,477b8c0c-c5fc-4ad2-b5b2-191f0bf2a9df,ff460a70-fdf4-4aa2-b021-8a04da76d88e,7f36c2ba-a338-402f-861f-390229bfae04,9b21f670-8359-4e11-be1d-bf75b649a719,f81f19b9-c76e-43ac-8656-bb56071785fb


In [42]:
top100 = final_model[list(final_model.columns[:101])]
top100.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
03098741-08b3-4dd7-b3f6-1b0bfa2c879c,03098741-08b3-4dd7-b3f6-1b0bfa2c879c,a4f54716-24d9-4c6b-8ba7-1d7e7a5173d0,cfbc0924-0035-4d6c-8197-f024653af823,fc61dd75-880b-44ba-9ba9-c7b643d33413,55821c25-bd47-4f67-baf8-17ccd2cf1c86,4c730f5d-a27c-49fc-b505-23c2c61a6840,ac1dd50d-9f23-4c9a-8fc7-f2d6b03a5c5d,bbd80354-597e-4d53-94e4-92b3a7cb8f2c,6aa40207-fec8-43a7-991d-b872a42def05,8e05a404-3f8d-4b0a-9fc2-b7ab821b75f0,...,e5add03e-93c4-4f5a-9ffd-ec93f24d8c5e,f3bf61f8-97d4-4e52-a73d-2ddbbe8196c8,2e04a82f-9118-43b1-a918-bf84f122f7fc,aaae25bf-c794-45d7-bab7-cdc7c655bfa6,337e30b6-3dbe-44f2-a3fd-84a80abdb5a1,f97b55a8-7d03-4ebe-a277-a9cadc2a9f86,7f2a8098-50e3-451c-8696-c5d9f64f4762,3a9f3479-4f0c-4355-8ccd-c9ed5cd01586,50698127-f6f4-4c0c-8010-db98c21100e1,fc7376fe-1a6f-4414-b4a7-83f50ed59c92
69c4cc43-8163-41c5-ac81-30946d27bb69,69c4cc43-8163-41c5-ac81-30946d27bb69,c7732cf8-e9a1-46f1-9133-84d930f3187a,ef4db186-ff43-4708-a713-3ce1e05657a1,3630fff3-52fc-4e97-ab01-d68fd88e4135,9b21f670-8359-4e11-be1d-bf75b649a719,592a3b6d-c42b-4567-99c9-ecf63bd66499,cbcbb22c-3a8d-46af-b4ba-09c98f0d7931,149326c2-b8a3-48e3-b3d2-9b5b9593127f,b5da400c-9a62-4686-b6fe-91518e57ce5d,03172286-f7ed-4864-a4db-459cd5ca9790,...,a342964d-ca53-4e54-96dc-e8501851e77f,057aa66c-cf6e-499d-bda8-5adc47ad4197,509f20b2-5df3-4aec-9bbc-002131fb3f99,c2f28620-9372-4af4-b961-cc4b9e44d60d,90cc2464-234e-4da0-b39b-576f36e633bc,401c3991-b76b-499d-8082-9f2df958ef78,c3477250-bc5f-44e9-aa0a-144dd2d7a935,ab7ebf8c-059f-4071-93b1-dd3ae80d60b2,ad79836d-9849-44df-8789-180bbc823f3c,0a77bec1-12ef-4caa-b36a-f533001fcd29
7a2e6b55-f149-4e74-be6a-30a1b1a387bb,7a2e6b55-f149-4e74-be6a-30a1b1a387bb,8eed05a5-e9a1-4dda-8b33-e354c4ecc8b6,4cbbf0d6-6c78-4e3d-ab25-91b13603cf7c,2339bc21-aa92-4850-86f0-4bb9433910c8,38c7a3b2-17fb-4166-b964-4e445d69efdd,4f8b7186-b2a2-40db-97ae-6e1cd46d57b1,1c5efd53-d6b6-4d63-9d22-a15025cf5f07,19516266-e5d9-4774-b749-812bb76a6559,7928481f-848e-4551-b658-472c0aaf0c85,6e0c7c0e-cba5-4c2c-a652-38f71ef5785d,...,843384a8-2fb5-4d21-91d6-3f4b545fca7f,7fbfcd25-9ce2-4ef4-9270-e971ea61fb4a,fa927f59-d443-418a-b741-e557208aaf09,ca39d50f-9885-420e-88d6-9c3f64038773,6c8fd0be-d961-454c-aee0-4366a6dbc993,9d1ebcfe-4c15-4d18-95d3-d919898638a1,d6bd72bc-b1e2-4525-92aa-0f853cbb41bf,e9e2a634-984f-4c10-bf7b-7970179e1ef1,2ba31cc4-df9e-4c21-9dbc-bb8fa9af424f,1eeaf46d-4991-4e95-8068-927627ccfc18
7002bf88-1269-4965-a772-4ba1e7a91eaa,7002bf88-1269-4965-a772-4ba1e7a91eaa,5fee3020-513b-48c2-b1f7-4681b01db0c6,d700b3f5-45af-4d02-95ed-57d301bda93e,fa927f59-d443-418a-b741-e557208aaf09,6cbe1e63-5895-4168-ac7e-f0d2836ba0c1,061c4920-3ea6-4835-98f6-02f3b82f5e3a,020bfbb4-05c3-4c86-b372-17825c262094,95e1ead9-4d31-4808-a7ac-32c3614c116b,298909e4-ebcb-47b8-95e9-cc53b087fc65,b9472588-93f3-4922-a1a2-74082cdf9ce8,...,cc0b7089-c08d-4c10-b6b0-873582c17fd6,f3b8e107-abe8-4743-b6a3-4a4ee995e71f,42a8f507-8412-4611-854f-926571049fa0,d5b637f0-74b8-4df6-bddf-dc3784a6881a,a5ee1ebe-a645-45d2-8319-d101fe62e581,0efe858c-89e5-4e47-906a-356fa953fd6e,3d2b98e5-556f-4451-a3ff-c50ea18d57cb,8262d8e4-9137-4bb3-a787-3caabbbc13e9,2ba31cc4-df9e-4c21-9dbc-bb8fa9af424f,adc0f033-95c2-4e0b-87bc-c23ed3f26ce6
dbf7c761-e332-467b-b4d9-aafe06bbcf8f,dbf7c761-e332-467b-b4d9-aafe06bbcf8f,7113aab7-628f-4050-ae49-dbecac110ca8,04cd0cfd-bfd1-4c36-bc38-95c35e2c045f,d4ad0149-d8ae-4105-8009-0221fce9ff35,215c6ab2-7888-4061-bd56-9fb650328106,6e0c7c0e-cba5-4c2c-a652-38f71ef5785d,e01646f2-2a04-450d-8bf2-0d993082e058,9beb62b2-88db-4cea-801e-162cd344ee53,8925a01a-8608-411d-a90b-f3a52d061208,42edf21b-2dda-4382-aa6b-49b6483bcf73,...,a9100753-f539-43cf-bcc9-579566fb512e,779353f3-6401-4cda-a8a2-6fd3ec9bc11b,1d11e2a1-4531-4d61-a8c7-7b5c6a608fd2,9854d99f-d954-4c0f-9ae5-58a0cdd885f5,44a7f7d5-79df-4f44-8f5e-02dfc0626904,e940d7a3-01d0-468c-86ea-5dc4d89dcf80,42c14c80-bd17-47e4-9bb2-a35897638c4d,5ecc3f72-20a6-47a0-8dc5-fb0b3dadeea0,5f6ab597-f57a-40da-be9e-adad48708203,b6b2bb8d-54a9-491f-9607-7b546023b433


In [73]:
def findNNPlays(user,artist):
    play=0
    i=0
    artistNN = final_model.loc[artist]
    user_artist = train_data[user].keys()
    while play == 0 and i<30:
        
        if artistNN[i] in user_artist:
            play=train_data[user][artistNN[i]]
            
        else:
            i +=1
    return play       

In [74]:
findNNPlays('eb1c57ddc9e0e2d005169d3a1a96e8dd95e3af03','5a8e07d5-d932-4484-a7f7-e700793a9c94')

554

In [75]:
test_data['play'] = test_data.apply(lambda r: findNNPlays(r['user'],r['artist']),axis=1)

In [76]:
test_data.head()

Unnamed: 0,Id,user,artist,play,plays
0,1,306e19cce2522fa2d39ff5dfc870992100ec22d2,4ac4e32b-bd18-402e-adad-ae00e72f8d85,0,72.5
1,2,9450d351278df4938bdea4ed86aec940a4e927ac,1f574ab1-a46d-4586-9331-f0ded23e0411,0,143.5
2,3,801909d6955f59033c88595d3d7f8a6a5dcd53cc,3eb72791-6322-466b-87d3-24d74901eb2d,0,207.0
3,4,e3ed47445c127fbeff47fb58f6bbf2f3b4535d82,61604b45-8a91-4e33-a1b6-45d7b1fec4e5,0,261.0
4,5,a73f46652103f3a5f7429159310f6928f79644aa,5dfdca28-9ddc-4853-933c-8bc97d87beec,0,147.0


In [77]:
def replacezero(row):
    if row['play']==0:
        return user_medians[row['user']]
    else:
        return row['play']

test_data['plays'] = test_data.apply(lambda r: replacezero(r),axis=1)

In [78]:
test_data[['Id','plays']].to_csv('nn30_2.csv',index=False)

In [64]:
a =[1,2]

b=[3,4,2]
a in b

False

## standardized the # of plays

In [72]:
user_total_df = train_data.groupby('user').sum()
user_total=user_total_df.to_dict()

In [65]:
#user_total =user_total.reset_index()
user_total['plays']['f283c15ed4180e686384dc1de2a5cbf5f95ae269']
train_data['proportion'] = train_data.apply(lambda x: x.plays/np.float(user_total['plays'][x.user]),axis=1)

In [66]:
train_data.head()

Unnamed: 0,user,artist,plays,proportion
0,eb1c57ddc9e0e2d005169d3a1a96e8dd95e3af03,5a8e07d5-d932-4484-a7f7-e700793a9c94,554,0.034378
1,44ce793a6cd9d20f13f4a576a818ef983314bb5d,a3a92047-be1c-4f3e-8960-c4f8570984df,81,0.142857
2,da9cf3f557161d54b76f24db64be9cc76db008e3,eeb1195b-f213-4ce1-b28c-8565211f8e43,708,0.152883
3,8fa49ab25d425edcf05d44bfc1d5aea895287d81,a1419808-65d3-4d40-998c-1a0bac65eabc,265,0.044329
4,b85fcaef67d2669cd99b334b5e8c8705263db2cf,a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432,220,0.141753


In [52]:
user_total.shape

(233286, 1)

In [90]:
user_df=user_total_df.reset_index()
user_df=user_df.rename(columns = {'plays':'total_plays'})
user_df.drop('proportion', axis=1, inplace=True)

tmp_data = pd.merge(train_data,profile_data,on='user')
tmp_data2 = pd.merge(tmp_data, user_df,on='user')
main_data = pd.merge(tmp_data2,artist_data,on='artist')


In [91]:
#main_data['gender'] = [1 if main_data['sex'] == "f" else 0]
main_data.head()

Unnamed: 0,user,artist,plays,proportion,sex,age,country,female,countryidx,total_plays,name
0,eb1c57ddc9e0e2d005169d3a1a96e8dd95e3af03,5a8e07d5-d932-4484-a7f7-e700793a9c94,554,0.034378,m,25,Sweden,0,0,16115,Robyn
1,0ff4166398f035b5fcb8824cc16c8daeb4643911,5a8e07d5-d932-4484-a7f7-e700793a9c94,169,0.082359,f,18,United Kingdom,1,16,2052,Robyn
2,b3f9fa56429c3b7fd348c471452e65747ba9ed50,5a8e07d5-d932-4484-a7f7-e700793a9c94,292,0.009833,m,23,United Kingdom,0,16,29697,Robyn
3,0ffff52af79555e8fe72289c429b2fdfc8ea684b,5a8e07d5-d932-4484-a7f7-e700793a9c94,92,0.012273,m,26,Germany,0,3,7496,Robyn
4,985253be0dc82ffa15a0ad006d0284aa4b7d1e3d,5a8e07d5-d932-4484-a7f7-e700793a9c94,159,0.011976,m,19,Sweden,0,0,13276,Robyn


In [98]:
main_data.to_csv("maindf.csv")

In [29]:
country_df = main_data[['countryidx']]
y_df= main_data[['plays'],['proportion'],['total_plays']]
y_set = y_df.as_matrix()
gender_set = main_data[['female']].as_matrix()

In [92]:
y_prob = main_data[['proportion']].as_matrix()

In [30]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
cindex = np.matrix(np.arange(239))

enc.fit(cindex.T)
country_set= enc.transform(country_df).toarray()


In [32]:
train_set = np.hstack((country_set,gender_set))

In [93]:
random_state= 123
from sklearn.cross_validation import train_test_split
itrain, itest = train_test_split(xrange(train_set.shape[0]), train_size=0.1,random_state=random_state)
mask=np.ones(train_set.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)

X_train = train_set[mask]
Y_train = y_prob[mask]

In [34]:
from sklearn.decomposition import PCA

pca=PCA(n_components=10)
pca.fit_transform(X_train)
pca.explained_variance_ratio_

array([ 0.18251138,  0.15761538,  0.08477288,  0.06648769,  0.04249111,
        0.03720076,  0.03357104,  0.03013427,  0.02924813,  0.02724646])

In [46]:
np.sum(pca.explained_variance_ratio_)

0.69127910657610625

In [35]:
pca_train=pca.fit_transform(X_train)

In [99]:
from sklearn.cluster import KMeans
class_pred = KMeans(n_clusters=10, random_state=random_state).fit_predict(pca_train)


In [102]:
class_1 = (class_pred==0)
class_2 = Y_train[class_pred ==1]
class_3 = Y_train[class_pred ==2 ]

In [101]:
print "class 1:", np.mean(class_1),np.std(class_1)
print "variance:", np.mean(class_2),np.std(class_2)
print "variance:" ,np.mean(class_3),np.std(class_3)

class 1: 0.0615278466692 0.0596720147168
variance: 0.0567964537502 0.0567446462991
variance: 0.0570431303064 0.0554216098786


In [103]:
train=main_data[mask]

In [171]:
def classtoplay(class_k, df,artist_list):
    cluster = df[mask]
    artist_count=dict(zip(artist_list,np.zeros(len(artist_list))))
    artist_prop = dict(zip(artist_list,np.zeros(len(artist_list))))
    artist_dist = dict(zip(artist_list,np.zeros(len(artist_list))))
    
    artist_include =set([])
    for index,row in cluster.iterrows():
        artist_count[row['name']] += 1
        artist_prop[row['name']] += row['proportion']
        artist_include.add(row['name'])

    total_prob=0    
    for artist in artist_include:
        artist_dist[artist] = artist_prop[artist] / np.float(artist_count[artist])
        total_prob += artist_dist[artist]
    
    for artist in artist_include:
         artist_dist[artist] = artist_dist[artist] /total_prob
    
    
    return artist_dist


In [172]:
type(train.name.values)

numpy.ndarray

In [173]:
artist_count=dict(zip(artist_list,np.zeros(len(artist_list))))

In [174]:
artist_count2=dict(zip(artist_list,np.ones(len(artist_list))))
np.array(artist_count2.values()) - np.array(artist_count.values())

array([ 1.,  1.,  1., ...,  1.,  1.,  1.])

In [175]:
classtoplay(class_1, main_data,artist_list)

{nan: 0.00043636253507237061,
 'Gigi D\xe2\x80\x99Agostino': 0.00047896986283453926,
 'Queens of the Stone Age': 0.00053126002468739826,
 'Deerhunter': 0.00057538479803613985,
 'Neil Young & Crazy Horse': 0.00034528834046350647,
 'Bo Kaspers orkester': 0.00061915781175633712,
 'Sondre Lerche': 0.00048899477243557587,
 'Nirvana': 0.0004662697884213433,
 'Massive Attack': 0.00051038305185167031,
 'Poison the Well': 0.00040238027215657254,
 'Billie the Vision & The Dancers': 0.00060414350072927924,
 'Goldfrapp': 0.00045957785517809736,
 'Bullet for My Valentine': 0.0005436647612305478,
 'De-Phazz': 0.0006008797410586808,
 'Gustavo Santaolalla': 0.00055428884492492473,
 'Sublime': 0.00054196582687029049,
 'a-ha': 0.00051473231011576494,
 'Burzum': 0.00053135686190069987,
 'Billy Bragg': 0.0004653850036450044,
 'The Crystal Method': 0.00051843233547380627,
 'PMMP': 0.00066879174267239511,
 'Twisted Sister': 0.00040722047240733137,
 'Astor Piazzolla': 0.00049078198095452272,
 'Max\xc3\xafmo 

In [None]:
for index,row in train.head(3).iterrows():
    print row.name

In [169]:
artist_include =set([])
artist_include.add('a')

In [170]:
artist_include

{'a'}

In [216]:
result = pd.read_csv('global_median.csv')

In [224]:

result['plays'] = [np.float(i[3:-2]) for i in result['plays']]

In [230]:
result.to_csv('k-means.csv',index=False)

In [228]:
np.sum(result['plays']==0)


2227294

Unnamed: 0,Id,plays
0,1,0.0
1,2,0.0
2,3,0.0
3,4,0.053362
4,5,0.0


In [56]:
233286*2000

466572000

In [57]:
len(train_data)

233286

In [60]:
4154804/466572000.0

0.008904957862880756