In [1]:
import numpy as np
import csv
import time
from sklearn.neighbors import NearestNeighbors

In [2]:
st = time.time()

train_file = 'train.csv'
test_file  = 'test.csv'
soln_file  = 'user_medianNN20.csv'
profiles_file = 'profiles.csv'
artists_file = 'artists.csv'

In [3]:
# Load list of users
users = []
with open(profiles_file, 'r') as profiles_fh:
    profiles_csv = csv.reader(profiles_fh, delimiter=',', quotechar='"')
    next(profiles_csv, None)
    for row in profiles_csv:
        user = row[0]
        users.append(user)

In [4]:
# Load list of artists
artists = []
with open(artists_file, 'r') as profiles_fh:
    artists_csv = csv.reader(profiles_fh, delimiter=',', quotechar='"')
    next(artists_csv, None)
    for row in artists_csv:
        artist = row[0]
        artists.append(artist)


In [5]:
# Load the training data.
train_data = {}
with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)
    for row in train_csv:
        user   = row[0]
        artist = row[1]
        plays  = row[2]

        if not user in train_data:
            train_data[user] = {}

        train_data[user][artist] = int(plays)

end = time.time()
print 'Time to load %d' % (end -st)

Time to load 36


In [6]:
# Compute the global median and per-user median.
plays_array  = []
user_medians = {}
for user, user_data in train_data.iteritems():
    user_plays = []
    for artist, plays in user_data.iteritems():
        plays_array.append(plays)
        user_plays.append(plays)

    user_medians[user] = np.median(np.array(user_plays))
global_median = np.median(np.array(plays_array))

In [7]:
def get_user_plays(user, artist):
    if user in train_data:
        for a, p in train_data[user].iteritems():
            if artist == a:
                return p
    return 0.0

In [8]:
# compute X
X = np.zeros((len(users), len(artist)))

for i in range(len(users)):
    for j in range(len(artist)):
        X[i][j] = get_user_plays(users[i], artists[j])
    if i % 10000 == 0:
        print i

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000


In [10]:
print 'Running NearestNeighbors.fit ... '
st = time.time()
neigh = NearestNeighbors(n_neighbors=20)
neigh.fit(X)
end = time.time()

print 'Time to compute fit KNN %d' % (end -st)

Running NearestNeighbors.fit ... 
Time to compute fit KNN 503


In [11]:
def nearest_neighbors(user):
    return neigh.kneighbors(user, return_distance=False)


def user_index(user):
    try:
        return users.index(user)
    except ValueError:
        return None

In [13]:
# returns recommendation for a particular user as mean recommendation of its closest neighbors
def get_recommendations(user):
    user_id = user_index(user)
    if user_id is None:
        return global_median

    x = X[user_id]
    neighbors = nearest_neighbors(x)[0]
    if neighbors is not None:
        # recco = 0.
        reccos = []
        for neighbor in neighbors:
            if users[neighbor] in user_medians:
                r = user_medians[users[neighbor]]
                reccos.append(r)
                # recco += r
            else:
                reccos.append(global_median)
                # recco += global_median
        # final = recco / len(neighbors)
        final = np.median(np.array(reccos))
        return final

    return global_median

In [14]:
line = 0
# Write out test solutions.
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh,
                              delimiter=',',
                              quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])

        for row in test_csv:
            line += 1
            if line % 1000 == 0:
                print line, row

            id     = row[0]
            user   = row[1]
            artist = row[2]

            if user in user_medians:
                soln_csv.writerow([id, get_recommendations(user)])
                # soln_csv.writerow([id, topMatches(train_data, user, artist)])
                # soln_csv.writerow([id, user_medians[user]])
                # print getRecommendations(train_data, user)[0]
            else:
                soln_csv.writerow([id, global_median])

1000 ['1000', '230404d6297b84b390b982473ead2fb06b2297ea', '2c1828a2-e46b-4ade-a6e9-065ca265d679']
2000 ['2000', 'cc0083cc31d8e82889c353ba960c841e1e1b9711', '3caf69c1-fdd8-44bc-b0a5-8af3ad8a47cd']
3000 ['3000', 'f47ec03fca0651b38a60793c93e7cca0a267c96a', '9a58fda3-f4ed-4080-a3a5-f457aac9fcdd']
4000 ['4000', '9ddec22b7ce2e88f48170f30d32c7b6d93108754', '78e46ae5-9bfd-433b-be3f-19e993d67ecc']
5000 ['5000', 'c73e3d1bc128fb8380858e185adeb2978bd8408a', 'f96de591-5dae-41da-855b-eb387d88d7d4']
6000 ['6000', '27b97869f66866ffa2a8671534a5fbbaff40247f', 'e938a15c-b17e-4e7a-9f68-ff0d536cab44']
7000 ['7000', '8d1128586e127d244782666ef05b1bfb2348fbe9', 'ac2d1c91-3667-46aa-9fe7-170ca7fce9e2']
8000 ['8000', '6aff085e3027ae8f64a4dd0818f1f98a001f50a3', 'af37c51c-0790-4a29-b995-456f98a6b8c9']
9000 ['9000', '561f338040dea5b2b1a727efe9459c0b154e455d', '4e0dffde-ad2d-45b7-9c75-d57ce55de061']
10000 ['10000', 'd76f9eb0cdfbade3376a1ed0d8b304448c9bd00c', 'daa09819-5da5-4c7a-8bef-eb372bb27ff1']
11000 ['11000', 'b

KeyboardInterrupt: 