In [1]:
import pandas as pd
import numpy as np
import scipy as sc
import sklearn
from matplotlib import pyplot as plt
%matplotlib inline
from collections import OrderedDict

In [2]:
def str2list(string):
    if len(string) == 0:
        return []
    try: 
        return list(map(int, string.split(',')))
    except ValueError as e:
        print('string: ', string)
        

In [6]:
train_data = pd.read_table('coursera_sessions_train.txt', header=None, sep=';', converters={0:str2list, 1:str2list})
test_data = pd.read_table('coursera_sessions_test.txt', header=None, sep=';', converters={0:str2list, 1:str2list})

In [7]:
train_data.columns =['see', 'buy']
test_data.columns =['see', 'buy']

In [8]:
train_data.head()

Unnamed: 0,see,buy
0,"[0, 1, 2, 3, 4, 5]",[]
1,"[9, 10, 11, 9, 11, 12, 9, 11]",[]
2,"[16, 17, 18, 19, 20, 21]",[]
3,"[24, 25, 26, 27, 24]",[]
4,"[34, 35, 36, 34, 37, 35, 36, 37, 38, 39, 38, 39]",[]


In [9]:
train_data.shape

(50000, 2)

In [10]:
test_data.shape

(50000, 2)

In [11]:
test_data.head()

Unnamed: 0,see,buy
0,"[6, 7, 8]",[]
1,"[13, 14, 15]",[]
2,"[22, 23]",[]
3,"[28, 29, 30, 31, 32, 33]",[]
4,"[40, 41]",[]


In [12]:
buy_dict_train = OrderedDict()

for ind in train_data.index:
    raw = train_data.loc[ind]
            
    buy_l_values = raw.buy
    for val in buy_l_values:
        if val in buy_dict_train.keys():
            buy_dict_train[val] += 1
        else:
            buy_dict_train[val] = 1


In [13]:
buy_dict_train

OrderedDict([(67, 1),
             (60, 1),
             (63, 1),
             (86, 2),
             (199, 1),
             (303, 1),
             (352, 2),
             (519, 4),
             (603, 1),
             (604, 1),
             (602, 2),
             (599, 1),
             (605, 6),
             (606, 2),
             (600, 1),
             (690, 1),
             (688, 1),
             (851, 1),
             (879, 1),
             (1118, 3),
             (1545, 1),
             (1727, 1),
             (99, 1),
             (1907, 1),
             (1959, 1),
             (1998, 2),
             (2013, 2),
             (2019, 1),
             (2462, 1),
             (2520, 1),
             (2543, 1),
             (1526, 1),
             (2764, 1),
             (2857, 1),
             (2853, 1),
             (2852, 1),
             (2920, 1),
             (2930, 2),
             (3033, 2),
             (3026, 2),
             (3032, 1),
             (3031, 1),
             (310

In [25]:
def precision(raw, sorted_dict_train, k):
    
    see = raw.see
    see = set(raw.see)
    buy = raw.buy
    
    if len(buy) == 0:
        return 0
    
    see = [s for s in see if s in sorted_dict_train.keys()]
    if len(see) == 0:
        return 0
    
    see = sorted(see, key=lambda x: sorted_dict_train[x], reverse=True)
    recommended = see[:k]    
    bough_recommended = [b for b in buy if b in recommended]    
    return len(bough_recommended)/k

In [26]:
def recall(raw, sorted_dict_train, k):
    see = raw.see
    see = set(see)
    buy = raw.buy
    
    if len(buy) == 0:
        return 0

    see = [s for s in see if s in sorted_dict_train.keys()]
    
    if len(see) == 0:
        return 0
    see = sorted(see, key=lambda x: sorted_dict_train[x], reverse=True)
    recommended = see[:k]
    bough_recommended = [b for b in buy if b in recommended]    
    return len(bough_recommended)/len(buy)

In [27]:
precision_buy_test_1 = 0
precision_buy_test_5 = 0

recall_buy_test_1 = 0
recall_buy_test_5 = 0

num_good_session_buy_test = 0


for indx in test_data.index:
    raw = test_data.loc[indx]
    if len(raw.buy) == 0:
        continue
        
    res_prec_1 = precision(raw, buy_dict_train, 1)
    if res_prec_1 is not None:
        num_good_session_buy_test += 1
        precision_buy_test_1 += res_prec_1
        precision_buy_test_5 += precision(raw, buy_dict_train, 5)

        recall_buy_test_1 += recall(raw, buy_dict_train, 1)
        recall_buy_test_5 += recall(raw, buy_dict_train, 5)


In [28]:
print(num_good_session_buy_test)

3665


In [29]:
precision_buy_test_1 /= num_good_session_buy_test
precision_buy_test_5 /= num_good_session_buy_test

recall_buy_test_1 /= num_good_session_buy_test
recall_buy_test_5 /= num_good_session_buy_test

In [30]:
print("recall_buy_test_1\t", recall_buy_test_1)
print("precision_buy_test_1\t", precision_buy_test_1)

print("recall_buy_test_5\t", recall_buy_test_5)
print("precision_buy_test_5\t", precision_buy_test_5)

recall_buy_test_1	 0.17052645523875762
precision_buy_test_1	 0.2111869031377899
recall_buy_test_5	 0.2374604907662786
precision_buy_test_5	 0.0648294679399717


In [71]:
# with open('answr2-4.txt', 'w') as f:
#    f.write("{0} {1} {2} {3}".format(0.43, 0.54, 0.76, 0.19))