In [2]:
import numpy as np
import pandas as pd

import re

## Load data

In [2]:
# Upper bound on the number of lines parsed from each daily log file
limit = 10_000_000

### Load data into dictionaries

In [3]:
# Column-oriented log of user x item display events: one entry per parsed line
# {user: feature string, item_id:, click:, file: zero-padded day, timestamp:}
data_dict = {'user': [],
             'item_id': [],
             'click': [],
             'file': [],
             'timestamp': []}

# Features of items, keyed by item id (the first occurrence of an item wins)
items_feat = {}

# Ids of files: numeric suffixes 1..10 of the log file names
ids = np.arange(1, 11)

# Keep track of the distinct users, items and timestamps seen
items = set()
users = set()
timestamps = set()

# Raw strings: '\d' and '\|' inside a plain string literal are invalid escape
# sequences and trigger a warning on recent Python versions.
digits_pattern = re.compile(r'\d+')
user_pattern = re.compile(r'\|user [ .:\d]+\|')

for day in ids:
    ## Open data file
    file_id = f'{int(day):02d}'         # zero-padded suffix, e.g. '01'
    filename = f'ydata-fp-td-clicks-v1_0.200905{file_id}'

    # The context manager closes the file even if a line fails to parse
    with open(f'./{filename}', 'r') as log_file:
        print(filename)

        ## Go through each line (count is the 1-based line number)
        for count, line in enumerate(log_file, start=1):
            line = line.strip()

            ## The first three digit runs are: timestamp, item id, click.
            ## finditer is lazy, so the rest of the line is not scanned here.
            digit_runs = digits_pattern.finditer(line)
            timestamp = int(next(digit_runs).group())
            item_id = int(next(digit_runs).group())
            click = int(next(digit_runs).group())

            # set.add is already a no-op for known members
            timestamps.add(timestamp)
            items.add(item_id)

            ## Get user features: drop the leading '|user ' and trailing ' |'.
            ## The remaining feature string is used as the user identifier.
            user = user_pattern.search(line).group()[6:-2]
            users.add(user)

            ## Get item features, only the first time an item is seen
            if item_id not in items_feat:
                item_block = re.search(rf'\|{item_id} [ .:\d]+\|*', line).group()

                # Drop the leading '|<item_id> ' and the last two characters.
                # NOTE(review): the trailing '|' is optional in the pattern; if
                # the matched block has no ' |' suffix (e.g. last block of the
                # stripped line) the -2 slice trims the final value instead.
                raw_feat = item_block[len(str(item_id)) + 2:-2]

                # Each token looks like '<k>:<value>'; the single-character
                # index and the colon are dropped, so values are stored by
                # position in the line, not by feature index.
                items_feat[item_id] = np.array(
                    [float(token[2:]) for token in raw_feat.split(' ')]
                )

            ## Add to res
            data_dict['user'].append(user)
            data_dict['item_id'].append(item_id)
            data_dict['click'].append(click)
            data_dict['file'].append(file_id)
            data_dict['timestamp'].append(timestamp)

            if count == limit:
                break

ydata-fp-td-clicks-v1_0.20090501
ydata-fp-td-clicks-v1_0.20090502
ydata-fp-td-clicks-v1_0.20090503
ydata-fp-td-clicks-v1_0.20090504
ydata-fp-td-clicks-v1_0.20090505
ydata-fp-td-clicks-v1_0.20090506
ydata-fp-td-clicks-v1_0.20090507
ydata-fp-td-clicks-v1_0.20090508
ydata-fp-td-clicks-v1_0.20090509
ydata-fp-td-clicks-v1_0.20090510


In [4]:
# Distinct items / users collected while parsing the logs
n_items, n_users = len(items), len(users)
print(f'Number of items: {n_items}')
print(f'Number of users: {n_users}')

Number of items: 271
Number of users: 29849370


### Transform data to dataframes

In [5]:
# One row per parsed log line; the dictionary keys become the columns
data_df = pd.DataFrame(data_dict)
data_df

Unnamed: 0,user,item_id,click,file,timestamp
0,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109513,0,01,1241160900
1,2:0.088932 3:0.003915 4:0.592269 5:0.314084 6:...,109484,0,01,1241160900
2,2:0.000293 3:0.000002 4:0.000411 5:0.000770 6:...,109495,0,01,1241160900
3,2:0.011889 3:0.863387 4:0.000385 5:0.124338 6:...,109494,0,01,1241160900
4,2:0.000054 3:0.000002 4:0.000022 5:0.000087 6:...,109492,0,01,1241160900
...,...,...,...,...,...
45811878,2:0.002133 3:0.000129 4:0.000469 5:0.003353 6:...,109762,0,10,1242024900
45811879,2:0.000085 3:0.000001 4:0.000070 5:0.000083 6:...,109776,0,10,1242024900
45811880,2:0.215681 3:0.000101 4:0.577916 5:0.002193 6:...,109730,0,10,1242024900
45811881,2:0.267385 3:0.436222 4:0.009811 5:0.283917 6:...,109765,0,10,1242024900


In [6]:
# Cross-check the frame against the feature dictionary built during parsing
n_items_df = data_df['item_id'].nunique(dropna=False)
n_users_df = data_df['user'].nunique(dropna=False)
print(f"Number of items: {n_items_df}||{len(items_feat)}")
print(f"Number of users: {n_users_df}")

Number of items: 271||271
Number of users: 29849370


In [7]:
# Build the item feature table: one row per item id (the dictionary keys become
# the index), one column per position in the parsed feature array.
# NOTE(review): the arrays are not guaranteed to have the same length; shorter
# rows presumably end up padded with NaN — confirm on the resulting frame.
items_df = pd.DataFrame.from_dict(items_feat, orient='index')
items_df

Unnamed: 0,0,1,2,3,4,5
109513,0.211406,0.000036,0.002773,0.569886,0.215900,1.0
109484,0.438513,0.000003,0.030714,0.384494,0.146277,1.0
109495,0.313277,0.000125,0.018413,0.410555,0.257630,1.0
109494,0.306008,0.000450,0.077048,0.230439,0.386055,1.0
109492,0.331830,0.000022,0.019904,0.440390,0.207855,1.0
...,...,...,...,...,...,...
109782,0.003265,0.000000,0.990059,0.000604,0.006072,1.0
109783,0.306008,0.000450,0.077048,0.230439,0.386055,1.0
109730,0.248121,0.000050,0.399913,0.133608,0.218308,1.0
109784,0.362520,0.000004,0.062754,0.315328,0.259394,1.0


### Save to csv

In [8]:
# Persist both tables (index included) so later sessions can skip the parsing
for frame, path in ((data_df, './data.csv'), (items_df, './items.csv')):
    frame.to_csv(path)

In [1]:
import numpy as np
import pandas as pd


# Reload the tables written by the parsing step; the first CSV column holds
# the saved index (comma is read_csv's default separator).
data_df = pd.read_csv('./data.csv', index_col=0)
items_df = pd.read_csv('./items.csv', index_col=0)

## Process data

In [2]:
# Work on a de-duplicated copy; data_df itself is left untouched.
# NOTE(review): rows are compared on every column (user, item_id, click, file,
# timestamp), so repeated identical events sharing a timestamp collapse into
# one row before the click ratios are computed — confirm this is intended.
data_df_clean = data_df.drop_duplicates(keep='first')

In [3]:
# Only user, item and click are needed from here on
data_df_clean = data_df_clean.drop(columns=['file', 'timestamp'])
data_df_clean

Unnamed: 0,user,item_id,click
0,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109513,0
1,2:0.088932 3:0.003915 4:0.592269 5:0.314084 6:...,109484,0
2,2:0.000293 3:0.000002 4:0.000411 5:0.000770 6:...,109495,0
3,2:0.011889 3:0.863387 4:0.000385 5:0.124338 6:...,109494,0
4,2:0.000054 3:0.000002 4:0.000022 5:0.000087 6:...,109492,0
...,...,...,...
45811878,2:0.002133 3:0.000129 4:0.000469 5:0.003353 6:...,109762,0
45811879,2:0.000085 3:0.000001 4:0.000070 5:0.000083 6:...,109776,0
45811880,2:0.215681 3:0.000101 4:0.577916 5:0.002193 6:...,109730,0
45811881,2:0.267385 3:0.436222 4:0.009811 5:0.283917 6:...,109765,0


### Count how many interactions a user had

In [4]:
# Number of remaining log rows per user, as a one-column frame
count_interactions = (
    data_df_clean
    .groupby(['user'])
    .size()
    .to_frame(name='counts')
)
count_interactions

Unnamed: 0_level_0,counts
user,Unnamed: 1_level_1
2:0.000000 3:0.000000 4:0.000000 5:0.000000 6:0.999999 1:1.000000,163
2:0.000000 3:0.000000 4:0.000000 5:0.000000 6:1.000000 1:1.000000,43
2:0.000000 3:0.000000 4:0.000000 5:0.000001 6:0.999998 1:1.000000,76
2:0.000000 3:0.000000 4:0.000000 5:0.000001 6:0.999999 1:1.000000,109
2:0.000000 3:0.000000 4:0.000000 5:0.000002 6:0.999997 1:1.000000,24
...,...
2:0.977913 3:0.006624 4:0.011898 5:0.001936 6:0.001629 1:1.000000,1
2:0.978056 3:0.010799 4:0.008819 5:0.000829 6:0.001497 1:1.000000,2
2:0.979268 3:0.007808 4:0.006347 5:0.001979 6:0.004597 1:1.000000,1
2:0.979373 3:0.009098 4:0.010301 5:0.000604 6:0.000625 1:1.000000,1


In [5]:
# (min, max, median, mean) of the per-user row counts
counts = count_interactions['counts']
np.min(counts), np.max(counts), np.median(counts), np.mean(counts)

(1, 86437, 1.0, 1.3951068984035508)

### Compute ratio of clicks given a user and item

In [6]:
# Mean of the 0/1 click flag per (user, item) pair, i.e. the click ratio
ratio = data_df_clean.groupby(['user', 'item_id']).mean()
ratio

Unnamed: 0_level_0,Unnamed: 1_level_0,click
user,item_id,Unnamed: 2_level_1
2:0.000000 3:0.000000 4:0.000000 5:0.000000 6:0.999999 1:1.000000,109476,0.0
2:0.000000 3:0.000000 4:0.000000 5:0.000000 6:0.999999 1:1.000000,109484,0.0
2:0.000000 3:0.000000 4:0.000000 5:0.000000 6:0.999999 1:1.000000,109492,0.0
2:0.000000 3:0.000000 4:0.000000 5:0.000000 6:0.999999 1:1.000000,109494,0.0
2:0.000000 3:0.000000 4:0.000000 5:0.000000 6:0.999999 1:1.000000,109495,0.0
...,...,...
2:0.978056 3:0.010799 4:0.008819 5:0.000829 6:0.001497 1:1.000000,109771,0.0
2:0.978056 3:0.010799 4:0.008819 5:0.000829 6:0.001497 1:1.000000,109778,0.0
2:0.979268 3:0.007808 4:0.006347 5:0.001979 6:0.004597 1:1.000000,109518,0.0
2:0.979373 3:0.009098 4:0.010301 5:0.000604 6:0.000625 1:1.000000,109747,0.0


In [7]:
# Keep a single row per (user, item) pair; the click ratio is merged back next
data_df_clean = data_df_clean.drop(columns='click').drop_duplicates()
data_df_clean

Unnamed: 0,user,item_id
0,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109513
1,2:0.088932 3:0.003915 4:0.592269 5:0.314084 6:...,109484
2,2:0.000293 3:0.000002 4:0.000411 5:0.000770 6:...,109495
3,2:0.011889 3:0.863387 4:0.000385 5:0.124338 6:...,109494
4,2:0.000054 3:0.000002 4:0.000022 5:0.000087 6:...,109492
...,...,...
45811877,2:0.001175 3:0.001832 4:0.000004 5:0.000618 6:...,109783
45811879,2:0.000085 3:0.000001 4:0.000070 5:0.000083 6:...,109776
45811880,2:0.215681 3:0.000101 4:0.577916 5:0.002193 6:...,109730
45811881,2:0.267385 3:0.436222 4:0.009811 5:0.283917 6:...,109765


In [8]:
# Attach the click ratio; `ratio` is indexed by (user, item_id), and the merge
# keys refer to those index levels.
data_df_clean = data_df_clean.merge(ratio, on=['item_id', 'user'])
data_df_clean


Unnamed: 0,user,item_id,click
0,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109513,0.0
1,2:0.088932 3:0.003915 4:0.592269 5:0.314084 6:...,109484,0.0
2,2:0.000293 3:0.000002 4:0.000411 5:0.000770 6:...,109495,0.0
3,2:0.011889 3:0.863387 4:0.000385 5:0.124338 6:...,109494,0.0
4,2:0.000054 3:0.000002 4:0.000022 5:0.000087 6:...,109492,0.0
...,...,...,...
36540343,2:0.001175 3:0.001832 4:0.000004 5:0.000618 6:...,109783,0.0
36540344,2:0.000085 3:0.000001 4:0.000070 5:0.000083 6:...,109776,0.0
36540345,2:0.215681 3:0.000101 4:0.577916 5:0.002193 6:...,109730,0.0
36540346,2:0.267385 3:0.436222 4:0.009811 5:0.283917 6:...,109765,0.0


### Keep users that interacted with at least X items

In [9]:
# Number of distinct items per user (data_df_clean holds one row per pair)
number_interactions = (
    data_df_clean
    .groupby('user')
    .size()
    .reset_index(name='number_interactions_user')
)
number_interactions

Unnamed: 0,user,number_interactions_user
0,2:0.000000 3:0.000000 4:0.000000 5:0.000000 6:...,120
1,2:0.000000 3:0.000000 4:0.000000 5:0.000000 6:...,37
2,2:0.000000 3:0.000000 4:0.000000 5:0.000001 6:...,63
3,2:0.000000 3:0.000000 4:0.000000 5:0.000001 6:...,80
4,2:0.000000 3:0.000000 4:0.000000 5:0.000002 6:...,20
...,...,...
29849365,2:0.977913 3:0.006624 4:0.011898 5:0.001936 6:...,1
29849366,2:0.978056 3:0.010799 4:0.008819 5:0.000829 6:...,2
29849367,2:0.979268 3:0.007808 4:0.006347 5:0.001979 6:...,1
29849368,2:0.979373 3:0.009098 4:0.010301 5:0.000604 6:...,1


In [13]:
# Items with at least one missing feature value
has_missing_feature = items_df.isna().any(axis=1)
items_df[has_missing_feature]

Unnamed: 0,0,1,2,3,4,5
109528,1.0,,,,,


In [12]:
# Total number of missing cells in the interaction table (isna == isnull)
data_df_clean.isna().sum().sum()

0

In [14]:
# Minimum number of distinct items a user must have interacted with
limit_interactions = 50
enough_interactions = number_interactions['number_interactions_user'] >= limit_interactions
number_interactions2 = number_interactions[enough_interactions]
len(number_interactions2)

5788

In [15]:
# Inner merge keeps only the users that passed the interaction threshold;
# the helper count column is not needed afterwards.
data_df_clean2 = (
    data_df_clean
    .merge(number_interactions2, on='user')
    .drop(columns='number_interactions_user')
)
data_df_clean2

Unnamed: 0,user,item_id,click
0,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109513,0.000000
1,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109484,0.000000
2,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109494,0.000000
3,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109509,0.035714
4,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109495,0.153846
...,...,...,...
670730,2:0.438591 3:0.001834 4:0.335858 5:0.027709 6:...,109777,0.000000
670731,2:0.438591 3:0.001834 4:0.335858 5:0.027709 6:...,109775,0.000000
670732,2:0.438591 3:0.001834 4:0.335858 5:0.027709 6:...,109785,0.000000
670733,2:0.438591 3:0.001834 4:0.335858 5:0.027709 6:...,109730,0.000000


### Keep users that clicked on at least 5 articles
To remove users that never clicked on anything (which would lead to a null user vector when fitting the linear regression)

In [16]:
# Per user, count the items whose click ratio is non-zero
clicks = (
    data_df_clean2
    .assign(clicked=data_df_clean2['click'] != 0)
    .groupby('user')['clicked']
    .sum()
    .reset_index(name='number_clicks')
)

# Users with at least 5 clicked items
clicks2 = clicks[clicks['number_clicks'] >= 5]
clicks2

Unnamed: 0,user,number_clicks
239,2:0.000012 3:0.000000 4:0.000005 5:0.000039 6:...,12
240,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,66
315,2:0.000025 3:0.000001 4:0.000015 5:0.000086 6:...,32
323,2:0.000031 3:0.000001 4:0.000016 5:0.000113 6:...,7
330,2:0.000033 3:0.000001 4:0.000009 5:0.000080 6:...,5
...,...,...
5772,2:0.829399 3:0.095866 4:0.009002 5:0.058202 6:...,15
5780,2:0.845572 3:0.065666 4:0.057261 5:0.030409 6:...,13
5781,2:0.847071 3:0.038975 4:0.081922 5:0.030781 6:...,29
5782,2:0.847079 3:0.043321 4:0.068452 5:0.038907 6:...,20


In [17]:
# Inner merge keeps only the users present in clicks2
data_df_clean2 = data_df_clean2.merge(clicks2, on='user')
data_df_clean2


Unnamed: 0,user,item_id,click,number_clicks
0,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109513,0.000000,66
1,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109484,0.000000,66
2,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109494,0.000000,66
3,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109509,0.035714,66
4,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109495,0.153846,66
...,...,...,...,...
289047,2:0.728608 3:0.124481 4:0.021385 5:0.123906 6:...,109781,0.000000,5
289048,2:0.728608 3:0.124481 4:0.021385 5:0.123906 6:...,109780,0.000000,5
289049,2:0.728608 3:0.124481 4:0.021385 5:0.123906 6:...,109779,0.000000,5
289050,2:0.728608 3:0.124481 4:0.021385 5:0.123906 6:...,109762,0.000000,5


In [18]:
# The click count was only needed for filtering
data_df_clean2 = data_df_clean2.drop(columns='number_clicks')
data_df_clean2

Unnamed: 0,user,item_id,click
0,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109513,0.000000
1,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109484,0.000000
2,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109494,0.000000
3,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109509,0.035714
4,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109495,0.153846
...,...,...,...
289047,2:0.728608 3:0.124481 4:0.021385 5:0.123906 6:...,109781,0.000000
289048,2:0.728608 3:0.124481 4:0.021385 5:0.123906 6:...,109780,0.000000
289049,2:0.728608 3:0.124481 4:0.021385 5:0.123906 6:...,109779,0.000000
289050,2:0.728608 3:0.124481 4:0.021385 5:0.123906 6:...,109762,0.000000


## Create items vectors

In [19]:
# Display the item feature table (index: item id, columns: feature positions)
items_df

Unnamed: 0,0,1,2,3,4,5
109513,0.211406,0.000036,0.002773,0.569886,0.215900,1.0
109484,0.438513,0.000003,0.030714,0.384494,0.146277,1.0
109495,0.313277,0.000125,0.018413,0.410555,0.257630,1.0
109494,0.306008,0.000450,0.077048,0.230439,0.386055,1.0
109492,0.331830,0.000022,0.019904,0.440390,0.207855,1.0
...,...,...,...,...,...,...
109782,0.003265,0.000000,0.990059,0.000604,0.006072,1.0
109783,0.306008,0.000450,0.077048,0.230439,0.386055,1.0
109730,0.248121,0.000050,0.399913,0.133608,0.218308,1.0
109784,0.362520,0.000004,0.062754,0.315328,0.259394,1.0


### Use given features

In [20]:
# Drop column '5' (1.0 in every row displayed above) and replace missing
# feature values by 0. Column labels are strings after the CSV round trip.
# NOTE(review): item 109528 carries a single value, stored in column '0', so
# it becomes the vector (1, 0, 0, 0, 0) — confirm this is the intended encoding.
items_df = items_df.drop(columns='5').fillna(0)
items_df

Unnamed: 0,0,1,2,3,4
109513,0.211406,0.000036,0.002773,0.569886,0.215900
109484,0.438513,0.000003,0.030714,0.384494,0.146277
109495,0.313277,0.000125,0.018413,0.410555,0.257630
109494,0.306008,0.000450,0.077048,0.230439,0.386055
109492,0.331830,0.000022,0.019904,0.440390,0.207855
...,...,...,...,...,...
109782,0.003265,0.000000,0.990059,0.000604,0.006072
109783,0.306008,0.000450,0.077048,0.230439,0.386055
109730,0.248121,0.000050,0.399913,0.133608,0.218308
109784,0.362520,0.000004,0.062754,0.315328,0.259394


In [21]:
# Append the item features to every (user, item) row; items_df is indexed by
# item id and 'left' is join's default.
users_x_items = data_df_clean2.join(other=items_df, on='item_id', how='left')
users_x_items

Unnamed: 0,user,item_id,click,0,1,2,3,4
0,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109513,0.000000,0.211406,0.000036,0.002773,0.569886,0.215900
1,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109484,0.000000,0.438513,0.000003,0.030714,0.384494,0.146277
2,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109494,0.000000,0.306008,0.000450,0.077048,0.230439,0.386055
3,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109509,0.035714,0.306008,0.000450,0.077048,0.230439,0.386055
4,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109495,0.153846,0.313277,0.000125,0.018413,0.410555,0.257630
...,...,...,...,...,...,...,...,...
289047,2:0.728608 3:0.124481 4:0.021385 5:0.123906 6:...,109781,0.000000,0.306008,0.000450,0.077048,0.230439,0.386055
289048,2:0.728608 3:0.124481 4:0.021385 5:0.123906 6:...,109780,0.000000,0.214605,0.000037,0.410493,0.097704,0.277162
289049,2:0.728608 3:0.124481 4:0.021385 5:0.123906 6:...,109779,0.000000,0.301198,0.000030,0.019206,0.440773,0.238793
289050,2:0.728608 3:0.124481 4:0.021385 5:0.123906 6:...,109762,0.000000,0.360635,0.000071,0.065585,0.267309,0.306400


In [24]:
# Total number of missing cells after the join (isna == isnull)
users_x_items.isna().sum().sum()

0

In [25]:
from sklearn.preprocessing import MultiLabelBinarizer

# Extend every row with a multi-hot encoding of the users that interacted with
# the row's item: one 0/1 column per user, identical for all rows of an item.

## For each item, the list of users appearing in a row with that item
users_lists = users_x_items.groupby('item_id')['user'].apply(list)

## Attach that list to every row of the item. Both sides have a 'user' column,
## so the merge names them 'user_x' (row's own user) and 'user_y' (the list).
users_x_items = pd.merge(users_x_items, users_lists, on='item_id')

## Binarize the user lists: one column per distinct user.
## Order matters below: pop() removes 'user_y' from users_x_items in place while
## the arguments are evaluated (before join runs), and mlb.classes_ is read
## only after fit_transform has been called.
mlb = MultiLabelBinarizer()
users_x_items = users_x_items.join(pd.DataFrame(mlb.fit_transform(users_x_items.pop('user_y')),
                                            columns=mlb.classes_,
                                            index=users_x_items.index))
# Restore the original name of the row's own user column
users_x_items.rename(columns={'user_x':'user'}, inplace=True)
users_x_items

Unnamed: 0,user,item_id,click,0,1,2,3,4,2:0.000012 3:0.000000 4:0.000005 5:0.000039 6:0.999943 1:1.000000,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:0.999958 1:1.000000,...,2:0.816844 3:0.043163 4:0.087708 5:0.050985 6:0.001300 1:1.000000,2:0.819267 3:0.041668 4:0.104004 5:0.034123 6:0.000938 1:1.000000,2:0.821564 3:0.046502 4:0.099740 5:0.030487 6:0.001707 1:1.000000,2:0.826532 3:0.074782 4:0.057998 5:0.039527 6:0.001161 1:1.000000,2:0.828952 3:0.048181 4:0.084698 5:0.036654 6:0.001515 1:1.000000,2:0.829399 3:0.095866 4:0.009002 5:0.058202 6:0.007532 1:1.000000,2:0.845572 3:0.065666 4:0.057261 5:0.030409 6:0.001093 1:1.000000,2:0.847071 3:0.038975 4:0.081922 5:0.030781 6:0.001250 1:1.000000,2:0.847079 3:0.043321 4:0.068452 5:0.038907 6:0.002241 1:1.000000,2:0.850135 3:0.034733 4:0.094029 5:0.020790 6:0.000313 1:1.000000
0,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:...,109513,0.000000,0.211406,0.000036,0.002773,0.569886,0.215900,1,1,...,0,1,1,0,0,1,1,1,1,1
1,2:0.000069 3:0.000002 4:0.000009 5:0.000043 6:...,109513,0.040984,0.211406,0.000036,0.002773,0.569886,0.215900,1,1,...,0,1,1,0,0,1,1,1,1,1
2,2:0.010609 3:0.000910 4:0.008778 5:0.055681 6:...,109513,0.000000,0.211406,0.000036,0.002773,0.569886,0.215900,1,1,...,0,1,1,0,0,1,1,1,1,1
3,2:0.519539 3:0.062237 4:0.058665 5:0.353326 6:...,109513,0.095238,0.211406,0.000036,0.002773,0.569886,0.215900,1,1,...,0,1,1,0,0,1,1,1,1,1
4,2:0.009571 3:0.000864 4:0.001399 5:0.009514 6:...,109513,0.000000,0.211406,0.000036,0.002773,0.569886,0.215900,1,1,...,0,1,1,0,0,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289047,2:0.304964 3:0.043302 4:0.119951 5:0.528214 6:...,109781,0.000000,0.306008,0.000450,0.077048,0.230439,0.386055,0,0,...,0,0,1,1,0,0,0,1,1,1
289048,2:0.288554 3:0.016062 4:0.241110 5:0.449063 6:...,109781,0.000000,0.306008,0.000450,0.077048,0.230439,0.386055,0,0,...,0,0,1,1,0,0,0,1,1,1
289049,2:0.114025 3:0.859769 4:0.000329 5:0.025691 6:...,109781,1.000000,0.306008,0.000450,0.077048,0.230439,0.386055,0,0,...,0,0,1,1,0,0,0,1,1,1
289050,2:0.176319 3:0.219882 4:0.009174 5:0.574887 6:...,109781,0.000000,0.306008,0.000450,0.077048,0.230439,0.386055,0,0,...,0,0,1,1,0,0,0,1,1,1


In [26]:
# Order rows from highest to lowest click ratio, then persist the table
users_x_items = users_x_items.sort_values(by='click', ascending=False)
users_x_items.to_csv('./users_x_items.csv')

In [1]:
import pandas as pd 

# Reload the saved table; the first CSV column holds the saved index
# (comma is read_csv's default separator).
users_x_items = pd.read_csv('./users_x_items.csv', index_col=0)
users_x_items

Unnamed: 0,user,item_id,click,0,1,2,3,4,2:0.000012 3:0.000000 4:0.000005 5:0.000039 6:0.999943 1:1.000000,2:0.000012 3:0.000000 4:0.000006 5:0.000023 6:0.999958 1:1.000000,...,2:0.816844 3:0.043163 4:0.087708 5:0.050985 6:0.001300 1:1.000000,2:0.819267 3:0.041668 4:0.104004 5:0.034123 6:0.000938 1:1.000000,2:0.821564 3:0.046502 4:0.099740 5:0.030487 6:0.001707 1:1.000000,2:0.826532 3:0.074782 4:0.057998 5:0.039527 6:0.001161 1:1.000000,2:0.828952 3:0.048181 4:0.084698 5:0.036654 6:0.001515 1:1.000000,2:0.829399 3:0.095866 4:0.009002 5:0.058202 6:0.007532 1:1.000000,2:0.845572 3:0.065666 4:0.057261 5:0.030409 6:0.001093 1:1.000000,2:0.847071 3:0.038975 4:0.081922 5:0.030781 6:0.001250 1:1.000000,2:0.847079 3:0.043321 4:0.068452 5:0.038907 6:0.002241 1:1.000000,2:0.850135 3:0.034733 4:0.094029 5:0.020790 6:0.000313 1:1.000000
8283,2:0.476853 3:0.035348 4:0.115827 5:0.369916 6:...,109505,1.0,0.375829,0.000025,0.033041,0.349637,0.241468,0,1,...,0,0,1,0,0,0,1,0,0,0
127694,2:0.551496 3:0.330723 4:0.004091 5:0.112697 6:...,109621,1.0,0.409581,0.000001,0.064845,0.368259,0.157314,1,1,...,1,0,1,1,0,0,1,1,0,1
31362,2:0.138834 3:0.803961 4:0.000437 5:0.056016 6:...,109532,1.0,0.346043,0.000003,0.141508,0.260906,0.251541,1,1,...,1,0,1,1,1,1,1,1,0,0
66031,2:0.153315 3:0.034755 4:0.107953 5:0.703485 6:...,109561,1.0,0.037081,0.000006,0.820149,0.016605,0.126160,1,1,...,1,1,1,1,0,1,1,1,1,1
51492,2:0.270165 3:0.101276 4:0.041466 5:0.586364 6:...,109551,1.0,0.116434,0.000000,0.611344,0.077418,0.194803,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105515,2:0.250580 3:0.374573 4:0.008084 5:0.362743 6:...,109598,0.0,0.306008,0.000450,0.077048,0.230439,0.386055,1,1,...,1,0,1,1,1,1,1,1,1,1
105516,2:0.157135 3:0.001763 4:0.759254 5:0.081748 6:...,109598,0.0,0.306008,0.000450,0.077048,0.230439,0.386055,1,1,...,1,0,1,1,1,1,1,1,1,1
105517,2:0.517802 3:0.010548 4:0.359300 5:0.111741 6:...,109598,0.0,0.306008,0.000450,0.077048,0.230439,0.386055,1,1,...,1,0,1,1,1,1,1,1,1,1
105518,2:0.095760 3:0.689493 4:0.001684 5:0.212865 6:...,109598,0.0,0.306008,0.000450,0.077048,0.230439,0.386055,1,1,...,1,0,1,1,1,1,1,1,1,1


In [2]:
# Total number of missing cells in the reloaded table (isna == isnull)
users_x_items.isna().sum().sum()

0

In [3]:
import numpy as np 
from sklearn.linear_model import LinearRegression

def _normalize_rows(features):
    """Return `features` as a float array with every row scaled to unit L2 norm.

    All-zero rows are returned unchanged instead of being turned into NaN by a
    division by zero.
    """
    matrix = np.array(features, dtype=float)
    norms = np.linalg.norm(matrix, 2, axis=1)
    norms[norms == 0] = 1.0
    return matrix / norms[:, None]


def load_yahoo(users_x_items, seed=12, nb_items_lr=50):
    """Build a linear problem (user vector + item catalog) from one random user.

    Parameters
    ----------
    users_x_items : pandas.DataFrame
        One row per (user, item) pair. Must contain the columns 'user',
        'item_id' and 'click'; every other column is used as an item feature.
    seed : int, default 12
        Seed of the ``numpy.random.RandomState`` used to draw the user.
    nb_items_lr : int, default 50
        Maximum number of the user's rows, taken in frame order, used to fit
        the regression. The items of those rows are excluded from the catalog.

    Returns
    -------
    theta_user : numpy.ndarray
        Coefficients of a linear regression without intercept of 'click' on
        the L2-normalised features of the selected rows.
    items_catalog : numpy.ndarray
        L2-normalised feature rows of the items not used for the fit
        (at most `size_catalog` rows).
    size_catalog : int
        Number of distinct item rows minus `nb_items_lr`.
    dim : int
        Length of `theta_user`.
    """
    # One row per distinct combination of item id and feature values
    items_vectors = users_x_items.drop(['click', 'user'], axis=1).drop_duplicates()
    nb_items_total = len(items_vectors)     # total nb items

    ### Choose a user at random
    rng = np.random.RandomState(seed)
    user_id = rng.choice(users_x_items['user'].unique())

    ### Rows of that user used for the regression (first nb_items_lr rows)
    user_rows = users_x_items[users_x_items['user'] == user_id].iloc[:nb_items_lr]
    items_rated_ids = user_rows['item_id'].values
    items_ratings = user_rows['click'].values
    items_rated = _normalize_rows(user_rows.drop(['item_id', 'user', 'click'], axis=1))

    ### Compute linear regression to estimate theta
    reg = LinearRegression(fit_intercept=False).fit(items_rated, items_ratings)
    theta_user = reg.coef_

    ### Catalog: items that were not used to fit theta
    size_catalog = nb_items_total - nb_items_lr
    unseen_items = items_vectors[~items_vectors['item_id'].isin(items_rated_ids)]
    items_catalog = _normalize_rows(unseen_items[:size_catalog].drop('item_id', axis=1))

    return theta_user, items_catalog, size_catalog, theta_user.shape[0]

In [4]:
# Smoke test over 50 seeds: print the sum of the fitted user vector and flag
# the seeds for which that sum is exactly zero
for seed in range(50):
    theta_user = load_yahoo(users_x_items, seed=seed)[0]
    print(sum(theta_user))
    if np.sum(theta_user) == 0:
        print('ici:', seed)

184 184
2.836422163318442
226 226
3.769203933927299
251 251
0.38297105381043545
268 268
4.2693168244272055
215 215
2.398363586126975
260 260
3.0591546812489114
209 209
1.4744894794698462
125 125
3.030751174717599
81 81
4.820072595052667
156 156
1.1654835733924185
271 271
10.4861793830228
192 192
3.1852853086624116
271 271
4.303437667292969
181 181
7.425622322517587
162 162
0.6743136166213033
263 263
1.2180815339426538
209 209
0.8488957894797969
227 227
4.030444755560264
231 231
1.1439090395283114
119 119
1.9687248322486066
271 271
7.581274096037416
172 172
3.4692776538497627
208 208
2.384506791514773
92 92
2.3908060679727465
106 106
5.081894152706544
243 243
1.0354062814220673
82 82
6.2592675963451505
182 182
5.8606733811273966
233 233
1.275929271680615
265 265
3.897700586559153
85 85
0.7377977972913432
225 225
1.067534893313999
173 173
0.9927272286162261
107 107
1.7990067933475542
271 271
2.798563110379083
217 217
0.36065327104864375
206 206
4.463958187441113
116 116
2.480846248680346