In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/otto-recommender-system/sample_submission.csv
/kaggle/input/otto-recommender-system/test.jsonl
/kaggle/input/otto-recommender-system/train.jsonl


In [2]:
#train = pd.read_json('/kaggle/input/otto-recommender-system/train.jsonl',lines=True)
test = pd.read_json('/kaggle/input/otto-recommender-system/test.jsonl',lines=True)
test.head()

Unnamed: 0,session,events
0,12899779,"[{'aid': 59625, 'ts': 1661724000278, 'type': '..."
1,12899780,"[{'aid': 1142000, 'ts': 1661724000378, 'type':..."
2,12899781,"[{'aid': 141736, 'ts': 1661724000559, 'type': ..."
3,12899782,"[{'aid': 1669402, 'ts': 1661724000568, 'type':..."
4,12899783,"[{'aid': 255297, 'ts': 1661724000572, 'type': ..."


In [3]:
# convert the data format into dataframe which is easy to handle
type_confer={"clicks":0,"carts":1,"orders":2}
session_inform = []
for i in test.index:
    session = test.loc[i, "session"]
    events = test.loc[i, "events"]
    for event in events:
        session_inform.append([session,event["aid"],event["ts"],type_confer[event["type"]]])
test_df=pd.DataFrame(session_inform,columns=['session','aid','ts','type'])

In [4]:
np.save("./test_df.npy",test_df)
print(test_df.shape)

(6928123, 4)


In [5]:
test_df.head()

Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000278,0
1,12899780,1142000,1661724000378,0
2,12899780,582732,1661724058352,0
3,12899780,973453,1661724109199,0
4,12899780,736515,1661724136868,0


In [6]:
from tqdm import tqdm
from collections import defaultdict
import math
from operator import itemgetter
import numpy as np

In [7]:
def itemCFTrain(df):
    
    # create list for dict
    user_item_list = []
    for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
        user = int(row['session'])
        item = int(row['aid'])
        user_item_list.append([user, item])
    
    # create dict
    user_item_dict = dict()
    for user, item in tqdm(user_item_list):
         # change set() to list()
        user_item_dict.setdefault(user, list())
        user_item_dict[user].append(item)
    
    return user_item_dict

In [8]:
def ItemMatrix_fn(user_item_dict):
    
    # use python list to store sparse matrices
    # N[i] represents the number of actions (including licks, carts and orders)
    # conducted on item i (aid i)
    N = defaultdict(int)
    itemMatrix = defaultdict(int)
    for user, items in tqdm(user_item_dict.items()):
        for i in items:
            itemMatrix.setdefault(i, dict())
            N[i] += 1
            for j in items:
                itemMatrix[i].setdefault(j, 0)
                # if item (aid) i and j merge at the same time，
                # increase itemMatrix[i][j] by 1, otherwise it is not stored
                itemMatrix[i][j] += 1
    
    return itemMatrix, N

In [9]:
def ItemSimilarityMatrix_fn(ItemMatrix, N):
    
    itemSimMatrix = defaultdict(int)
    # cosine similarity
    for i, related_items in tqdm(ItemMatrix.items()):
        itemSimMatrix.setdefault(i, dict())
        for j, cij in related_items.items():
            itemSimMatrix[i].setdefault(j, 0)
            itemSimMatrix[i][j] = cij / math.sqrt(N[i] * N[j])
    
    # normalization
    for i, relations in tqdm(itemSimMatrix.items()):
        max_num = relations[max(relations, key=relations.get)]
        if max_num == 0:
            continue
        itemSimMatrix[i] = {k : v / max_num for k, v in relations.items()}
    
    return itemSimMatrix

In [10]:
uidict = itemCFTrain(test_df)
itemMatrix, N = ItemMatrix_fn(uidict)
itemSimMatrix = ItemSimilarityMatrix_fn(itemMatrix, N)
# in case of limited RAM, save intermediate results to the disk
np.save("./uidict.npy",uidict)
np.save("./itemMatrix.npy",itemMatrix)
np.save("./N.npy",N)
np.save("./itemSimMatrix.npy",itemSimMatrix)

100%|██████████| 6928123/6928123 [06:39<00:00, 17327.84it/s]
100%|██████████| 6928123/6928123 [00:08<00:00, 780838.27it/s] 
100%|██████████| 1671803/1671803 [01:13<00:00, 22881.90it/s]
100%|██████████| 783486/783486 [00:51<00:00, 15079.36it/s]
100%|██████████| 783486/783486 [00:14<00:00, 53137.51it/s] 


In [11]:
def recommend(trainData, itemSimMatrix, user, popularity):

    recommends = dict()
    items = trainData[user]
    for item in items:
        # for every item in session, get top 100 similarity scores
        for i, sim in sorted(itemSimMatrix[item].items(), key=itemgetter(1), reverse=True)[:100]:
            recommends.setdefault(i, 0.)
            recommends[i] += sim
    # sort and return top 20
    result = list(dict(sorted(recommends.items(), key=itemgetter(1), reverse=True)[:20]).keys())
    if len(result) < 20:
        result = result + popularity # if num of items < 20, use most popular items overall
        result = result[:20]
    
    return result

In [12]:
uidict = np.load('./uidict.npy', allow_pickle='TRUE')
itemMatrix = np.load('./itemMatrix.npy', allow_pickle='TRUE')
N = np.load('./N.npy', allow_pickle='TRUE')
itemSimMatrix = np.load('./itemSimMatrix.npy', allow_pickle='TRUE')

In [13]:
uidict=uidict.item()
itemMatrix=itemMatrix.item()
N=N.item()
itemSimMatrix=itemSimMatrix.item()

In [14]:
# pop is a list of most popular items (in terms of aids) overall
pop = list(dict(sorted(N.items(), key=itemgetter(1), reverse=True)[:20]).keys())
# users is the IDs of test session
users = list(uidict.keys())
re_items = []

for user in tqdm(users):
    result = list(map(str,recommend(uidict, itemSimMatrix, user, pop)))
    re_items.append(" ".join(result))
np.save("./re_items.npy",re_items)

100%|██████████| 1671803/1671803 [26:24<00:00, 1055.03it/s]


In [15]:
sub_types = list(np.tile(['_clicks', '_carts', '_orders'], len(users)))
sub_users = list(map(str, list(np.repeat(users, 3))))
sub_sessions = [i + j for i, j in zip(sub_users, sub_types)]
sub_items = list(np.repeat(re_items, 3))

sub = pd.DataFrame({
    'session_type': sub_sessions,
    'labels': sub_items
})

In [16]:
sub.head()

Unnamed: 0,session_type,labels
0,12899779_clicks,59625 485256 1460571 108125 986164 554660 1551...
1,12899779_carts,59625 485256 1460571 108125 986164 554660 1551...
2,12899779_orders,59625 485256 1460571 108125 986164 554660 1551...
3,12899780_clicks,1142000 582732 736515 973453 1777259 541564 16...
4,12899780_carts,1142000 582732 736515 973453 1777259 541564 16...


In [17]:
sub.to_csv('submission.csv', index=False)