### Load Transactions, Reduce Memory  

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

In [64]:
# Show every column whenever a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Only the transaction log is read in this cell.
transactions = pd.read_csv("input/transactions_train.csv")

In [65]:
# Compress the identifiers: customer_id becomes the integer value of its last
# 16 hexadecimal characters (cast to int64) and article_id becomes int32.
transactions['customer_id'] = (
    transactions['customer_id']
    .apply(lambda hex_id: int(hex_id[-16:], 16))
    .astype('int64')
)
transactions['article_id'] = transactions['article_id'].astype('int32')
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

# Keep only the date, customer and article columns, in that order.
transactions = transactions.loc[:, ['t_dat', 'customer_id', 'article_id']]
print(transactions.shape, transactions.head(), sep='\n')

(31788324, 3)
       t_dat          customer_id  article_id
0 2018-09-20    -6846340800584936   663713001
1 2018-09-20    -6846340800584936   541518023
2 2018-09-20 -8334631767138808638   505221004
3 2018-09-20 -8334631767138808638   685687003
4 2018-09-20 -8334631767138808638   685687004


## 0. Find each customer's latest purchase and their purchases within 7 days of it

In [66]:
# Most recent purchase date of every customer, one row per customer_id.
tmp = (
    transactions
    .groupby('customer_id', as_index=False)['t_dat']
    .max()
    .rename(columns={'t_dat': 'max_dat'})
)

In [68]:
# Attach each customer's latest purchase date to every one of their rows.
# A grouped transform computes the per-customer maximum in place of a merge,
# which keeps this cell safe to re-run: merging a second time would produce
# suffixed duplicates (max_dat_x / max_dat_y) and break the max_dat lookup.
transactions = transactions.assign(
    max_dat=transactions.groupby('customer_id')['t_dat'].transform('max'),
)
# Whole days between a purchase and that customer's latest purchase
# (0 means the purchase happened on the latest purchase day).
transactions['diff_dat'] = (transactions['max_dat'] - transactions['t_dat']).dt.days


In [69]:
# Keep the purchases made at most 6 days before the customer's latest purchase,
# i.e. a 7-day window that includes the latest purchase day itself.
transactions = transactions[transactions['diff_dat'].le(6)]
print(transactions.head(), transactions.shape, sep='\n')

        t_dat          customer_id  article_id    max_dat  diff_dat
49 2018-09-20 -5667465470176998279   649356002 2018-09-20         0
50 2018-09-20 -5667465470176998279   579941002 2018-09-20         0
51 2018-09-20 -5667465470176998279   629760002 2018-09-20         0
52 2018-09-20 -5667465470176998279   625229004 2018-09-20         0
61 2018-09-20  -232048505547517783   613456009 2018-09-20         0
(5181535, 5)


## 1. Recommend customers items which they bought the most.  

In [215]:
# Load the transaction data, reading only the columns this section needs so the
# unused columns of the file never reach memory. `usecols` returns columns in
# file order, hence the explicit re-selection to fix the column order.
most_bought_cols = ['customer_id', 'article_id', 't_dat']
trans_most_df = pd.read_csv("input/transactions_train.csv", usecols=most_bought_cols)[most_bought_cols]
# Shrink: customer_id -> int64 built from its last 16 hex characters,
# article_id -> int32, t_dat -> datetime.
trans_most_df['customer_id'] = trans_most_df['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64')
trans_most_df['article_id'] = trans_most_df['article_id'].astype('int32')
trans_most_df['t_dat'] = pd.to_datetime(trans_most_df['t_dat'])

In [216]:
# Number of transactions per (customer, article) pair, stored in column `ct`.
trans_most_df = (
    trans_most_df
    .groupby(['customer_id', 'article_id'])
    .agg(ct=('t_dat', 'count'))
    .reset_index()
)
trans_most_df

Unnamed: 0,customer_id,article_id,ct
0,-9223352921020755230,539723001,1
1,-9223352921020755230,568597006,1
2,-9223352921020755230,657510001,1
3,-9223352921020755230,673396002,1
4,-9223352921020755230,706016001,3
...,...,...,...
27306434,9223357421094039679,710900001,1
27306435,9223357421094039679,747984001,2
27306436,9223357421094039679,747984002,1
27306437,9223370729206611574,679525012,1


In [219]:
# Most bought item of every customer: order the pairs by purchase count,
# highest first, then keep the first row encountered for each customer.
trans_most_df = (
    trans_most_df
    .sort_values(by='ct', ascending=False)
    .drop_duplicates(subset='customer_id', keep='first')
)

# unique customer_id 1362281 

Unnamed: 0,customer_id,article_id,ct
22164211,5743206735920807081,678342001,570
3596714,-6801691935245814174,629420001,199
13118781,-354604980049654278,507909001,188
2929681,-7250849952617471376,570002001,170
7449486,-4193172709156082837,688558002,166
...,...,...,...
9727145,-2643262122928061301,687704022,1
9727134,-2643268431573618120,803468002,1
9726865,-2643330769870893851,936622001,1
9726867,-2643322680570466252,640735003,1


In [220]:
# Turn the integer article ids back into zero-padded strings. zfill(10) pads to
# a fixed width, so re-running this cell leaves already-padded ids untouched,
# whereas a plain '0' + ... prefix would add one more zero on every run.
# NOTE(review): assumes article ids are 10 characters wide; every id displayed
# in this notebook has nine digits, for which this equals the '0' prefix - confirm.
trans_most_df['article_id'] = trans_most_df['article_id'].astype('str').str.zfill(10)
trans_most_df

Unnamed: 0,customer_id,article_id,ct
22164211,5743206735920807081,0678342001,570
3596714,-6801691935245814174,0629420001,199
13118781,-354604980049654278,0507909001,188
2929681,-7250849952617471376,0570002001,170
7449486,-4193172709156082837,0688558002,166
...,...,...,...
9727145,-2643262122928061301,0687704022,1
9727134,-2643268431573618120,0803468002,1
9726865,-2643330769870893851,0936622001,1
9726867,-2643322680570466252,0640735003,1


In [222]:
# merge with customers
# Only the id column is needed, so the remaining customer columns are not loaded.
customers_df = pd.read_csv("input/customers.csv", usecols=['customer_id'])
# Integer key built the same way as the key in trans_most_df: last 16 hex chars -> int64.
customers_df['customer_intid'] = customers_df['customer_id'].apply(lambda x: int(x[-16:],16)).astype('int64')
# merge with output
# Left join keeps every customer; those without a match get NaN in article_id / ct.
# validate='many_to_one' makes the merge fail loudly if trans_most_df holds more
# than one row per customer instead of silently duplicating customers.
customers_df = customers_df.merge(
    trans_most_df,
    how='left',
    left_on='customer_intid',
    right_on='customer_id',
    validate='many_to_one',
)
# Both inputs have a customer_id column, so the merge suffixes them;
# customer_id_x is the original string id read from customers.csv.
customers_df = customers_df[['customer_id_x','article_id','ct']]



TypeError: 'list' object is not callable

In [224]:
# rename() returns a new frame, so the result has to be assigned back; otherwise
# customers_df keeps the customer_id_x column and any later export carries it.
customers_df = customers_df.rename(columns={'customer_id_x':'customer_id'})
customers_df

Unnamed: 0,customer_id,article_id,ct
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0797065001,2.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0811835004,4.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0663713001,2.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0742079001,1.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0399061015,2.0
...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0712924008,3.0
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0663568009,2.0
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0399256013,2.0
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0866755002,2.0


In [225]:
# Write the per-customer result to a CSV file; with the default settings the
# DataFrame index is written as the first column.
customers_df.to_csv(path_or_buf='customer_most_bought.csv')

### 2. Recommend Items Purchased Together 
Use `drop_duplicates` so that we don't recommend an item that the user has already bought and that we have already recommended above.

We will recommend previously purchased items first and items purchased together second.

In [184]:
# get together items
# Read only the two columns used for the co-purchase analysis so the unused
# columns of the file never reach memory. `usecols` returns columns in file
# order, hence the explicit re-selection to fix the column order.
pair_cols = ['customer_id', 'article_id']
df = pd.read_csv("input/transactions_train.csv", usecols=pair_cols)[pair_cols]

In [185]:
# customer_id -> int64 built from its last 16 hex characters; article_id -> int32.
df['customer_id'] = (
    df['customer_id']
    .apply(lambda hex_id: int(hex_id[-16:], 16))
    .astype('int64')
)
df['article_id'] = df['article_id'].astype('int32')


In [186]:
# Find items purchased together.
# Step 1: number of transactions per article, most frequently bought first.
vc = df['article_id'].value_counts()


In [187]:
pairs = {}
# Only the 1032 most frequently bought articles get an entry; value_counts
# returns them in descending order of frequency.
for article in vc.index.values[:1032]:
    # Plain Python int: used both for the comparisons and as the dictionary key.
    article_id = article.item()

    # Customers who bought this article at least once.
    USERS = df.loc[df.article_id == article_id, 'customer_id'].unique()

    # Every other article bought by those customers, counted per transaction.
    # value_counts sorts descending, so the first entries are the articles
    # most often bought by the same customers.
    vc2 = df.loc[(df.customer_id.isin(USERS)) & (df.article_id != article_id), 'article_id'].value_counts()

    # Keep up to three companions as '0'-prefixed id strings. Slicing (rather
    # than indexing positions 0, 1, 2) avoids an IndexError when fewer than
    # three other articles were bought by these customers.
    pairs[article_id] = ['0' + str(other) for other in vc2.index[:3]]

In [227]:
## Persist the lookup dictionary of items most often bought together
import pickle

# `pairs` maps an integer article id to a list of '0'-prefixed article id strings.
# NOTE(review): the file name reads 'paris' - possibly meant 'pairs'; left
# unchanged because whatever loads this file may rely on the current name.
with open('paris.pkl', mode='wb') as pkl_file:
    pickle.dump(pairs, pkl_file)
    print('dictionary saved successfully to file')
## Load pickle 

dictionary saved successfully to file


## 3. Recommend last week's top items

In [144]:
# Keep the transactions dated 2020-09-16 or later.
# NOTE(review): the cutoff is hard-coded; presumably the start of the final
# week of the data - confirm against the latest t_dat.
week_start = pd.to_datetime('2020-09-16')
lastweek = transactions.loc[transactions['t_dat'] >= week_start]

In [150]:
# The 12 articles with the most transactions in `lastweek`, as '0'-prefixed
# id strings ordered from most to least frequent.
weekly_counts = lastweek['article_id'].value_counts()
top12 = '0' + weekly_counts.index[:12].astype('str')

In [226]:
# Save the ids as a fixed-width string array. Passing the pandas Index directly
# stores an object array, which np.save has to pickle and which np.load then
# refuses to read unless allow_pickle=True is given.
np.save('top12.npy', np.asarray(top12, dtype=str))
