__Imports__

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

__Read in Data__

In [2]:
# Load the raw transaction log and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook
# will only run where this exact location exists.
transactions_csv = "C:/Users/Michael/Github/transactions_train.csv"
train = pd.read_csv(transactions_csv)
train.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


__Converting Hex to Integers__

In [3]:
def hex_to_signed(source):
    """Convert a hexadecimal string to a signed (two's-complement) integer.

    The bit width is taken from the string itself: every character counts
    as 4 bits and the most significant bit of the first character is the
    sign bit. Leading zeros are therefore significant.

    hex_to_signed("F") should return -1.
    hex_to_signed("0F") should return 15.

    Note: pass bare hex digits only. A "0x" prefix, underscores or
    surrounding whitespace are accepted by int() but would still be counted
    as digits when sizing the sign bit, giving a wrong result.

    Args:
        source: hexadecimal digits as a str.

    Returns:
        The signed int represented by `source`.

    Raises:
        ValueError: if `source` is not a str, is empty, or cannot be parsed
            as base-16 by int().
    """
    if not isinstance(source, str):
        raise ValueError("string type required")
    if not source:
        raise ValueError("string is empty")
    # One hex digit is 4 bits; the top bit of the whole value is the sign.
    sign_bit_mask = 1 << (len(source) * 4 - 1)
    other_bits_mask = sign_bit_mask - 1
    value = int(source, 16)
    # Two's complement: the sign bit contributes -2**(n-1), the rest is added.
    return -(value & sign_bit_mask) | (value & other_bits_mask)

__Selecting and formatting data types in dataframe__

In [4]:
# Parse purchase dates and shrink article ids to 32-bit integers.
train['t_dat'] = pd.to_datetime(train['t_dat'])
train['article_id'] = train['article_id'].astype('int32')
# customer_id arrives as a hex string; convert each value to a signed integer.
train['customer_id'] = train['customer_id'].apply(hex_to_signed)
# Keep only the columns used by the rest of the notebook.
kept_columns = ['t_dat','customer_id','article_id']
train = train[kept_columns]
train.head()

Unnamed: 0,t_dat,customer_id,article_id
0,2018-09-20,6116990065284462810764946559699987067290722768...,663713001
1,2018-09-20,6116990065284462810764946559699987067290722768...,541518023
2,2018-09-20,8639559392216894130813047803836240585733025821...,505221004
3,2018-09-20,8639559392216894130813047803836240585733025821...,685687003
4,2018-09-20,8639559392216894130813047803836240585733025821...,685687004


__Saving this for later__

In [5]:
# Keep a second reference to the full, cleaned transaction frame for the
# popularity calculation later on. This is an alias, not a copy: it stays
# intact only because the next cell rebinds `train` to a new DataFrame
# (the result of a merge) rather than modifying this object in place.
another_train = train

__Customer's Last Week Purchases__

In [6]:
# Most recent purchase date of every customer.
tmp = (
    train.groupby('customer_id')['t_dat']
    .max()
    .reset_index()
    .rename(columns={'t_dat': 'max_dat'})
)
train = train.merge(tmp, how='left', on='customer_id')
# Whole days between each purchase and that customer's most recent purchase.
train['diff_dat'] = (train['max_dat'] - train['t_dat']).dt.days
# Keep purchases made at most 6 days before the customer's latest one.
within_last_week = train['diff_dat'] <= 6
train = train.loc[within_last_week]
print('Train shape:',train.shape)
train.head()

Train shape: (5181535, 5)


Unnamed: 0,t_dat,customer_id,article_id,max_dat,diff_dat
49,2018-09-20,5981405501434810419031299841272056822028636268...,649356002,2018-09-20,0
50,2018-09-20,5981405501434810419031299841272056822028636268...,579941002,2018-09-20,0
51,2018-09-20,5981405501434810419031299841272056822028636268...,629760002,2018-09-20,0
52,2018-09-20,5981405501434810419031299841272056822028636268...,625229004,2018-09-20,0
61,2018-09-20,1132591264840105692788114398749857557049430095...,613456009,2018-09-20,0


__Recommend Most Often Previously Purchased Items__

In [7]:
# Number of rows per (customer, article) pair in the filtered frame.
pair_key = ['customer_id','article_id']
rank_by = ['ct','t_dat']
tmp = train.groupby(pair_key)['t_dat'].count().reset_index(name='ct')
train = train.merge(tmp, how='left', on=pair_key)
# Highest count first, most recent date breaking ties; then keep a single
# row per (customer, article) pair -- the first one in that ordering.
train = train.sort_values(by=rank_by, ascending=False)
train = train.drop_duplicates(subset=pair_key)
train = train.sort_values(by=rank_by, ascending=False)
train.head()

Unnamed: 0,t_dat,customer_id,article_id,max_dat,diff_dat,ct
1158548,2019-07-16,1122880311838375530037224418800749068125981861...,719348003,2019-07-16,0,100
80218,2018-10-04,-673191760722678788667587928976895833688046975...,557247001,2018-10-04,0,86
2149868,2020-03-06,9249039390400736591087536058610073983058079884...,852521001,2020-03-06,0,81
3423153,2020-07-06,-518177034797192986786595171574626635942011784...,685813001,2020-07-06,0,80
871769,2019-05-14,1283729545703826238940687917472870211201319841...,695545001,2019-05-14,0,80


__Recommend Items Purchased Together__

# USE PANDAS TO MAP COLUMN WITH DICTIONARY

In [8]:
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, which
# can execute code; only load 'pairs.npy' from a trusted source.
pairs_array = np.load('pairs.npy', allow_pickle=True)
# .item() unwraps the single object stored in the array -- presumably a
# dict of article_id -> paired article_id; confirm against how it was saved.
pairs = pairs_array.item()
# Articles without an entry in `pairs` get NaN in the new column.
train['article_id2'] = train['article_id'].map(pairs)
train.head()

Unnamed: 0,t_dat,customer_id,article_id,max_dat,diff_dat,ct,article_id2
1158548,2019-07-16,1122880311838375530037224418800749068125981861...,719348003,2019-07-16,0,100,719348001.0
80218,2018-10-04,-673191760722678788667587928976895833688046975...,557247001,2018-10-04,0,86,557247003.0
2149868,2020-03-06,9249039390400736591087536058610073983058079884...,852521001,2020-03-06,0,81,610776002.0
3423153,2020-07-06,-518177034797192986786595171574626635942011784...,685813001,2020-07-06,0,80,685813002.0
871769,2019-05-14,1283729545703826238940687917472870211201319841...,695545001,2019-05-14,0,80,573716012.0


# RECOMMENDATION OF PAIRED ITEMS

In [9]:
# One row per (customer, paired article): drop rows with no paired article,
# remove repeats, and expose the paired id under the name 'article_id'.
train2 = (
    train[['customer_id','article_id2']]
    .dropna(subset=['article_id2'])
    .drop_duplicates(subset=['customer_id','article_id2'])
    .rename(columns={'article_id2':'article_id'})
)
train2.head()

Unnamed: 0,customer_id,article_id
1158548,1122880311838375530037224418800749068125981861...,719348001.0
80218,-673191760722678788667587928976895833688046975...,557247003.0
2149868,9249039390400736591087536058610073983058079884...,610776002.0
3423153,-518177034797192986786595171574626635942011784...,685813002.0
871769,1283729545703826238940687917472870211201319841...,573716012.0


# CONCATENATE PAIRED ITEM RECOMMENDATIONS AFTER PREVIOUSLY PURCHASED RECOMMENDATIONS

In [10]:
# Stack the paired-item rows underneath the previously-purchased rows, cast
# article ids back to int32 (the paired ids are floats), and keep the first
# occurrence of each (customer, article) pair.
purchased = train[['customer_id','article_id']]
train = (
    pd.concat([purchased, train2], axis=0, ignore_index=True)
    .astype({'article_id': 'int32'})
    .drop_duplicates(subset=['customer_id','article_id'])
)
train.head()

Unnamed: 0,customer_id,article_id
0,1122880311838375530037224418800749068125981861...,719348003
1,-673191760722678788667587928976895833688046975...,557247001
2,9249039390400736591087536058610073983058079884...,852521001
3,-518177034797192986786595171574626635942011784...,685813001
4,1283729545703826238940687917472870211201319841...,695545001


# CONVERT RECOMMENDATIONS INTO SINGLE STRING

In [11]:
# Render each article id as text prefixed with ' 0', then concatenate the
# pieces per customer (summing strings joins them in row order).
train['article_id'] = ' 0' + train['article_id'].astype('str')
preds = train.groupby('customer_id')['article_id'].sum().reset_index()
preds = preds.rename(columns={'article_id': 'prediction'})
preds.head()

Unnamed: 0,customer_id,prediction
0,-578959872699432204840272251318678134742912003...,0918522001 0915526001
1,-578958955106318985703340409806304793781964660...,0918443004 0158340001
2,-578958851022090388530167605110184806675590806...,0734460001 0898983001 0469039019 0706016001 0...
3,-578958530827022703791934662315899566725724297...,0429313023 0429313025 0679284001 0429313008 0...
4,-578954918733087932666888089058340064531866922...,0696527001 0266875001 0554772041 0617534016 0...


In [13]:
# Restrict the full transaction history to purchases on or after the cutoff
# date and take the 12 most frequently bought articles from that window.
another_train['t_dat'] = pd.to_datetime(another_train['t_dat'])
cutoff_date = pd.to_datetime('2020-09-16')
another_train = another_train.loc[another_train['t_dat'] >= cutoff_date]
popular_ids = another_train['article_id'].value_counts().index.astype('str')[:12]
# Same ' 0'-prefixed, space-separated layout as the per-customer predictions.
top12 = ' 0' + ' 0'.join(popular_ids)
print("Last week's top 12 popular items:")
print( top12 )

Last week's top 12 popular items:
 0924243001 0924243002 0918522001 0923758001 0866731001 0909370001 0751471001 0915529003 0915529005 0448509014 0762846027 0714790020


In [14]:
def _first_unique_items(prediction, limit=12):
    """Return the first `limit` distinct space-separated items, order kept."""
    # dict.fromkeys drops repeats while preserving first-seen order.
    return ' '.join(list(dict.fromkeys(prediction.split()))[:limit])


# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
sub = pd.read_csv("C:/Users/Michael/Github/sample_submission.csv")
sub = sub[['customer_id']]
# preds is keyed by the integer form of customer_id, so convert the hex
# strings the same way before joining.
sub['customer_id_2'] = sub['customer_id'].apply(hex_to_signed)
sub = sub.merge(
    preds.rename({'customer_id':'customer_id_2'},axis=1),
    on='customer_id_2',
    how='left',
).fillna('')  # customers without personal predictions get an empty string

del sub['customer_id_2']
# Personal recommendations come first, the global top sellers fill the rest.
# An article can appear in both lists; cutting the joined string at a fixed
# character count would then keep the repeat and return fewer than 12
# distinct items, so de-duplicate (order preserved) and keep the first 12.
sub['prediction'] = (sub['prediction'] + top12).map(_first_unique_items)
sub.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0924243001 0924243002 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0826211002 0706016001 0924243001 0924243002 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0805000001 0924243001 0924243002 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0742079001 0732413001 0730683001 0372860001 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0896152002 0730683050 0927530004 0791587015 08...


In [16]:
# Write the final submission file, leaving out the DataFrame index.
sub.to_csv('submission_04_30_2022.csv', index=False)