In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc

from collections import defaultdict

In [2]:
article_df = pd.read_csv("articles.csv", dtype={'article_id': str})

In [3]:
article_df.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [None]:
%%time
transaction_df = pd.read_csv("transactions_train.csv", 
                             dtype={'article_id': str},
                             usecols=['t_dat', 'customer_id', 'article_id'])

transaction_df = transaction_df.groupby(['customer_id', 't_dat'], as_index=False)[['article_id']].agg(list)

In [5]:
transaction_df = transaction_df[transaction_df.t_dat >= '2020-05-01']
transaction_df = transaction_df[transaction_df.t_dat < '2020-06-01']

In [6]:
transaction_df['num_articles'] = transaction_df.article_id.apply(lambda x: len(set(x)))
transaction_df = transaction_df[transaction_df.num_articles > 1]
##transaction_df = transaction_df[transaction_df.num_articles<18]
transaction_df['article_id'] = transaction_df['article_id'].apply(lambda x: list(set(x)))

transaction_df.head()

Unnamed: 0,customer_id,t_dat,article_id,num_articles
150,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,2020-05-21,"[0864716001, 0889714001, 0841383002, 083273200...",6
166,0000f2ea26b7f0a9175f428c8cf7743e9e10e193465ecd...,2020-05-26,"[0808840004, 0858640004]",2
182,0001177027259b455f979d85a278e4b280205d4de5cce4...,2020-05-04,"[0863456003, 0570002090, 0863456005]",3
183,0001177027259b455f979d85a278e4b280205d4de5cce4...,2020-05-19,"[0820462009, 0708485004]",2
184,00012315fd38859ff2c446876ca507abbcbcf582d0e266...,2020-05-30,"[0842607004, 0842607002, 0690936001]",3


In [7]:
transaction_df.num_articles.describe()

count    264011.000000
mean          4.291011
std           3.117811
min           2.000000
25%           2.000000
50%           3.000000
75%           5.000000
max          86.000000
Name: num_articles, dtype: float64

In [8]:
transaction_df.num_articles.quantile(0.99)

16.0

In [9]:
print("number of transactions:", len(transaction_df))

number of transactions: 264011


In [10]:
item_map=defaultdict(int)
for it, article_ids in enumerate(transaction_df.article_id.values):
    num_articles = len(article_ids)
    for article_id in article_ids:
        item_map[article_id] += 1
        
item_df = pd.DataFrame.from_dict({
    'item': item_map.keys(),
    'freq': item_map.values()
})
item_df.head()

Unnamed: 0,item,freq
0,864716001,474
1,889714001,29
2,841383002,1257
3,832732003,239
4,832482001,534


In [11]:
item_df_popular =  item_df.sort_values(['freq','item'], ascending=[False,True])
item_df_popular[:10]

Unnamed: 0,item,freq
733,684209027,2003
686,599580038,1897
211,599580052,1845
1479,741356002,1827
43,688537004,1775
426,759871002,1768
132,610776002,1718
732,688537021,1609
714,854683002,1586
254,610776001,1575


In [12]:
item_df = item_df[item_df.freq > 20]
candidate_items = set(item_df.item.values)

In [13]:
len(candidate_items)

8900

In [14]:
%%time
pair_map={}
for it, article_ids in enumerate(transaction_df.article_id.values):
    if it%1000000 == 0:
        print(it)
    num_articles = len(article_ids)
    for i in range(num_articles):
        item1 = article_ids[i]
        if item1 not in candidate_items:
            continue
            
        for j in range(i+1, num_articles):
            item2 = article_ids[j]
            if item2 not in candidate_items:
                continue
                
            if item1 not in pair_map:
                pair_map[item1] = {}
            if item2 not in pair_map:
                pair_map[item2] = {}
            
            if item2 not in pair_map[item1]:
                pair_map[item1][item2] = 0
            if item1 not in pair_map[item2]:
                pair_map[item2][item1] = 0
            
            pair_map[item1][item2] += 1
            pair_map[item2][item1] += 1

0
CPU times: user 4.37 s, sys: 115 ms, total: 4.49 s
Wall time: 4.65 s


In [15]:
item1 = []
item2 = []
freq  = []

for i1 in pair_map.keys():
    for i2 in pair_map[i1].keys():
        v = pair_map[i1][i2]
        if v <= 20:
            continue
        item1.append(i1)
        item2.append(i2)
        freq.append(v)

pair_df = pd.DataFrame.from_dict({ 'item1': item1, 'item2': item2, 'joint_freq': freq})
pair_df['item_freq1'] = pair_df.item1.apply(lambda k: item_map[k])
pair_df['item_freq2'] = pair_df.item2.apply(lambda k: item_map[k])
pair_df.head()

Unnamed: 0,item1,item2,joint_freq,item_freq1,item_freq2
0,864716001,854677004,53,474,1417
1,864716001,854679003,42,474,957
2,864716001,854679002,35,474,959
3,864716001,854679005,37,474,988
4,864716001,854677003,48,474,1462


In [16]:
pair_df = pair_df[pair_df.joint_freq>20]
pair_df['confidence'] = pair_df.joint_freq.div(pair_df['item_freq1'])
pair_df = pair_df.sort_values(['item1', 'confidence'], ascending=[True, False])

pair_df = pair_df.groupby('item1').head(10)
pair_df.head()

Unnamed: 0,item1,item2,joint_freq,item_freq1,item_freq2,confidence
3106,160442007,160442010,95,366,637,0.259563
3107,160442007,160442043,35,366,147,0.095628
3188,160442010,160442007,95,637,366,0.149137
3190,160442010,160442043,45,637,147,0.070644
3189,160442010,372860001,21,637,1195,0.032967


In [17]:
print("number of pairs:", len(pair_df)//2)

number of pairs: 2684


In [18]:
pair_df.confidence.describe()

count    5369.000000
mean        0.154674
std         0.164152
min         0.015075
25%         0.061224
50%         0.098991
75%         0.172107
max         0.955224
Name: confidence, dtype: float64

In [19]:
pair_df.head()

Unnamed: 0,item1,item2,joint_freq,item_freq1,item_freq2,confidence
3106,160442007,160442010,95,366,637,0.259563
3107,160442007,160442043,35,366,147,0.095628
3188,160442010,160442007,95,637,366,0.149137
3190,160442010,160442043,45,637,147,0.070644
3189,160442010,372860001,21,637,1195,0.032967


## Predict

In [20]:
transaction_df = transaction_df.groupby('customer_id', as_index=False)[['article_id']].agg(list)
transaction_df.head()

Unnamed: 0,customer_id,article_id
0,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,"[[0864716001, 0889714001, 0841383002, 08327320..."
1,0000f2ea26b7f0a9175f428c8cf7743e9e10e193465ecd...,"[[0808840004, 0858640004]]"
2,0001177027259b455f979d85a278e4b280205d4de5cce4...,"[[0863456003, 0570002090, 0863456005], [082046..."
3,00012315fd38859ff2c446876ca507abbcbcf582d0e266...,"[[0842607004, 0842607002, 0690936001]]"
4,00015c1a121e08bbd2552c15fbbb6e6b19d3bf8f7b6a3d...,"[[0842605015, 0762846007]]"


In [21]:
def get_recommended_items(articles):
    df = pair_df[pair_df.item1.isin(articles)]
    df.groupby('item2', as_index=False)[['joint_freq']].sum().sort_values('joint_freq', ascending=False)
    df = df.head(10)
    items = df.item2.values
    items = ' '.join(items)
    return items

In [22]:
final_candidate_items = set(pair_df.item1.values)
print(len(final_candidate_items))

1884


In [36]:
preds=[]
for it, row in transaction_df.iterrows():
    customer_id = row.customer_id
    articles = []
    for article_lst in row.article_id:
        articles += article_lst
    articles = set(articles)
    cur_articles = final_candidate_items.intersection(articles)
    
    if len(cur_articles) == 0:
        continue
        
    pred_items = get_recommended_items(cur_articles)
    preds.append({
        'customer_id': customer_id,
        'prediction': pred_items
    })
    if it%100000==0:
        print(it)
pred_df = pd.DataFrame.from_dict(preds)
pred_df.head()

0
100000


Unnamed: 0,customer_id,prediction
0,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,0749699002 0749699008 0821163008 0716672001 07...
1,00012315fd38859ff2c446876ca507abbcbcf582d0e266...,0690936006 0690936002 0690936003 0690936021
2,00015c1a121e08bbd2552c15fbbb6e6b19d3bf8f7b6a3d...,0842605001
3,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,0825720005 0852746007 0825720003 0825720004 08...
4,00025f8226be50dcab09402a2cacd520a99e112fe01fdd...,0762286004 0762286002 0844411003 0844411001


In [37]:
test_df = pd.read_csv("H&MDatasetFinal1.csv",
                             usecols=['customer_id', 'Y'])

In [38]:
test_df = test_df[test_df.Y == 2.0]

In [39]:
customer_list = []
for customer_id in test_df.customer_id:
    customer_list.append(customer_id)
customer_set = set(customer_list)

In [40]:
%%time
transaction_df_test = pd.read_csv("transactions_train.csv", 
                             dtype={'article_id': str},
                             usecols=['t_dat', 'customer_id', 'article_id'])

CPU times: user 22.3 s, sys: 3.27 s, total: 25.6 s
Wall time: 26.3 s


In [41]:
transaction_df_test = transaction_df_test[transaction_df_test.t_dat >= '2020-06-01']
transaction_df_test = transaction_df_test[transaction_df_test.t_dat < '2020-10-01']

In [42]:
transaction_df_test = transaction_df_test[transaction_df_test.customer_id.isin(customer_set)]
transaction_df_test = transaction_df_test.groupby(['customer_id'], as_index=False)[['article_id']].agg(list)

In [43]:
transaction_df_test[:10]

Unnamed: 0,customer_id,article_id
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,[0826211002]
1,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,"[0848460002, 0848460002, 0848460002, 085274600..."
2,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,"[0917434002, 0685814048, 0895418003, 087546900..."
3,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,"[0842792001, 0859118002, 0886540001, 086942400..."
4,00023e3dd8618bc63ccad995a5ac62e21177338d642d66...,"[0861608003, 0875469004, 0733097008]"
5,0002cca4cc68601e894ab62839428e5f0696417fe0f9e8...,[0910601002]
6,0002d5b37e064c0da06d559a51a082746ad3893d11a086...,"[0856310002, 0856919005, 0910701003, 086216700..."
7,0002e6cdaab622b5047407efc0d0bf85e23220e0920120...,"[0849257001, 0812207003, 0510420008, 083095300..."
8,00061a04f030bdf3665b09829192ca8c13c4de6dd9ae9d...,"[0572797001, 0877429001, 0690936006, 080613100..."
9,0006bb0fad5c49341bd9cece264271e68e01a4e55f22ec...,"[0510283002, 0812659001, 0736581001, 080297900..."


In [44]:
new_df = pred_df.merge(transaction_df_test)

In [45]:
new_df

Unnamed: 0,customer_id,prediction,article_id
0,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,0749699002 0749699008 0821163008 0716672001 07...,"[0917434002, 0685814048, 0895418003, 087546900..."
1,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,0825720005 0852746007 0825720003 0825720004 08...,"[0842792001, 0859118002, 0886540001, 086942400..."
2,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,0372860001 0723469001 0704754001 0748355002 07...,"[0832505001, 0902992001, 0556260001, 057954107..."
3,0008d644deb96bdc0ca262f161cf6d5e9a4e619bb75faa...,0861712001 0857272001 0865533001 0875951002 08...,"[0869424001, 0855262002, 0908729002, 0237347060]"
4,000934651054f08396856cd83fad3b36b97ab95a0baf79...,0610776001 0610776083 0610776103 0610776072 05...,"[0750480004, 0790686006, 0841674002, 076327500..."
...,...,...,...
72510,fffae8eb3a282d8c43c77dd2ca0621703b71e90904dfde...,0506098007 0723469001 0253448003 0652924004 08...,"[0652924041, 0877599001, 0877599001, 055459806..."
72511,fffb68e203e88449a1dc7173e938b1b3e91b0c93ff4e1d...,0751551001 0751551004 0874465005 0874320001 07...,"[0723469002, 0717490058, 0834217009, 081642300..."
72512,fffb834e3b357155d4f72274f3621f68db9c4bac221851...,0841960003 0841960004,"[0706268031, 0783707041, 0783707037, 084808200..."
72513,fffe7116f9f68e8ad287fd7b6e33aad4871d7080e77d2d...,0717490059 0717490081 0610776072 0610776002 07...,"[0783346020, 0783346024, 0808698001, 087703700..."


In [46]:
correct = 0
incorrect = 0
for i in range(len(new_df)):
    for v in new_df['prediction'][i].split():
        if v in new_df['article_id'][i]:
            correct += 1
        else:
            incorrect += 1

In [34]:
print(correct)
print(incorrect)

9669
532486


In [55]:
accuracy_list = []
for i in range(new_df.shape[0]):
    if isinstance(new_df['article_id'][i], list)==False:
        accuracy=0
    else:
        list1 =  list(new_df['prediction'][i].split())
        list2 = list(new_df['article_id'][i])
        accuracy = len([i for i in list1 if i in list2])/len(list1)
    accuracy_list.append(accuracy)

new_df['accuracy'] = np.array(accuracy_list)

In [56]:
new_df

Unnamed: 0,customer_id,prediction,article_id,accuracy
0,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,0749699002 0749699008 0821163008 0716672001 07...,"[0917434002, 0685814048, 0895418003, 087546900...",0.000000
1,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,0825720005 0852746007 0825720003 0825720004 08...,"[0842792001, 0859118002, 0886540001, 086942400...",0.000000
2,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,0372860001 0723469001 0704754001 0748355002 07...,"[0832505001, 0902992001, 0556260001, 057954107...",0.000000
3,0008d644deb96bdc0ca262f161cf6d5e9a4e619bb75faa...,0861712001 0857272001 0865533001 0875951002 08...,"[0869424001, 0855262002, 0908729002, 0237347060]",0.000000
4,000934651054f08396856cd83fad3b36b97ab95a0baf79...,0610776001 0610776083 0610776103 0610776072 05...,"[0750480004, 0790686006, 0841674002, 076327500...",0.100000
...,...,...,...,...
72510,fffae8eb3a282d8c43c77dd2ca0621703b71e90904dfde...,0506098007 0723469001 0253448003 0652924004 08...,"[0652924041, 0877599001, 0877599001, 055459806...",0.111111
72511,fffb68e203e88449a1dc7173e938b1b3e91b0c93ff4e1d...,0751551001 0751551004 0874465005 0874320001 07...,"[0723469002, 0717490058, 0834217009, 081642300...",0.100000
72512,fffb834e3b357155d4f72274f3621f68db9c4bac221851...,0841960003 0841960004,"[0706268031, 0783707041, 0783707037, 084808200...",0.000000
72513,fffe7116f9f68e8ad287fd7b6e33aad4871d7080e77d2d...,0717490059 0717490081 0610776072 0610776002 07...,"[0783346020, 0783346024, 0808698001, 087703700...",0.000000


In [57]:
new_df.accuracy.describe()

count    72515.000000
mean         0.018657
std          0.075082
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: accuracy, dtype: float64

In [58]:
new_df[new_df['accuracy']!= 0]

Unnamed: 0,customer_id,prediction,article_id,accuracy
4,000934651054f08396856cd83fad3b36b97ab95a0baf79...,0610776001 0610776083 0610776103 0610776072 05...,"[0750480004, 0790686006, 0841674002, 076327500...",0.100000
8,000c5c714aefd0d5ed1205e2781070167826ffc117ab9e...,0399256037 0399256001 0636323002 0399256023 05...,"[0579541072, 0869691001, 0579541001, 090195500...",0.100000
11,000fb6e772c5d0023892065e659963da90b1866035558e...,0572797041 0572797001 0841383003 0841383002 05...,"[0865470002, 0832732001, 0900382001, 091435100...",0.100000
28,001ddeb8fb74fec5693116da83b488e05ee9a9e179f3fd...,0706016015 0706016002 0706016019 0706016006 07...,"[0706016002, 0706016053, 0842000001, 084579000...",0.100000
30,001f5299820c00df306221ff581abf9d18507c2e35ecb3...,0832361001 0832361003 0832361007 0832362002 08...,"[0821336004, 0872453002, 0818031002, 088492000...",0.100000
...,...,...,...,...
72484,ffeb041f188b71de1b7354e8fa0369c14c22a1b4d5f55e...,0749699002 0749699008 0821163008 0716672001 07...,"[0870970001, 0870970001, 0887830002, 085816100...",0.100000
72489,ffedd10bbc166ed253113951a1c028389064df97a48198...,0810172002 0814817002 0838900002 0838900003 08...,"[0814817001, 0857812003, 0850906001, 086209200...",0.400000
72497,fff2c4204fac63f93aec10ed657958d372efe948de1492...,0470789031 0863515004,"[0826211001, 0470789031, 0253448062, 080069100...",0.500000
72510,fffae8eb3a282d8c43c77dd2ca0621703b71e90904dfde...,0506098007 0723469001 0253448003 0652924004 08...,"[0652924041, 0877599001, 0877599001, 055459806...",0.111111


In [59]:
new_df[new_df['accuracy']!= 0].accuracy.describe()

count    7228.000000
mean        0.187178
std         0.158164
min         0.100000
25%         0.100000
50%         0.100000
75%         0.200000
max         1.000000
Name: accuracy, dtype: float64