# Fabric importance
In this notebook a train test based on weeks 50 and 51 (previous year) is created, and tested on week 104.

### Import of libraries and datasets

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
articles = pd.read_parquet("../generated_data/articles_with_fabric_namenum.parquet")
articles

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,fabric_num,fabric_name
0,108775015,108775,12855,253,9,0,1010016,0,9,0,...,0,1,0,16,30,1002,2,8834,[0],[jersey]
1,108775044,108775,12855,253,9,0,1010016,0,10,2,...,0,1,0,16,30,1002,2,8834,[0],[jersey]
2,108775051,108775,44846,253,9,0,1010017,3,11,11,...,0,1,0,16,30,1002,2,8834,[0],[jersey]
3,110065001,110065,8159,306,13,4,1010016,0,9,0,...,7,1,0,61,5,1017,4,8243,[1],[microfibre]
4,110065002,110065,8159,306,13,4,1010016,0,10,2,...,7,1,0,61,5,1017,4,8243,[1],[microfibre]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105537,953450001,953450,24043,302,14,7,1010014,6,9,0,...,2,3,3,26,16,1021,13,28026,"[4, 13, 3]","[cotton, knit, elastic]"
105538,953763001,953763,24027,253,9,0,1010016,0,9,0,...,0,1,0,2,15,1005,0,28025,[8],[polyester]
105539,956217002,956217,24040,265,1,2,1010016,0,9,0,...,0,1,0,18,12,1005,0,28024,[0],[jersey]
105540,957375001,957375,24041,72,48,3,1010016,0,9,0,...,1,2,2,52,25,1019,1,28023,"[5, 6]","[plastic, hair]"


In [3]:
a2 = pd.read_csv("../data/articles.csv")

In [4]:
style_cols = ["graphical_appearance_name", "perceived_colour_master_name", "section_name", "index_group_name"]
for col in style_cols:
    print(col)
    print(a2[col].unique())
    print()

graphical_appearance_name
['Solid' 'Stripe' 'All over pattern' 'Melange' 'Transparent' 'Metallic'
 'Application/3D' 'Denim' 'Colour blocking' 'Dot' 'Other structure'
 'Contrast' 'Treatment' 'Check' 'Chambray' 'Front print'
 'Glittering/Metallic' 'Mixed solid/pattern' 'Placement print'
 'Other pattern' 'Neps' 'Embroidery' 'Lace' 'Jacquard' 'Unknown' 'Argyle'
 'Slub' 'Mesh' 'Sequin' 'Hologram']

perceived_colour_master_name
['Black' 'White' 'Beige' 'Grey' 'Blue' 'Pink' 'Lilac Purple' 'Red' 'Mole'
 'Orange' 'Metal' 'Brown' 'Turquoise' 'Yellow' 'Khaki green' 'Green'
 'undefined' 'Unknown' 'Yellowish Green' 'Bluish Green']

section_name
['Womens Everyday Basics' 'Womens Lingerie'
 'Womens Nightwear, Socks & Tigh' 'Baby Essentials & Complements'
 'Men Underwear' 'Mama' 'Womens Small accessories' 'Men H&M Sport'
 'Kids Boy' 'Divided Basics' 'Girls Underwear & Basics' 'Mens Outerwear'
 'Womens Big accessories' 'Divided Accessories'
 'Womens Swimwear, beachwear' 'Divided Selected' 'Boys Underwe

In [5]:
customers = pd.read_parquet("../data/customers.parquet")
customers

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,6883939031699146327,-1,-1,0,0,49,6305
1,11246327431398957306,-1,-1,0,0,25,33726
2,18439897732908966680,-1,-1,0,0,24,3247
3,18352672461570950206,-1,-1,0,0,54,168643
4,18162778555210377306,1,1,0,1,52,168645
...,...,...,...,...,...,...,...
1371975,7551062398649767985,-1,-1,0,0,24,50351
1371976,9305341941720086711,-1,-1,0,0,21,80169
1371977,10160427316885688932,1,1,0,1,21,106737
1371978,2551401172826382186,1,1,0,1,18,111894


In [6]:
transactions = pd.read_parquet("../data/transactions_train.parquet")
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
25784,2018-09-20,1728846800780188,519773001,0.028458,2,0
25785,2018-09-20,1728846800780188,578472001,0.032525,2,0
5389,2018-09-20,2076973761519164,661795002,0.167797,2,0
5390,2018-09-20,2076973761519164,684080003,0.101678,2,0
47429,2018-09-20,2918879973994241,662980001,0.033881,1,0
...,...,...,...,...,...,...
31774722,2020-09-22,18439937050817258297,891591003,0.084729,2,104
31774723,2020-09-22,18439937050817258297,869706005,0.084729,2,104
31779097,2020-09-22,18440902715633436014,918894002,0.016932,1,104
31779098,2020-09-22,18440902715633436014,761269001,0.016932,1,104


In [7]:
tr = transactions.merge(articles, on="article_id")
tr

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,product_code,prod_name,product_type_no,product_type_name,...,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,fabric_num,fabric_name
0,2018-09-20,1728846800780188,519773001,0.028458,2,0,519773,7147,245,17,...,0,1,0,15,0,1003,3,10231,"[13, 24]","[knit, wool]"
1,2018-09-20,12679047700498299353,519773001,0.035576,2,0,519773,7147,245,17,...,0,1,0,15,0,1003,3,10231,"[13, 24]","[knit, wool]"
2,2018-09-20,17920410454051211922,519773001,0.035576,2,0,519773,7147,245,17,...,0,1,0,15,0,1003,3,10231,"[13, 24]","[knit, wool]"
3,2018-09-21,12044778652893604504,519773001,0.035576,2,0,519773,7147,245,17,...,0,1,0,15,0,1003,3,10231,"[13, 24]","[knit, wool]"
4,2018-09-21,12044778652893604504,519773001,0.035576,2,0,519773,7147,245,17,...,0,1,0,15,0,1003,3,10231,"[13, 24]","[knit, wool]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31788319,2020-09-22,17026193828065643786,533261032,0.033881,2,104,533261,280,256,29,...,5,4,1,44,2,1002,2,185,"[4, 0]","[cotton, jersey]"
31788320,2020-09-22,17104273831947945561,875646001,0.033881,2,104,875646,3070,272,0,...,3,4,1,46,8,1009,5,31807,"[4, 3, 22]","[cotton, elastic, twill]"
31788321,2020-09-22,17843582181630128591,757903026,0.015237,1,104,757903,357,80,28,...,6,1,0,65,29,1019,1,166,[-1],
31788322,2020-09-22,17970366705591601158,892893001,0.135576,2,104,892893,29006,252,2,...,0,1,0,14,36,1003,3,27251,"[13, 45, 24]","[knit, cashmere, wool]"


In [8]:
#Limit transactions to THIS FALL:
#TRAIN = WEEK 50 - 51

tr_this_fall = tr[(tr.week >= 102-52) & (tr.week <= 103-52)]
print(tr_this_fall.shape)

(510166, 32)


In [9]:
tr_this_fall.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,product_code,prod_name,product_type_no,product_type_name,...,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,fabric_num,fabric_name
4086,2019-09-05,2832662598733435800,662980001,0.022017,2,50,662980,2110,252,2,...,0,1,0,15,0,1003,3,8172,"[13, 24]","[knit, wool]"
10381,2019-09-14,9697704321606698314,688873001,0.033881,1,51,688873,131,258,5,...,0,1,0,11,7,1010,6,22,[-1],
10382,2019-09-16,7882505936672743154,688873001,0.033881,1,51,688873,131,258,5,...,0,1,0,11,7,1010,6,22,[-1],
10621,2019-09-06,7474721472941889727,673911002,0.050831,2,50,673911,11338,94,16,...,6,1,0,64,20,1020,7,7204,"[3, 16]","[elastic, suede]"
10622,2019-09-10,9167082501483725962,673911002,0.050831,2,50,673911,11338,94,16,...,6,1,0,64,20,1020,7,7204,"[3, 16]","[elastic, suede]"


In [10]:
cus_this_fall = tr_this_fall["customer_id"].unique().tolist()
len(cus_this_fall)

125320

In [11]:
art_this_fall = tr_this_fall["article_id"].unique().tolist()
len(art_this_fall)

24844

#### ARTICLES MATRIX

In [12]:
index_to_art_id = pd.DataFrame({'article_id': art_this_fall})

In [13]:
index_to_art_id.sort_values(by='article_id')
index_to_art_id['original_index'] = index_to_art_id.index
index_to_art_id.head()

Unnamed: 0,article_id,original_index
0,662980001,0
1,688873001,1
2,673911002,2
3,676352001,3
4,615141003,4


In [14]:
articles_this_fall = articles[articles['article_id'].isin(art_this_fall)]

In [15]:
articles_this_fall_fabric = articles_this_fall[['article_id', 'fabric_num']]
articles_this_fall_style = articles_this_fall[["article_id", "graphical_appearance_name", "perceived_colour_master_name", "section_name", "index_group_name"]]

print("training")
print(articles_this_fall_fabric.shape)
print(articles_this_fall_style.shape)


training
(24844, 2)
(24844, 5)


In [16]:
from sklearn.preprocessing import MultiLabelBinarizer

#I make sure that every value inside fabric num lists is counted as a possible value
mlb = MultiLabelBinarizer(sparse_output=True)

articles_this_fall_fabric = articles_this_fall_fabric.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(articles_this_fall_fabric.pop('fabric_num')),
                index=articles_this_fall_fabric.index,
                columns=mlb.classes_))

articles_this_fall_fabric.head()

Unnamed: 0,article_id,-1,0,1,2,3,4,5,6,7,...,72,75,77,79,80,81,82,84,87,88
0,108775015,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,108775044,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,110065001,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,110065002,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,110065011,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
#remove article id
articles_this_fall_fabric = articles_this_fall_fabric.drop(['article_id'], axis=1)


In [18]:
one_hot_encoded = pd.get_dummies(articles_this_fall[['graphical_appearance_name', 'perceived_colour_master_name', 'section_name', 'index_group_name']], columns=['graphical_appearance_name', 'perceived_colour_master_name', 'section_name', 'index_group_name'])
one_hot_encoded = one_hot_encoded.astype(int)
one_hot_encoded

Unnamed: 0,graphical_appearance_name_0,graphical_appearance_name_1,graphical_appearance_name_2,graphical_appearance_name_3,graphical_appearance_name_4,graphical_appearance_name_5,graphical_appearance_name_6,graphical_appearance_name_7,graphical_appearance_name_8,graphical_appearance_name_9,...,section_name_51,section_name_52,section_name_53,section_name_54,section_name_55,index_group_name_0,index_group_name_1,index_group_name_2,index_group_name_3,index_group_name_4
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90483,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
90484,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
90490,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
90728,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [19]:
merged_matrix_art = one_hot_encoded.join(articles_this_fall_fabric)

In [20]:
merged_matrix_art = csr_matrix(merged_matrix_art.values)
print(merged_matrix_art)

  (0, 0)	1
  (0, 29)	1
  (0, 78)	1
  (0, 104)	1
  (0, 110)	1
  (1, 0)	1
  (1, 31)	1
  (1, 78)	1
  (1, 104)	1
  (1, 110)	1
  (2, 0)	1
  (2, 29)	1
  (2, 53)	1
  (2, 104)	1
  (2, 111)	1
  (3, 0)	1
  (3, 31)	1
  (3, 53)	1
  (3, 104)	1
  (3, 111)	1
  (4, 0)	1
  (4, 35)	1
  (4, 53)	1
  (4, 104)	1
  (4, 111)	1
  :	:
  (24839, 114)	1
  (24840, 8)	1
  (24840, 33)	1
  (24840, 52)	1
  (24840, 105)	1
  (24840, 130)	1
  (24841, 7)	1
  (24841, 29)	1
  (24841, 52)	1
  (24841, 105)	1
  (24841, 113)	1
  (24841, 123)	1
  (24841, 133)	1
  (24841, 142)	1
  (24842, 0)	1
  (24842, 31)	1
  (24842, 57)	1
  (24842, 104)	1
  (24842, 116)	1
  (24842, 119)	1
  (24843, 0)	1
  (24843, 35)	1
  (24843, 60)	1
  (24843, 104)	1
  (24843, 138)	1


In [21]:
print(merged_matrix_art.shape)

(24844, 188)


#### MATRIX FOR CUSTOMER STYLE

In [22]:
index_to_cus_id = pd.DataFrame({'customer_id': cus_this_fall})

In [23]:
index_to_cus_id.sort_values(by='customer_id')
index_to_cus_id['original_index'] = index_to_cus_id.index
index_to_cus_id.head()

Unnamed: 0,customer_id,original_index
0,2832662598733435800,0
1,9697704321606698314,1
2,7882505936672743154,2
3,7474721472941889727,3
4,9167082501483725962,4


In [24]:
customers_this_fall_fabric = tr_this_fall[['customer_id', 'fabric_num']]
customers_this_fall_style = tr_this_fall[["customer_id", "graphical_appearance_name", "perceived_colour_master_name", "section_name", "index_group_name"]]

print("training")
print(customers_this_fall_fabric.shape)
print(customers_this_fall_style.shape)


training
(510166, 2)
(510166, 5)


In [25]:
print(index_to_cus_id.shape)

(125320, 2)


In [26]:
from sklearn.preprocessing import MultiLabelBinarizer

#I make sure that every value inside fabric num lists is counted as a possible value
mlb = MultiLabelBinarizer(sparse_output=True)

customers_this_fall_fabric = customers_this_fall_fabric.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(customers_this_fall_fabric.pop('fabric_num')),
                index=customers_this_fall_fabric.index,
                columns=mlb.classes_))

customers_this_fall_fabric.head()

Unnamed: 0,customer_id,-1,0,1,2,3,4,5,6,7,...,72,75,77,79,80,81,82,84,87,88
4086,2832662598733435800,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10381,9697704321606698314,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10382,7882505936672743154,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10621,7474721472941889727,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10622,9167082501483725962,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
customers_this_fall_fabric = customers_this_fall_fabric.groupby('customer_id').sum()
customers_this_fall_fabric.head()

Unnamed: 0_level_0,-1,0,1,2,3,4,5,6,7,8,...,72,75,77,79,80,81,82,84,87,88
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
77117344919861,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200292573348128,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
272412481300040,1,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
375055163245029,0,2,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
388813703204080,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
customers_this_fall_fabric.sort_values(by='customer_id')

Unnamed: 0_level_0,-1,0,1,2,3,4,5,6,7,8,...,72,75,77,79,80,81,82,84,87,88
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
77117344919861,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200292573348128,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
272412481300040,1,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
375055163245029,0,2,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
388813703204080,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18445187566593112488,10,1,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18445401152158227288,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18445934941549932698,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18446384416408490004,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
sparse_cus_fabric = csr_matrix(customers_this_fall_fabric.values)
sparse_cus_fabric

<125320x79 sparse matrix of type '<class 'numpy.intc'>'
	with 433824 stored elements in Compressed Sparse Row format>

In [30]:
one_hot_encoded = pd.get_dummies(tr_this_fall[['graphical_appearance_name', 'perceived_colour_master_name', 'section_name', 'index_group_name']], columns=['graphical_appearance_name', 'perceived_colour_master_name', 'section_name', 'index_group_name'])
one_hot_encoded = one_hot_encoded.astype(int)
one_hot_encoded

Unnamed: 0,graphical_appearance_name_0,graphical_appearance_name_1,graphical_appearance_name_2,graphical_appearance_name_3,graphical_appearance_name_4,graphical_appearance_name_5,graphical_appearance_name_6,graphical_appearance_name_7,graphical_appearance_name_8,graphical_appearance_name_9,...,section_name_51,section_name_52,section_name_53,section_name_54,section_name_55,index_group_name_0,index_group_name_1,index_group_name_2,index_group_name_3,index_group_name_4
4086,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
10381,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
10382,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
10621,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
10622,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22974363,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
22974607,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
22974700,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
22974704,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [31]:
one_hot_encoded['customer_id'] = tr_this_fall['customer_id']
one_hot_encoded

Unnamed: 0,graphical_appearance_name_0,graphical_appearance_name_1,graphical_appearance_name_2,graphical_appearance_name_3,graphical_appearance_name_4,graphical_appearance_name_5,graphical_appearance_name_6,graphical_appearance_name_7,graphical_appearance_name_8,graphical_appearance_name_9,...,section_name_52,section_name_53,section_name_54,section_name_55,index_group_name_0,index_group_name_1,index_group_name_2,index_group_name_3,index_group_name_4,customer_id
4086,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,2832662598733435800
10381,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,9697704321606698314
10382,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,7882505936672743154
10621,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,7474721472941889727
10622,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,9167082501483725962
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22974363,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,16393473652392728316
22974607,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,16438977891077636889
22974700,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,16438977891077636889
22974704,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,16701333218391860126


In [32]:
one_hot_encoded = one_hot_encoded.groupby('customer_id').sum()

In [33]:
one_hot_encoded.head()

Unnamed: 0_level_0,graphical_appearance_name_0,graphical_appearance_name_1,graphical_appearance_name_2,graphical_appearance_name_3,graphical_appearance_name_4,graphical_appearance_name_5,graphical_appearance_name_6,graphical_appearance_name_7,graphical_appearance_name_8,graphical_appearance_name_9,...,section_name_51,section_name_52,section_name_53,section_name_54,section_name_55,index_group_name_0,index_group_name_1,index_group_name_2,index_group_name_3,index_group_name_4
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
77117344919861,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
200292573348128,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
272412481300040,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
375055163245029,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
388813703204080,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [34]:
one_hot_encoded.shape

(125320, 109)

In [35]:
merged_cus = pd.merge(customers_this_fall_fabric, one_hot_encoded, on="customer_id", how="inner")
merged_cus.sort_values(by='customer_id')
merged_cus


Unnamed: 0_level_0,-1,0,1,2,3,4,5,6,7,8,...,section_name_51,section_name_52,section_name_53,section_name_54,section_name_55,index_group_name_0,index_group_name_1,index_group_name_2,index_group_name_3,index_group_name_4
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
77117344919861,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
200292573348128,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
272412481300040,1,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
375055163245029,0,2,0,0,0,1,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
388813703204080,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18445187566593112488,10,1,0,0,2,1,0,0,0,0,...,0,0,0,0,0,22,0,6,0,0
18445401152158227288,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
18445934941549932698,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
18446384416408490004,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,2,0,0


In [36]:
merged_cus = merged_cus.reset_index()

In [37]:
merged_cus = merged_cus.drop("customer_id", axis=1)

In [38]:
merged_cus.shape

(125320, 188)

In [39]:
merged_cus.head()

Unnamed: 0,-1,0,1,2,3,4,5,6,7,8,...,section_name_51,section_name_52,section_name_53,section_name_54,section_name_55,index_group_name_0,index_group_name_1,index_group_name_2,index_group_name_3,index_group_name_4
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
3,0,2,0,0,0,1,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [40]:
merged_cus.columns = merged_cus.columns.astype(str)

In [41]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = MinMaxScaler()
merged_cus = scaler.fit_transform(merged_cus)



In [42]:
matrice_sparsa_2 = csr_matrix(merged_cus)
print(matrice_sparsa_2)

  (0, 0)	0.01818181818181818
  (0, 86)	0.05555555555555555
  (0, 108)	0.017241379310344827
  (0, 134)	0.02857142857142857
  (0, 183)	0.015625
  (1, 18)	0.05555555555555555
  (1, 79)	0.017543859649122806
  (1, 108)	0.017241379310344827
  (1, 152)	0.03333333333333333
  (1, 185)	0.019230769230769232
  (2, 0)	0.01818181818181818
  (2, 1)	0.02857142857142857
  (2, 4)	0.043478260869565216
  (2, 5)	0.018518518518518517
  (2, 79)	0.017543859649122806
  (2, 80)	0.047619047619047616
  (2, 112)	0.017241379310344827
  (2, 113)	0.02857142857142857
  (2, 133)	0.09090909090909091
  (2, 136)	0.045454545454545456
  (2, 183)	0.015625
  (2, 184)	0.025
  (3, 1)	0.05714285714285714
  (3, 5)	0.018518518518518517
  (3, 79)	0.03508771929824561
  :	:
  (125317, 184)	0.025
  (125318, 1)	0.08571428571428572
  (125318, 18)	0.05555555555555555
  (125318, 79)	0.03508771929824561
  (125318, 89)	0.08333333333333333
  (125318, 91)	0.07142857142857142
  (125318, 108)	0.017241379310344827
  (125318, 110)	0.0370370370370

### Compute similarity between articles and customer style

In [43]:
similarity_matrix_sparse = cosine_similarity(matrice_sparsa_2,merged_matrix_art, dense_output=False)

In [44]:
import util
%run util.py

In [45]:
topk = get_top_K_values(similarity_matrix_sparse, 2)
print(topk)

  (0, 4986)	0.6621456996210421
  (0, 352)	0.6621456996210421
  (1, 6702)	0.3454587596787848
  (1, 216)	0.3454587596787848
  (2, 20247)	0.5120631336478781
  (2, 13366)	0.5120631336478781
  (3, 775)	0.5338297898251065
  (3, 774)	0.5338297898251065
  (4, 16433)	0.49200435219511407
  (4, 16432)	0.49200435219511407
  (5, 5136)	0.4243128558991026
  (5, 5133)	0.4243128558991026
  (6, 24442)	0.5186223065970406
  (6, 23032)	0.5186223065970406
  (7, 12985)	0.4398829862433496
  (7, 11326)	0.4398829862433496
  (8, 7796)	0.35301834156861867
  (8, 7536)	0.29835495337392276
  (9, 17601)	0.4997305074109137
  (9, 14055)	0.4997305074109137
  (10, 15849)	0.3746784606204112
  (10, 15828)	0.3746784606204112
  (11, 19770)	0.5139824489140248
  (11, 10047)	0.5139824489140248
  (12, 19727)	0.5009685567606442
  :	:
  (125307, 4703)	0.40719036588399404
  (125308, 23994)	0.3788366314038325
  (125308, 7553)	0.3788366314038325
  (125309, 7968)	0.39078886073627117
  (125309, 7958)	0.39078886073627117
  (125310, 2452

In [46]:
toprank = get_top_K_ranks(similarity_matrix_sparse, 3)
print(toprank)

  (0, 352)	1
  (0, 2270)	3
  (0, 4986)	2
  (1, 216)	1
  (1, 6701)	3
  (1, 6702)	2
  (2, 13366)	1
  (2, 18578)	3
  (2, 20247)	2
  (3, 774)	1
  (3, 775)	2
  (3, 14786)	3
  (4, 16431)	3
  (4, 16432)	2
  (4, 16433)	1
  (5, 5129)	3
  (5, 5133)	1
  (5, 5136)	2
  (6, 23032)	2
  (6, 23987)	3
  (6, 24442)	1
  (7, 10370)	3
  (7, 11326)	2
  (7, 12985)	1
  (8, 6875)	3
  :	:
  (125311, 16435)	3
  (125312, 4002)	3
  (125312, 6661)	1
  (125312, 21191)	2
  (125313, 6702)	3
  (125313, 11138)	2
  (125313, 19771)	1
  (125314, 8441)	2
  (125314, 9625)	3
  (125314, 15878)	1
  (125315, 9691)	2
  (125315, 11378)	1
  (125315, 19394)	3
  (125316, 6064)	1
  (125316, 16958)	2
  (125316, 16959)	3
  (125317, 3317)	1
  (125317, 18484)	2
  (125317, 19102)	3
  (125318, 8595)	3
  (125318, 22669)	2
  (125318, 24218)	1
  (125319, 4224)	3
  (125319, 14363)	1
  (125319, 14364)	2


In [47]:
V = toprank.data 
U = toprank.indices  # rows indexes
I = toprank.indptr  # cols indexes

print("Values (V):", V[:3])
print("Rows (U):", U[:3])
print("Columns (I):", I[:3])

Values (V): [1 3 2]
Rows (U): [ 352 2270 4986]
Columns (I): [0 3 6]


In [48]:
## TRAIN

num_elements = len(V)

neg_train = pd.DataFrame(columns=['id', 'index_1', 'index_2', 'index_3'])

# Get index of each element
for i in range(len(I) - 1):
    start_idx = I[i]
    end_idx = I[i + 1]
    
    first_element_indices = U[start_idx:end_idx]
    
    df_to_append = pd.DataFrame({'id': [i], 'index_1': [first_element_indices[0]],
                                 'index_2': [first_element_indices[1]],
                                 'index_3': [first_element_indices[2]]})
    neg_train = pd.concat([neg_train, df_to_append], ignore_index=True)
    #print(f"Primo elemento {i}: {first_element_indices}")

In [50]:
neg_train.to_parquet("../generated_data/train_test/cand_52.parquet", index = False)

In [27]:
neg_train = pd.read_parquet("../generated_data/train_test/cand_52.parquet")

In [28]:
neg_train.head()

Unnamed: 0,id,index_1,index_2,index_3
0,0,352,2270,4986
1,1,216,6701,6702
2,2,13366,18578,20247
3,3,774,775,14786
4,4,16431,16432,16433


In [29]:
neg_train["customer_id"] = index_to_cus_id["customer_id"]
neg_train.head()

Unnamed: 0,id,index_1,index_2,index_3,customer_id
0,0,352,2270,4986,2832662598733435800
1,1,216,6701,6702,9697704321606698314
2,2,13366,18578,20247,7882505936672743154
3,3,774,775,14786,7474721472941889727
4,4,16431,16432,16433,9167082501483725962


In [30]:
index_to_art_id

Unnamed: 0,article_id,original_index
0,662980001,0
1,688873001,1
2,673911002,2
3,676352001,3
4,615141003,4
...,...,...
24839,807832001,24839
24840,767579007,24840
24841,764421001,24841
24842,708394008,24842


In [31]:
neg_train['index_1_id'] = neg_train['index_1'].apply(lambda x: index_to_art_id.loc[x, 'article_id'] if x in index_to_art_id.index else None)
neg_train['index_2_id'] = neg_train['index_2'].apply(lambda x: index_to_art_id.loc[x, 'article_id'] if x in index_to_art_id.index else None)
neg_train['index_3_id'] = neg_train['index_3'].apply(lambda x: index_to_art_id.loc[x, 'article_id'] if x in index_to_art_id.index else None)

In [32]:
neg_train.head()

Unnamed: 0,id,index_1,index_2,index_3,customer_id,index_1_id,index_2_id,index_3_id
0,0,352,2270,4986,2832662598733435800,652355001,443078001,549914007
1,1,216,6701,6702,9697704321606698314,516741004,560389014,547937012
2,2,13366,18578,20247,7882505936672743154,790101003,780297002,834292001
3,3,774,775,14786,7474721472941889727,626263006,626263008,684021021
4,4,16431,16432,16433,9167082501483725962,756428001,765036001,748269009


In [33]:
neg_train_5051 = neg_train[["customer_id", 'index_1_id', 'index_2_id', 'index_3_id']]
neg_train_5051.head()

Unnamed: 0,customer_id,index_1_id,index_2_id,index_3_id
0,2832662598733435800,652355001,443078001,549914007
1,9697704321606698314,516741004,560389014,547937012
2,7882505936672743154,790101003,780297002,834292001
3,7474721472941889727,626263006,626263008,684021021
4,9167082501483725962,756428001,765036001,748269009


In [34]:
melted_df = pd.melt(neg_train_5051, id_vars=['customer_id'], var_name='index_type', value_name='article_id')
melted_df.sort_values(by=['customer_id', 'article_id'], inplace=True)

In [35]:
melted_df = melted_df.drop("index_type", axis=1)
melted_df

Unnamed: 0,customer_id,article_id
122003,77117344919861,215303002
372643,77117344919861,556528011
247323,77117344919861,620907003
223656,200292573348128,403448008
348976,200292573348128,633152006
...,...,...
178686,18446384416408490004,753061001
304006,18446384416408490004,764479003
40053,18446571879212697038,703761001
165373,18446571879212697038,715312003


In [36]:
melted_df["purchased"] = 0

In [37]:
tr_this_fall["purchased"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tr_this_fall["purchased"] = 1


In [38]:
tr_this_fall

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,product_code,prod_name,product_type_no,product_type_name,...,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,fabric_num,fabric_name,purchased
4086,2019-09-05,2832662598733435800,662980001,0.022017,2,50,662980,2110,252,2,...,1,0,15,0,1003,3,8172,"[13, 24]","[knit, wool]",1
10381,2019-09-14,9697704321606698314,688873001,0.033881,1,51,688873,131,258,5,...,1,0,11,7,1010,6,22,[-1],,1
10382,2019-09-16,7882505936672743154,688873001,0.033881,1,51,688873,131,258,5,...,1,0,11,7,1010,6,22,[-1],,1
10621,2019-09-06,7474721472941889727,673911002,0.050831,2,50,673911,11338,94,16,...,1,0,64,20,1020,7,7204,"[3, 16]","[elastic, suede]",1
10622,2019-09-10,9167082501483725962,673911002,0.050831,2,50,673911,11338,94,16,...,1,0,64,20,1020,7,7204,"[3, 16]","[elastic, suede]",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22974363,2019-09-17,16393473652392728316,807832001,0.050831,2,51,807832,30189,94,16,...,1,0,64,20,1020,7,24236,"[16, 17]","[suede, leather]",1
22974607,2019-09-17,16438977891077636889,767579007,0.016932,2,51,767579,1918,254,4,...,3,3,26,16,1002,2,1649,"[4, 0]","[cotton, jersey]",1
22974700,2019-09-17,16438977891077636889,764421001,0.016932,2,51,764421,6665,75,18,...,4,1,45,11,1019,1,8283,[13],[knit],1
22974704,2019-09-17,16701333218391860126,708394008,0.022017,2,51,708394,2828,254,4,...,2,2,53,1,1005,0,2531,[0],[jersey],1


In [39]:
articles_this_fall

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,fabric_num,fabric_name
0,108775015,108775,12855,253,9,0,1010016,0,9,0,...,0,1,0,16,30,1002,2,8834,[0],[jersey]
1,108775044,108775,12855,253,9,0,1010016,0,10,2,...,0,1,0,16,30,1002,2,8834,[0],[jersey]
3,110065001,110065,8159,306,13,4,1010016,0,9,0,...,7,1,0,61,5,1017,4,8243,[1],[microfibre]
4,110065002,110065,8159,306,13,4,1010016,0,10,2,...,7,1,0,61,5,1017,4,8243,[1],[microfibre]
5,110065011,110065,8159,306,13,4,1010016,0,12,5,...,7,1,0,61,5,1017,4,8243,[1],[microfibre]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90483,843525001,843525,25580,295,43,8,1010016,0,9,0,...,4,4,1,77,4,1005,0,22904,"[4, 0, 3]","[cotton, jersey, elastic]"
90484,843528001,843528,25583,252,2,0,1010005,8,7,4,...,4,4,1,77,4,1005,0,22905,[20],[print]
90490,843540001,843540,16535,272,0,1,1010004,7,9,0,...,4,4,1,77,4,1005,0,22910,"[23, 13, 32, 3]","[colour, knit, jacquard, elastic]"
90728,844413001,844413,24966,78,26,3,1010016,0,10,2,...,6,1,0,66,9,1019,1,23125,"[6, 9]","[hair, metal]"


In [40]:
melted_df['week'] = np.random.choice([50, 51], size=len(melted_df))
melted_df['sales_channel_id'] = np.random.choice([1, 2], size=len(melted_df))
melted_df

Unnamed: 0,customer_id,article_id,purchased,week,sales_channel_id
122003,77117344919861,215303002,0,51,1
372643,77117344919861,556528011,0,50,1
247323,77117344919861,620907003,0,51,2
223656,200292573348128,403448008,0,51,2
348976,200292573348128,633152006,0,51,1
...,...,...,...,...,...
178686,18446384416408490004,753061001,0,51,2
304006,18446384416408490004,764479003,0,51,1
40053,18446571879212697038,703761001,0,50,1
165373,18446571879212697038,715312003,0,50,2


In [41]:
avg_prices = tr_this_fall.groupby('article_id')['price'].mean().reset_index()
avg_prices

Unnamed: 0,article_id,price
0,108775015,0.008305
1,108775044,0.008058
2,110065001,0.012695
3,110065002,0.015237
4,110065011,0.022115
...,...,...
24839,843525001,0.032826
24840,843528001,0.028814
24841,843540001,0.025407
24842,844413001,0.025407


In [42]:
melted_df = pd.merge(melted_df, avg_prices, on='article_id', how='left')
melted_df

Unnamed: 0,customer_id,article_id,purchased,week,sales_channel_id,price
0,77117344919861,215303002,0,51,1,0.004559
1,77117344919861,556528011,0,50,1,0.031108
2,77117344919861,620907003,0,51,2,0.002525
3,200292573348128,403448008,0,51,2,0.011551
4,200292573348128,633152006,0,51,1,0.032627
...,...,...,...,...,...,...
375955,18446384416408490004,753061001,0,51,2,0.011979
375956,18446384416408490004,764479003,0,51,1,0.059305
375957,18446571879212697038,703761001,0,50,1,0.042113
375958,18446571879212697038,715312003,0,50,2,0.021922


In [43]:
melted_df_tot = melted_df.merge(articles, on="article_id")
melted_df_tot

Unnamed: 0,customer_id,article_id,purchased,week,sales_channel_id,price,product_code,prod_name,product_type_no,product_type_name,...,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,fabric_num,fabric_name
0,77117344919861,215303002,0,51,1,0.004559,215303,8193,81,34,...,6,1,0,66,9,1019,1,6,[5],[plastic]
1,168902247424827243,215303002,0,51,2,0.004559,215303,8193,81,34,...,6,1,0,66,9,1019,1,6,[5],[plastic]
2,592199186655949049,215303002,0,51,1,0.004559,215303,8193,81,34,...,6,1,0,66,9,1019,1,6,[5],[plastic]
3,927718735912238525,215303002,0,51,1,0.004559,215303,8193,81,34,...,6,1,0,66,9,1019,1,6,[5],[plastic]
4,1295746437716510012,215303002,0,50,2,0.004559,215303,8193,81,34,...,6,1,0,66,9,1019,1,6,[5],[plastic]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375955,18411511384974633039,759602003,0,50,1,0.047162,759602,5872,262,6,...,4,4,1,45,11,1007,9,5091,"[20, 35, 3]","[print, grosgrain, elastic]"
375956,18418372792624779902,563660002,0,50,1,0.016254,563660,20451,254,4,...,0,1,0,18,12,1005,0,21858,[0],[jersey]
375957,18421100931401586284,759581001,0,51,2,0.013542,759581,5028,265,1,...,5,4,1,40,24,1006,15,8588,"[28, 4, 3]","[yoke, cotton, elastic]"
375958,18421100931401586284,781335001,0,50,2,0.042356,781335,6229,295,43,...,5,4,1,44,2,1017,4,7287,"[4, 0]","[cotton, jersey]"


In [44]:
tr_this_fall = tr_this_fall.drop("t_dat", axis=1)

In [45]:
print(melted_df_tot.shape)
print(tr_this_fall.shape)

(375960, 32)
(510166, 32)


In [46]:
data_train = pd.concat([tr_this_fall, melted_df_tot])
data_train.purchased.fillna(0, inplace=True)

In [47]:
data_train.head()

Unnamed: 0,customer_id,article_id,price,sales_channel_id,week,product_code,prod_name,product_type_no,product_type_name,product_group_name,...,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,fabric_num,fabric_name,purchased
4086,2832662598733435800,662980001,0.022017,2,50,662980,2110,252,2,0,...,1,0,15,0,1003,3,8172,"[13, 24]","[knit, wool]",1
10381,9697704321606698314,688873001,0.033881,1,51,688873,131,258,5,0,...,1,0,11,7,1010,6,22,[-1],,1
10382,7882505936672743154,688873001,0.033881,1,51,688873,131,258,5,0,...,1,0,11,7,1010,6,22,[-1],,1
10621,7474721472941889727,673911002,0.050831,2,50,673911,11338,94,16,5,...,1,0,64,20,1020,7,7204,"[3, 16]","[elastic, suede]",1
10622,9167082501483725962,673911002,0.050831,2,50,673911,11338,94,16,5,...,1,0,64,20,1020,7,7204,"[3, 16]","[elastic, suede]",1


In [48]:
data_train.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [49]:
print(data_train.purchased.mean())

0.5457564080721228


In [50]:
data_train = pd.merge(data_train, customers, on='customer_id', how='left')

In [52]:
y = data_train.purchased
X = data_train.drop(columns=['purchased','fabric_name', 'fabric_num']) 
X.shape, y.shape

((827589, 35), (827589,))

In [53]:
X_test_104 = pd.read_parquet("../generated_data/X_test_104.parquet")
Xf_test_104 = pd.read_parquet("../generated_data/Xf_test_104.parquet")
y_test_104 = pd.read_csv("../generated_data/y_test_104.csv")

In [54]:
y_test_104 = y_test_104['purchased'].squeeze()
y_test_104

0         1
1         1
2         1
3         1
4         1
         ..
420651    0
420652    0
420653    0
420654    0
420655    0
Name: purchased, Length: 420656, dtype: int64

### Trying some classification algorithms

In [56]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

In [57]:
from sklearn.linear_model import LogisticRegression  
logreg = LogisticRegression(random_state=42)
logreg.fit(X, y)
y_test_pred = logreg.predict(X_test_104)
print(classification_report(y_test_104, y_test_pred))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00    206928
           1       0.51      1.00      0.67    213728

    accuracy                           0.51    420656
   macro avg       0.25      0.50      0.34    420656
weighted avg       0.26      0.51      0.34    420656



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [58]:
from sklearn.ensemble import AdaBoostClassifier
ada_classifier = AdaBoostClassifier(n_estimators=25, learning_rate=0.1, random_state=42)
ada_classifier.fit(X, y)
y_test_pred = ada_classifier.predict(X_test_104)
print(classification_report(y_test_104, y_test_pred))

              precision    recall  f1-score   support

           0       0.66      0.36      0.46    206928
           1       0.57      0.82      0.67    213728

    accuracy                           0.59    420656
   macro avg       0.62      0.59      0.57    420656
weighted avg       0.62      0.59      0.57    420656



In [59]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=30, max_depth=2, random_state=42)
rf_classifier.fit(X, y)
y_test_pred = rf_classifier.predict(X_test_104)
print(classification_report(y_test_104, y_test_pred))

              precision    recall  f1-score   support

           0       0.68      0.38      0.49    206928
           1       0.58      0.83      0.68    213728

    accuracy                           0.61    420656
   macro avg       0.63      0.60      0.58    420656
weighted avg       0.63      0.61      0.59    420656



In [60]:
from sklearn.ensemble import GradientBoostingClassifier
clf_model = GradientBoostingClassifier(n_estimators=40, min_samples_split=2, random_state=42)
clf_model.fit(X, y)
clf_predictions = clf_model.predict(X_test_104)
print(classification_report(y_test_104, clf_predictions))

              precision    recall  f1-score   support

           0       0.79      0.40      0.54    206928
           1       0.61      0.90      0.72    213728

    accuracy                           0.65    420656
   macro avg       0.70      0.65      0.63    420656
weighted avg       0.70      0.65      0.63    420656



In [61]:
from xgboost import XGBClassifier
bst = XGBClassifier(n_estimators=30, max_depth=3, gamma=2, learning_rate=1, objective='binary:logistic', random_state=42)
bst.fit(X, y)
preds = bst.predict(X_test_104)
print(classification_report(y_test_104, preds))

              precision    recall  f1-score   support

           0       0.84      0.45      0.59    206928
           1       0.63      0.91      0.75    213728

    accuracy                           0.69    420656
   macro avg       0.73      0.68      0.67    420656
weighted avg       0.73      0.69      0.67    420656



### With fabric info

In [63]:
f1 = pd.read_parquet("../generated_data/articles_with_first_5_fabric.parquet")
f1 = f1[["article_id", "cotton", "elastic", "jersey", "knit", "metal"]]
f1.head()

Unnamed: 0,article_id,cotton,elastic,jersey,knit,metal
0,108775015,0,0,1,0,0
1,108775044,0,0,1,0,0
2,108775051,0,0,1,0,0
3,110065001,0,0,0,0,0
4,110065002,0,0,0,0,0


In [66]:
Xf = X.merge(f1, on="article_id")
Xf_test_104 = X_test_104.merge(f1, on="article_id")

In [67]:
from sklearn.linear_model import LogisticRegression  
logreg = LogisticRegression(random_state=42)
logreg.fit(Xf, y)
y_test_pred = logreg.predict(Xf_test_104)
print(classification_report(y_test_104, y_test_pred))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00    206928
           1       0.51      1.00      0.67    213728

    accuracy                           0.51    420656
   macro avg       0.25      0.50      0.34    420656
weighted avg       0.26      0.51      0.34    420656



  _warn_prf(average, modifier, msg_start, len(result))


In [68]:
from sklearn.ensemble import AdaBoostClassifier
ada_classifier = AdaBoostClassifier(n_estimators=25, learning_rate=0.1, random_state=42)
ada_classifier.fit(Xf, y)
y_test_pred = ada_classifier.predict(Xf_test_104)
print(classification_report(y_test_104, y_test_pred))

              precision    recall  f1-score   support

           0       0.57      0.91      0.70    206928
           1       0.80      0.34      0.48    213728

    accuracy                           0.62    420656
   macro avg       0.68      0.63      0.59    420656
weighted avg       0.69      0.62      0.59    420656



In [69]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=30, max_depth=2, random_state=42)
rf_classifier.fit(Xf, y)
y_test_pred = rf_classifier.predict(Xf_test_104)
print(classification_report(y_test_104, y_test_pred))

              precision    recall  f1-score   support

           0       0.57      0.91      0.70    206928
           1       0.79      0.32      0.46    213728

    accuracy                           0.61    420656
   macro avg       0.68      0.62      0.58    420656
weighted avg       0.68      0.61      0.58    420656



In [70]:
from sklearn.ensemble import GradientBoostingClassifier
clf_model = GradientBoostingClassifier(n_estimators=40, min_samples_split=2, random_state=42)
clf_model.fit(Xf, y)
clf_predictions = clf_model.predict(Xf_test_104)
print(classification_report(y_test_104, clf_predictions))

              precision    recall  f1-score   support

           0       0.58      0.93      0.72    206928
           1       0.84      0.36      0.50    213728

    accuracy                           0.64    420656
   macro avg       0.71      0.64      0.61    420656
weighted avg       0.71      0.64      0.61    420656



In [71]:
from xgboost import XGBClassifier
bst = XGBClassifier(n_estimators=30, max_depth=3, gamma=2, learning_rate=1, objective='binary:logistic', random_state=42)
bst.fit(Xf, y)
preds = bst.predict(Xf_test_104)
print(classification_report(y_test_104, preds))

              precision    recall  f1-score   support

           0       0.59      0.94      0.73    206928
           1       0.86      0.37      0.52    213728

    accuracy                           0.65    420656
   macro avg       0.73      0.65      0.62    420656
weighted avg       0.73      0.65      0.62    420656

