In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
# Load the four competition tables from the local ./data directory.
# NOTE(review): article_id is parsed as an integer here (9-digit values further
# down), while the ids inside sample_submission's prediction strings carry a
# leading zero -- confirm whether ids must be zero-padded for a submission.
articles = pd.read_csv(
    filepath_or_buffer='./data/articles.csv',
)
customers = pd.read_csv(
    filepath_or_buffer='./data/customers.csv',
)
sample_submission = pd.read_csv(
    filepath_or_buffer='./data/sample_submission.csv',
)
transactions_train = pd.read_csv(
    filepath_or_buffer='./data/transactions_train.csv',
)

## Articles

In [7]:
# Preview the first rows of the articles table to see which columns it carries.
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [24]:
# List every column name (the head() preview elides the middle columns).
# NOTE(review): the saved output includes `product_code_check`, a column that is
# only created by the product-code check cell further down; on a fresh
# top-to-bottom run it will not be listed at this point.
articles.columns

Index(['article_id', 'product_code', 'prod_name', 'product_type_no',
       'product_type_name', 'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc',
       'product_code_check'],
      dtype='object')

In [20]:
# Compare the number of rows with the number of distinct article ids.
n_rows = len(articles)
n_distinct_ids = articles['article_id'].nunique(dropna=False)
print(f"There are {n_rows} articles of which {n_distinct_ids} are unique.")

There are 105542 articles of which 105542 are unique.


In [23]:
# Count missing values per column.
# NOTE(review): the saved output lists `product_code_check`, which is added by a
# cell further down; a fresh top-to-bottom run will not show that row here.
articles.isna().sum()

article_id                        0
product_code                      0
prod_name                         0
product_type_no                   0
product_type_name                 0
product_group_name                0
graphical_appearance_no           0
graphical_appearance_name         0
colour_group_code                 0
colour_group_name                 0
perceived_colour_value_id         0
perceived_colour_value_name       0
perceived_colour_master_id        0
perceived_colour_master_name      0
department_no                     0
department_name                   0
index_code                        0
index_name                        0
index_group_no                    0
index_group_name                  0
section_no                        0
section_name                      0
garment_group_no                  0
garment_group_name                0
detail_desc                     416
product_code_check                0
dtype: int64

In [12]:
# Summary statistics for article_id; the column is numeric, so describe()
# reports count, mean, std and quantiles rather than category counts.
articles["article_id"].describe()

count    1.055420e+05
mean     6.984246e+08
std      1.284624e+08
min      1.087750e+08
25%      6.169925e+08
50%      7.022130e+08
75%      7.967030e+08
max      9.594610e+08
Name: article_id, dtype: float64

In [15]:
# Smallest and largest article id, together with the length of each id when
# written out as a string.
article_ids = articles["article_id"]
article_min, article_max = article_ids.min(), article_ids.max()
min_length = len(str(article_min))
max_length = len(str(article_max))

print(f"Article ids from {article_min} to {article_max} with lengths {min_length} to {max_length}")

Article ids from 108775015 to 959461001 with lengths 9 to 9


In [16]:
# Summary statistics for product_code (numeric column), for comparison with the
# article_id statistics.
articles["product_code"].describe()

count    105542.000000
mean     698424.563378
std      128462.384432
min      108775.000000
25%      616992.500000
50%      702213.000000
75%      796703.000000
max      959461.000000
Name: product_code, dtype: float64

In [22]:
# Dropping the last three digits of article_id (integer division by 1000)
# should give the row's product_code; store the row-wise comparison result as a
# new boolean column on `articles` and summarise it.
article_id_prefix = articles['article_id'].floordiv(1000)
articles["product_code_check"] = article_id_prefix.eq(articles['product_code'])
articles["product_code_check"].describe()


count     105542
unique         1
top         True
freq      105542
Name: product_code_check, dtype: object

In [26]:
# Number of distinct values per column; paired code/name columns whose counts
# differ are candidates for the one-to-many checks that follow.
articles.nunique()

article_id                      105542
product_code                     47224
prod_name                        45875
product_type_no                    132
product_type_name                  131
product_group_name                  19
graphical_appearance_no             30
graphical_appearance_name           30
colour_group_code                   50
colour_group_name                   50
perceived_colour_value_id            8
perceived_colour_value_name          8
perceived_colour_master_id          20
perceived_colour_master_name        20
department_no                      299
department_name                    250
index_code                          10
index_name                          10
index_group_no                       5
index_group_name                     5
section_no                          57
section_name                        56
garment_group_no                    21
garment_group_name                  21
detail_desc                      43404
product_code_check       

In [32]:
# Distinct (product_code, prod_name) pairs. A product_code that still occurs
# more than once in this frame is attached to several different product names.
products = articles.loc[:, ["product_code", "prod_name"]].drop_duplicates()
shares_product_code = products.duplicated(subset="product_code", keep=False)
print(products[shares_product_code])

        product_code                     prod_name
0             108775                     Strap top
2             108775                 Strap top (1)
46            146706      Tanktop body white 3PACK
48            146706               3P TANKTOP BODY
70            156610                  Connor pants
...              ...                           ...
105150        933729  HEDDA conscious training cap
105231        935196       PS STAR SILK MIX TIGHTS
105232        935196       PQ STAR SILK MIX TIGHTS
105335        937627            PS CAT CASH BEANIE
105336        937627            PQ CAT CASH BEANIE

[4764 rows x 2 columns]


In [35]:
# Does any product_type_no carry more than one product_type_name?
# NOTE(review): the saved output under this cell lists two rows with *different*
# product_type_no values, which a check on repeated product_type_no cannot
# produce -- the stored output looks stale; re-run to confirm.
product_types = articles.loc[:, ["product_type_no", "product_type_name"]].drop_duplicates()
repeated_type_no = product_types.duplicated(subset="product_type_no", keep=False)
print(product_types[repeated_type_no])

# Does any product_type_no fall under more than one product_group_name?
product_groups = articles.loc[:, ["product_type_no", "product_group_name"]].drop_duplicates()
repeated_group_type_no = product_groups.duplicated(subset="product_type_no", keep=False)
print(product_groups[repeated_group_type_no])

     product_type_no product_type_name
36               532          Umbrella
719               83          Umbrella
Empty DataFrame
Columns: [product_type_no, product_group_name]
Index: []


In [36]:
# Check the type pairs the other way round: names shared by several type numbers.
shares_type_name = product_types.duplicated(subset="product_type_name", keep=False)
print(product_types[shares_type_name])


     product_type_no product_type_name
36               532          Umbrella
719               83          Umbrella


In [46]:
def find_duplicate_pairwise_matches(numcol, namecol, df=None):
    """Return the distinct (numcol, namecol) pairs whose name occurs more than once.

    The frame is first reduced to its unique (numcol, namecol) combinations.
    Any name that still appears on several rows afterwards is paired with more
    than one value of ``numcol``; all of those rows are returned.

    Parameters
    ----------
    numcol : str
        Column holding the numeric id / code.
    namecol : str
        Column holding the human-readable name for that id.
    df : pandas.DataFrame, optional
        Frame to inspect. Defaults to the notebook-level ``articles`` table, so
        existing two-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Two-column frame (numcol, namecol) with the original row labels; empty
        when every name maps to a single id.
    """
    if df is None:
        # Resolved at call time so the default follows the current `articles`.
        df = articles
    pairs = df[[numcol, namecol]].drop_duplicates()
    # keep=False flags every occurrence of a repeated name, not just the later ones.
    return pairs[pairs.duplicated(subset=namecol, keep=False)]

In [47]:
# (id/code column, name column) pairs to check, in reporting order.
id_name_column_pairs = [
    ("graphical_appearance_no", "graphical_appearance_name"),
    ("colour_group_code", "colour_group_name"),
    ("perceived_colour_value_id", "perceived_colour_value_name"),
    ("perceived_colour_master_id", "perceived_colour_master_name"),
    ("department_no", "department_name"),
    ("index_code", "index_name"),
    ("index_group_no", "index_group_name"),
    ("section_no", "section_name"),
    ("garment_group_no", "garment_group_name"),
]

# One result frame per pair, produced by the helper defined in the previous cell.
duplicate_frames = [
    find_duplicate_pairwise_matches(id_column, name_column)
    for id_column, name_column in id_name_column_pairs
]

# Keep each result under its own name so it can be inspected individually.
(
    duplicate_graphical_appearance,
    duplicate_colour_group,
    duplicate_perceived_colour_value,
    duplicate_perceived_colour_master,
    duplicate_department,
    duplicate_index,
    duplicate_index_group,
    duplicate_section,
    duplicate_garment_group,
) = duplicate_frames

# Report the number of returned rows per pair, labelled by the first column.
for frame in duplicate_frames:
    first_column = frame.columns[0]
    print(f"{first_column} has {len(frame)} duplicates")

graphical_appearance_no has 0 duplicates
colour_group_code has 0 duplicates
perceived_colour_value_id has 0 duplicates
perceived_colour_master_id has 0 duplicates
department_no has 71 duplicates
index_code has 0 duplicates
index_group_no has 0 duplicates
section_no has 2 duplicates
garment_group_no has 0 duplicates


In [56]:
# Order the rows by department name so that entries sharing a name sit together.
duplicate_department = duplicate_department.sort_values("department_name")
print(duplicate_department)

       department_no department_name
5475            3510     Accessories
26948           3941     Accessories
9286            7530     Accessories
381             9985     Accessories
438             3209            Bags
...              ...             ...
4344            1710         Trouser
322             5656         Trouser
526             1722         Trouser
481             2031   Woven bottoms
859             1939   Woven bottoms

[71 rows x 2 columns]


In [59]:
# Show the section_name values that are paired with more than one section_no.
print(duplicate_section)

       section_no  section_name
38956           4  Ladies Other
43598          17  Ladies Other


## Customers

In [8]:
# Preview the first rows of the customers table.
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


## Sample Submission

In [9]:
# Preview the expected submission layout: one row per customer_id with a
# space-separated string of article ids in `prediction`.
sample_submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0610776002 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0372860001 0610776002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0372860001 0610776002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0372860001 0610776002 07...


## Transactions 

In [10]:
# Preview the first rows of the training transactions table.
transactions_train.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


## Images