In [1]:
from EDA import *

# Exploratory Data Analysis: Articles
#### Load the dataset

###### First we load the articles dataset. We explicitly read the `article_id` and `product_code` columns as strings, so that leading zeroes are not dropped, which would happen if these values were parsed as integers. We also look at the first 10 rows of the dataset to get a feel for the data.

In [2]:
# Read the identifier columns of articles.csv as strings instead of letting
# them be parsed as numbers (keeps leading zeroes intact)
data_types_articles = {
    'article_id': str,
    'product_code': str,
}
articles = load_dataset("datasets/articles.csv", data_types_articles)

In [3]:
# Preview the first 10 rows of the articles dataset to get a feel for the
# columns and the kind of values they hold
articles.head(10)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
5,110065011,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,12,Light Beige,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
6,111565001,111565,20 den 1p Stockings,304,Underwear Tights,Socks & Tights,1010016,Solid,9,Black,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,"Semi shiny nylon stockings with a wide, reinfo..."
7,111565003,111565,20 den 1p Stockings,302,Socks,Socks & Tights,1010016,Solid,13,Beige,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,"Semi shiny nylon stockings with a wide, reinfo..."
8,111586001,111586,Shape Up 30 den 1p Tights,273,Leggings/Tights,Garment Lower body,1010016,Solid,9,Black,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Tights with built-in support to lift the botto...
9,111593001,111593,Support 40 den 1p Tights,304,Underwear Tights,Socks & Tights,1010016,Solid,9,Black,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,"Semi shiny tights that shape the tummy, thighs..."


#### Verification of the (individual) columns

In [4]:
# Count the missing (NaN) values in each column and print the per-column totals
print(f"Missing values per column:\n{articles.isna().sum()}")

Missing values per column:
article_id                        0
product_code                      0
prod_name                         0
product_type_no                   0
product_type_name                 0
product_group_name                0
graphical_appearance_no           0
graphical_appearance_name         0
colour_group_code                 0
colour_group_name                 0
perceived_colour_value_id         0
perceived_colour_value_name       0
perceived_colour_master_id        0
perceived_colour_master_name      0
department_no                     0
department_name                   0
index_code                        0
index_name                        0
index_group_no                    0
index_group_name                  0
section_no                        0
section_name                      0
garment_group_no                  0
garment_group_name                0
detail_desc                     416
dtype: int64


##### Verify the article_id column
###### We check the article_id column for missing values, unique values and whether each value has the same format or length. Nothing unusual is found.

In [5]:
# Check the article_id column: number of missing ids, whether every id is
# unique, and whether all ids share the same format (results are printed)
verify_article_ids(articles)

There are 0 missing article ids.
article ids contain unique ids: True
All article ids have the same format: True


##### Verify the product_code column
###### We check the product_code column for missing values and whether each value has the same format or length. Nothing unusual is found. Duplicates are not a problem, as the product_code column is not a unique identifier for each row.

In [6]:
# Check for missing values and format of product_code column.
# Uniqueness is not checked here: product_code is not a unique identifier
# for a row, so duplicates are expected.
verify_product_code(articles)

There are 0 missing product codes.
All product codes have the same format: True


##### Verify the prod_name column
###### We check the prod_name column for missing values and value counts. Nothing unusual is found.

In [7]:
# Check the prod_name column: report the number of missing product names and
# print how often each product name occurs
verify_prod_name(articles)

There are 0 missing product names.
Product name value counts:
prod_name
Dragonfly dress                98
Mike tee                       72
Wow printed tee 6.99           70
1pk Fun                        55
TP Paddington Sweater          54
                               ..
W MARCIE DRESS CNY              1
W NAPOLI SKIRT CNY              1
BEANIE JERSEY FLEECED LINED     1
H-string multicolour            1
Lounge dress                    1
Name: count, Length: 45875, dtype: int64


##### Verify the product_type_no column
###### We check the product_type_no column for missing values and value counts. Nothing unusual is found.

In [8]:
# Check the product_type_no column: report the number of missing product type
# numbers and print how often each product type number occurs
verify_product_type_no(articles)

There are 0 missing product type numbers.
Product type number value counts:
product_type_no
272    11169
265    10362
252     9302
255     7904
254     4155
       ...  
525        1
514        1
351        1
349        1
483        1
Name: count, Length: 132, dtype: int64


##### Verify the product_type_name column

In [9]:
# Check the product_type_name column: whether the names are unique, the number
# of missing names, and how often each product type name occurs
verify_product_type_name(articles)


Product type names are unique:False
There are 0 missing product type names.
Product type name value counts:
product_type_name
Trousers         11169
Dress            10362
Sweater           9302
T-shirt           7904
Top               4155
                 ...  
Keychain             1
Headband             1
Cushion              1
Blanket              1
Clothing mist        1
Name: count, Length: 131, dtype: int64


##### Verify the colour_group_name column

In [10]:
# Check the colour_group_name column: report the number of missing colour group
# names and print how often each colour group name occurs
verify_colour_group_name(articles)


There are 0 missing colour group names.
Colour group name value counts:
colour_group_name
Black              22670
Dark Blue          12171
White               9542
Light Pink          5811
Grey                4487
Light Beige         3356
Blue                3308
Red                 3056
Light Blue          3012
Greenish Khaki      2767
Dark Grey           2731
Off White           2726
Beige               2712
Dark Red            2340
Dark Green          2106
Light Grey          2105
Pink                2063
Yellow              1645
Light Orange        1520
Yellowish Brown     1471
Gold                1377
Dark Beige          1084
Light Turquoise     1027
Light Yellow         984
Dark Orange          886
Dark Pink            818
Green                815
Orange               779
Other Pink           750
Silver               709
Light Green          681
Dark Yellow          574
Light Purple         553
Dark Turquoise       473
Turquoise            435
Dark Purple          315
Light Red 

##### Verify the graphical_appearance_name column

In [11]:
# Check the graphical_appearance_name column: report the number of missing
# names and print how often each graphical appearance name occurs
verify_graphical_appearance_name(articles)


There are 0 missing graphical appearance names.
Graphical appearance name value counts:
graphical_appearance_name
Solid                  49747
All over pattern       17165
Melange                 5938
Stripe                  4990
Denim                   4842
Front print             3215
Placement print         3098
Check                   2178
Colour blocking         1830
Lace                    1513
Other structure         1502
Application/3D          1341
Embroidery              1165
Mixed solid/pattern     1132
Glittering/Metallic      958
Jacquard                 830
Sequin                   806
Dot                      681
Treatment                586
Other pattern            515
Contrast                 376
Metallic                 346
Chambray                 322
Slub                     153
Transparent               86
Mesh                      86
Neps                      66
Unknown                   52
Argyle                    15
Hologram                   8
Name: count, dty

##### Check for relationships (unique mappings) between certain columns

In [12]:
# Check how related columns map onto each other. Judging by the printed output,
# this reports whether product_code -> product_type_no and
# product_type_no -> product_type_name are unique mappings, and lists the
# product_type_name values that occur with more than one product_type_no.
check_correlation_between_columns_articles(articles)


Each product_code is uniquely mapped to a product_type_no: False
Each product_type_no is uniquely mapped to a product_type_name: False
Non unique product_type_name values:
      product_type_no product_type_name  count
22                83          Umbrella     26
129              532          Umbrella      3


# Exploratory Data Analysis: Customers

In [13]:
# Column dtype overrides for transactions_train.csv: article_id is kept as a
# string (presumably to preserve leading zeroes, as for articles.csv - confirm)
data_types_transactions = {
    'article_id': str,
}
# NOTE(review): the customers/transactions loads below are currently disabled,
# so data_types_transactions is not used yet in this cell.
# customers_df = load_dataset("datasets/customers.csv")
# transactions_train_df = load_dataset("datasets/transactions_train.csv", data_types_transactions)