# Exploratory Data Analysis (EDA) of the H&M dataset
## Setup
Set up the environment: import the necessary libraries and load the datasets.

In [None]:
#imports
import os

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
# Directory of the dataset, relative to this notebook.
# Built with os.path.join so the path separators are correct on every OS (the
# previous hard-coded '..\\..\\Dataset\\' only resolved on Windows). The empty
# last component keeps the trailing separator, so `data_dir + '<file>.csv'`
# still works for any code that concatenates onto data_dir.
data_dir = os.path.join('..', '..', 'Dataset', '')

# import articles.csv into a pandas dataframe
print("Importing articles dataset...")
articles = pd.read_csv(os.path.join(data_dir, 'articles.csv'))
print("\tDone")
# import customers.csv into a pandas dataframe
print("Importing customers dataset...")
customers = pd.read_csv(os.path.join(data_dir, 'customers.csv'))
print("\tDone")
# import transactions_train.csv into a pandas dataframe
print("Importing transactions_train dataset...")
transactions_train = pd.read_csv(os.path.join(data_dir, 'transactions_train.csv'))
print("\tDone")

Importing articles dataset...
	Done
Importing customers dataset...
	Done
Importing transactions_train dataset...


## Articles
We will start by exploring the articles dataset

In [None]:
# Compare the total number of rows with the number of distinct article ids;
# equal numbers mean that article_id contains no duplicates.
num_articles = articles.shape[0]
# dropna=False counts a missing id as one distinct value, like len(.unique())
num_unq_article_ids = articles['article_id'].nunique(dropna=False)
# report both numbers
print(f'Number of unique article ids: {num_unq_article_ids} \nNumber of articles: {num_articles}')

As we can see, the number of unique article ids is equal to the number of articles. This means that there are no duplicates in the article_id column.

In [None]:
# Data-quality check of colour_group_name: count the articles whose value is
# missing, or is one of the placeholders 'Unknown' / 'Undefined' (compared
# case-insensitively).
colour_group_name_lower = articles['colour_group_name'].str.lower()
num_articles_no_colour_group_name = int(articles['colour_group_name'].isnull().sum())
num_articles_colour_group_name_unknown = int((colour_group_name_lower == 'unknown').sum())
num_articles_colour_group_name_undefined = int((colour_group_name_lower == 'undefined').sum())
# print results
print(f'Number of articles without a colour_group_name: {num_articles_no_colour_group_name}')
print(f'Number of articles where the colour_group_name is \'Unknown\': {num_articles_colour_group_name_unknown}')
print(f'Number of articles where the colour_group_name is \'Undefined\': {num_articles_colour_group_name_undefined}')
# Bar chart of the distribution of colour_group_name with a title, a grid,
# rotated labels and spacing between the bars.
# normalize=True plots each colour group's share of all articles instead of raw
# counts, which is what "in percentages" asks for and what the other
# distribution plots in this notebook do.
articles['colour_group_name'].value_counts(normalize=True).plot(kind='bar', figsize=(15,15), title='Distribution of the colour_group_name column', fontsize = 8, rot = 70, grid = True, width = 0.7)

In this snippet, we see that each article has a colour_group_name. However, there are some articles where the colour_group_name is 'Unknown' or 'Undefined'.
In the plot we see that the most common colour_group_name is 'Black' followed by 'Blue' and 'Grey'.
We will now explore the perceived_colour_master_name column.

In [None]:
# Data-quality check of perceived_colour_master_name:
#   - number of articles where the value is missing
#   - number of articles where the value is 'Unknown' (case-insensitive)
#   - number of articles where the value is 'Undefined' (case-insensitive)
num_articles_no_perceived_colour_master_name = len(articles[articles['perceived_colour_master_name'].isnull()])
num_articles_perceived_colour_master_name_unknown = len(articles[articles['perceived_colour_master_name'].str.lower() == 'unknown'])
num_articles_perceived_colour_master_name_undefined = len(articles[articles['perceived_colour_master_name'].str.lower() == 'undefined'])

# print results
# The first line must report the count computed above for THIS column; it used
# to print the colour_group_name count left over from the previous cell.
print(f'Number of articles without a perceived_colour_master_name: {num_articles_no_perceived_colour_master_name}')
print(f'Number of articles where the perceived_colour_master_name is \'Unknown\': {num_articles_perceived_colour_master_name_unknown}')
print(f'Number of articles where the perceived_colour_master_name is \'Undefined\': {num_articles_perceived_colour_master_name_undefined}')


# Plot the distribution of the perceived_colour_master_name column in a pie chart in percentages, a legend and no labels
articles['perceived_colour_master_name'].value_counts(normalize=True).plot(kind='pie', legend=True, labels=None, autopct='%1.1f%%', figsize=(20,20), title='Distribution of the perceived_colour_master_name column')




In this snippet, we see that each article has a perceived_colour_master_name. However, there are some (≈790) articles where the perceived_colour_master_name is 'Unknown' or 'Undefined'.
In the plot we see that the most common perceived_colour_master_name is 'Black' followed by 'Blue' and 'Grey'.

In [None]:
# Data-quality check of the department columns: missing values, the smallest
# department_no and the number of distinct values in each column.
# (The variable names keep their original 'departement' spelling because other
# cells may refer to them.)
num_articles_no_departement_name = int(articles['department_name'].isnull().sum())
num_articles_no_departement_no = int(articles['department_no'].isnull().sum())
# Series.min() is vectorised and skips NaN; the builtin min() iterates over the
# Series in Python and gives an order-dependent result when NaN is present.
minimum_departement_no = articles['department_no'].min()
# number of unique department_no and department_name
# (dropna=False counts a missing value as one distinct value, like len(.unique()))
num_unq_departement_no = articles['department_no'].nunique(dropna=False)
num_unq_departement_name = articles['department_name'].nunique(dropna=False)

# print results
print(f'Number of articles without a department_name: {num_articles_no_departement_name}')
print(f'Number of articles without a department_no: {num_articles_no_departement_no}')
print(f'Minimum department_no: {minimum_departement_no}')
print(f'Number of unique department_no: {num_unq_departement_no}')
print(f'Number of unique department_name: {num_unq_departement_name}')

As we can see, there are no articles without a department_name or department_no. The minimum department_no is 1201.
There are 299 unique department_no values and 250 unique department_name values, so there is no one-to-one relation between them. As such, it is probably better to use department_no as the column of interest (for location).

In [None]:
# Missing-value check for the two index columns.
num_articles_no_index_code = int(articles['index_code'].isna().sum())
num_articles_no_index_name = int(articles['index_name'].isna().sum())
# report the counts
print(f'Number of articles without an index_code: {num_articles_no_index_code}')
print(f'Number of articles without an index_name: {num_articles_no_index_name}')
# Bar chart of the relative frequency of every index_name
index_name_share = articles['index_name'].value_counts(normalize=True)
index_name_share.plot.bar(figsize=(15, 15), title='Distribution of the index_name column', rot=70, grid=True, width=0.7, fontsize=12)

In this snippet, we see that each article has an index_name. Nothing unexpected here.

In [None]:
# Boolean mask marking the articles whose detail_desc is missing
detail_desc_missing = articles['detail_desc'].isna()
# Number of articles without a detail_desc
num_articles_no_detail_desc = int(detail_desc_missing.sum())
# report the count
print(f'Number of articles without a detail_desc: {num_articles_no_detail_desc}')
# show the rows that have no description
print(articles.loc[detail_desc_missing])

We can see that some (~416) articles do not have a description.
## Customers
We will now explore the customers dataset. We will start by exploring the customer_id column.

In [None]:
# Compare the total number of rows with the number of distinct customer ids;
# equal numbers mean that customer_id contains no duplicates.
num_customers = customers.shape[0]
# dropna=False counts a missing id as one distinct value, like len(.unique())
num_unq_customer_ids = customers['customer_id'].nunique(dropna=False)
# report both numbers
print(f'Number of unique customer ids: {num_unq_customer_ids} \nNumber of customers: {num_customers}')



As we can see, the number of unique customer ids is equal to the number of customers. This means that there are no duplicates in the customer_id column and each row has a unique id.


In [None]:
# Summarise the FN column: value range and number of missing values.
# Series.max() / Series.min() skip NaN values.
max_FN = customers['FN'].max()
min_FN = customers['FN'].min()
# number of FN values that are NaN
num_FN_Nan = customers['FN'].isnull().sum()
# number of FN values that are not NaN
num_FN_not_Nan = len(customers) - num_FN_Nan

# print results
# (the labels previously read "FM", a typo for the FN column)
print(f'Maximum FN: {max_FN} \nMinimum FN: {min_FN}')
print(f'Number of FN values that are Nan: {num_FN_Nan}')
print(f'Number of FN values that are not Nan: {num_FN_not_Nan}')


We can see that about a third of the FN values are NaN, while the other rows have an FN value of 1. Other than that, it is not clear what the FN column represents.

In [None]:
# Summarise the Active column: value range (NaN is skipped by max/min) and
# how many values are missing versus present.
active_column = customers['Active']
max_active = active_column.max()
min_active = active_column.min()
# count of missing Active values
num_active_Nan = active_column.isna().sum()
# everything that is not missing
num_active_not_Nan = customers.shape[0] - num_active_Nan

# report the numbers
print(f'Maximum active: {max_active} \nMinimum active: {min_active}')
print(f'Number of active values that are Nan: {num_active_Nan}')
print(f'Number of active values that are not Nan: {num_active_not_Nan}')


We can see that about a third of the active values are NaN, while the other rows have an active value of 1. Other than that, it is not clear what the active column represents. We will now look at how those two columns relate to each other.

In [None]:
# Compare the Active column for all customers, for the customers whose FN is
# NaN and for the customers whose FN is 1.
# select the rows where FN is NaN
no_fn = customers[customers['FN'].isnull()]
# select the rows where FN is 1
no_fn_1 = customers[customers['FN'] == 1]

# print results
print(f'Number of rows where FN is Nan: {len(no_fn)}')
print(f'Number of rows where FN is 1: {len(no_fn_1)}')
# One row of three pie charts of the Active column; NaN is counted as its own
# category (dropna=False). The figure size is set once here instead of being
# re-applied by every plot call, and `plt` is the explicitly imported
# matplotlib.pyplot (a bare `import matplotlib` does not load the submodule).
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 10))
pie_panels = [
    (customers, 'Distribution of the Active column'),
    (no_fn, 'FN is Nan'),
    (no_fn_1, 'FN = 1'),
]
for pie_ax, (subset, pie_title) in zip(axes, pie_panels):
    subset['Active'].value_counts(normalize=True, dropna=False).plot(kind='pie', legend=True, labels=None, autopct='%1.1f%%', title=pie_title, ax=pie_ax)

We can see that in every row where FN is NaN, Active is also NaN. On the other hand, in about 97% of the rows where FN is 1, Active is also 1. They are probably linked in some way, but their use for our project remains to be seen.



In [None]:
# Mask of the customers whose club_member_status is missing
club_member_status_missing = customers['club_member_status'].isna()
# how many rows are missing / present
num_club_member_status_null = int(club_member_status_missing.sum())
num_club_member_status_not_null = customers.shape[0] - num_club_member_status_null
# report the counts
print(f'Number of rows where the club_member_status is null: {num_club_member_status_null}')
print(f'Number of rows where the club_member_status is not null: {num_club_member_status_not_null}')
# Bar chart of the club_member_status counts on a logarithmic y-axis, with NaN
# kept as its own category
customers['club_member_status'].value_counts(dropna=False).plot.bar(figsize=(10, 10), title='Distribution of the club_member_status column', rot=70, grid=True, width=0.7, fontsize=12, logy=True)

It seems that 6062 rows have a null value for the club_member_status column. But the majority of the customers have either an active member status or are yet to create one. Some customers have left the club.

In [None]:
# Summarise the age column: missing values, range, and a histogram in 5-year bins.
# Number of rows where the age is null
num_age_null = len(customers[customers['age'].isnull()])
# min and max age (Series.min()/max() skip NaN)
min_age = int(customers['age'].min())
max_age = int(customers['age'].max())
# round min_age down to a multiple of 5
min_age_rounded = min_age - (min_age % 5)
# move max_age up to the next multiple of 5 (a max_age that is already a
# multiple of 5 is still moved up by 5)
max_age_rounded = max_age + (5 - max_age % 5)
# range() excludes its stop value, so the stop is max_age_rounded + 1 to make
# max_age_rounded itself the last bin edge. Without it the last edge was
# max_age_rounded - 5 and every age above that edge was silently left out of
# the histogram.
age_bin_edges = range(min_age_rounded, max_age_rounded + 1, 5)

# print results
print(f'Number of rows where the age is null: {num_age_null}')
print(f'Minimum age: {min_age}')
print(f'Maximum age: {max_age}')
# Plot the distribution of the age column, grouping the ages in bins of 5 years, label every 5 years, logarithmic scale
customers['age'].plot(kind='hist', figsize=(10,10), title='Distribution of the age column', rot = 70, grid = True, width = 3, fontsize=12, bins=age_bin_edges, logy=True, xticks=age_bin_edges)

It is noticeable that about 16000 customers have a null value for the age column. Other than that, there are no outliers in the age column.

In [None]:
# Count the customers whose fashion_news_frequency is missing
num_fashion_news_frequency_null = int(customers['fashion_news_frequency'].isna().sum())

# report the count
print(f'Number of customers where the fashion_news_frequency is null: {num_fashion_news_frequency_null}')
# Bar chart of the relative frequency of each fashion_news_frequency value,
# with NaN kept as its own category
(
    customers['fashion_news_frequency']
    .value_counts(normalize=True, dropna=False)
    .plot.bar(figsize=(10, 10), title='Distribution of the fashion_news_frequency column', rot=0, grid=True, width=0.7, fontsize=12)
)

In [None]:
# Look at the relation between fashion_news_frequency and club_member_status / Active.
# Rows where fashion_news_frequency is the literal string 'NONE'. Despite the
# variable name these are not missing values; the name is kept because other
# cells may refer to it.
fashion_news_frequency_null = customers[customers['fashion_news_frequency'] == 'NONE']
# Rows where fashion_news_frequency is either 'Regularly' or 'Monthly'
the_rest = customers[customers['fashion_news_frequency'].isin(['Regularly', 'Monthly'])]
# print results
print(f'Number of customers where the fashion_news_frequency is \'NONE\': {len(fashion_news_frequency_null)}')
print(f'Number of customers where the fashion_news_frequency is \'Regularly\' or \'Monthly\': {len(the_rest)}')
# 2x2 grid of bar charts: the top row shows club_member_status, the bottom row
# shows Active (with NaN kept as its own category); the left column is the
# 'NONE' group, the right column the 'Regularly'/'Monthly' group.
# The figure size is set once here, and `plt` is the explicitly imported
# matplotlib.pyplot (a bare `import matplotlib` does not load the submodule).
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
fashion_news_frequency_null['club_member_status'].value_counts(normalize=True).plot(kind='bar', legend=True, title='fashion_news_frequency is \'NONE\'', ax=axes[0,0])
the_rest['club_member_status'].value_counts(normalize=True).plot(kind='bar', legend=True, title='fashion_news_frequency is \'Regularly\' or \'Monthly\'', ax=axes[0,1])
fashion_news_frequency_null['Active'].value_counts(normalize=True, dropna=False).plot(kind='bar', legend=True, title='\'NONE\' news', ax=axes[1,0])
the_rest['Active'].value_counts(normalize=True, dropna=False).plot(kind='bar', legend=True, title='\'Regularly\' or \'Monthly\' news', ax=axes[1,1])
# tight_layout has to run after everything is drawn so that the padding takes
# the titles and tick labels into account; calling it on the empty figure
# computed the layout before any of that existed.
fig.tight_layout(pad=4.0)



The first two graphs do not really give us any information. However, the last two graphs show that the customers who have a fashion_news_frequency of 'NONE' are more likely to be inactive than the customers who have a fashion_news_frequency of 'Regularly' or 'Monthly'.