# Exploration of Transaction dataset
This notebook contains the first data understanding performed on the transaction dataset, as required for the first assignment

## Import of libraries and dataset

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# import dataset and create a copy to work with
df = pd.read_csv("../data/transactions_train.csv")
df_tran = df.copy()

In [3]:
#visualization of the first rows
df_tran.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [12]:
df_customers = pd.read_csv("../data/customers.csv")

In [13]:
df_customers

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...
...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,,,ACTIVE,NONE,24.0,7aa399f7e669990daba2d92c577b52237380662f36480b...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,,,ACTIVE,NONE,21.0,3f47f1279beb72215f4de557d950e0bfa73789d24acb5e...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,ACTIVE,Regularly,21.0,4563fc79215672cd6a863f2b4bf56b8f898f2d96ed590e...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,ACTIVE,Regularly,18.0,8892c18e9bc3dca6aa4000cb8094fc4b51ee8db2ed14d7...


## General analysis

In [4]:
df_tran.shape

(31788324, 5)

In [5]:
df_tran.isna().any() #no missing values!

t_dat               False
customer_id         False
article_id          False
price               False
sales_channel_id    False
dtype: bool

In [6]:
df_tran.describe()

Unnamed: 0,article_id,price,sales_channel_id
count,31788320.0,31788320.0,31788320.0
mean,696227200.0,0.02782927,1.704028
std,133448000.0,0.01918113,0.4564786
min,108775000.0,1.694915e-05,1.0
25%,632803000.0,0.01581356,1.0
50%,714582000.0,0.02540678,2.0
75%,786524000.0,0.03388136,2.0
max,956217000.0,0.5915254,2.0


## Dealing with specific columns

### Sales channel id / name
1 = store and 2 = online

In [7]:
#create a new column to match sales channel id and name
df_tran["sales_channel_name"] = df_tran["sales_channel_id"].apply(lambda x: "Store" if x == 1 else "Online")

In [8]:
df_tran

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,sales_channel_name
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,Online
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,Online
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,Online
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,Online
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,Online
...,...,...,...,...,...,...
31788319,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,929511001,0.059305,2,Online
31788320,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,891322004,0.042356,2,Online
31788321,2020-09-22,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,918325001,0.043203,1,Store
31788322,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,0.006763,1,Store


 ### Customer Id

In [9]:
len(df_tran["customer_id"][0])

64

In [10]:
# i'll check if they have a common structure
same_length = True

for id in df_tran["customer_id"]:
  if len(id) != len(df_tran["customer_id"][0]):
    same_length = False

if same_length == True:
  print(f"Every id has the same length, equal to {len(id)}")
else:
  print(f"Some ids have a different length")

#they all have the same length
#I dont' think there's a pattern

Every id has the same length, equal to 64


In [14]:
tran_cust = df_tran["customer_id"].unique() in df_customers["customer_id"].unique()
print(f"it is {tran_cust} that every customer recorded in transaction is also recorded in customer")

it is False that every customer recorded in transaction is also recorded in customer


  tran_cust = df_tran["customer_id"].unique() in df_customers["customer_id"].unique()


In [15]:
df_custom = df_tran.groupby("customer_id")
duplicate_customers = df_custom.filter(lambda x: len(x) > 1)
print(f"There are {len(duplicate_customers)} customer who have bought more than once")

There are 31656810 customer who have bought more than once


In [16]:
df_article = df_tran.groupby("article_id")
duplicate_article = df_article.filter(lambda x: len(x) > 1)
print(f"There are {len(duplicate_article)} articles which were bought more than once")

There are 31783833 articles which were bought more than once


### Price

In [17]:
df_tran["price"].max()

0.5915254237288136

In [18]:
df_tran["price"].min()

1.694915254237288e-05

In [None]:
# maybe we should opproximate to 2 or 3 decimals

### Data T_dat

In [19]:
df_tran["t_dat"].min()

'2018-09-20'

In [20]:
df_tran["t_dat"].max()

'2020-09-22'

In [None]:
# maybe we should convert into data type date