# Adidas Webstore Shoe Data Analysis

In [18]:
import pandas as pd
import numpy as np
import re

## Preprocessing

In [19]:
# Read the two dimension tables and the fact table from CSV files in the
# current working directory. The targets are unpacked in the same order as the
# tuple of paths.
country_dim, shoes_dim, shoes_fact = map(
    pd.read_csv,
    ("country_dim.csv", "shoes_dim.csv", "shoes_fact.csv"),
)

In [20]:
# Keep only the first row for each shoe "id" so the dimension table has one
# row per id before it is joined onto the fact table.
shoes_dim = shoes_dim.drop_duplicates(subset=["id"], keep="first")

In [21]:
# Attach the shoes_dim columns to every fact row with an inner join on "id"
# (fact rows without a matching id are dropped, as before).
# validate="many_to_one" makes pandas raise MergeError when "id" is not unique
# in shoes_dim, so a duplicated dimension row cannot silently multiply fact rows.
shoes_fact = pd.merge(
    shoes_fact,
    shoes_dim,
    on="id",
    how="inner",
    validate="many_to_one",
)

In [22]:
# Attach the country_dim columns to every fact row with an inner join on
# "country_code" (fact rows without a matching code are dropped, as before).
# validate="many_to_one" makes pandas raise MergeError when a country_code
# appears more than once in country_dim; without the check such a duplicate
# would silently duplicate every fact row for that country.
shoes_fact = pd.merge(
    shoes_fact,
    country_dim,
    on="country_code",
    how="inner",
    validate="many_to_one",
)

In [23]:
# Give the "Unnamed: 0" column a meaningful name.
# NOTE(review): presumably this is the row index that was written into one of
# the CSV files without a header — confirm which file it comes from.
shoes_fact = shoes_fact.rename(columns={"Unnamed: 0": "serial_no"})

In [24]:
# Remove the "image_url" column from the working table.
shoes_fact = shoes_fact.drop(columns=["image_url"])

In [25]:
# Lookup from raw category labels to the normalized labels used from here on.
# Series.replace with a dict compares whole cell values, so any label that is
# not a key below is left unchanged.
# NOTE(review): "occer-shoes" looks like "soccer-shoes" with its first letter
# lost somewhere upstream — confirm this exact value occurs in the raw data.
category_mapping = {
    # "us/"-prefixed labels
    "us/soccer-shoes": "football-shoes",
    "us/walking-shoes": "walking-shoes",
    "us/athletic_sneakers": "athletic_sneakers",
    "us/running-shoes": "running-shoes",
    "us/workout-shoes": "gym_training-shoes",
    "us/hiking-shoes": "hiking-shoes",
    "us/tennis-shoes": "tennis-shoes",
    # label without the prefix
    "occer-shoes": "football-shoes",
}

shoes_fact = shoes_fact.assign(
    category=shoes_fact["category"].replace(category_mapping)
)

In [26]:
# Rewrite fractional sizes as decimal text: digits followed by optional
# whitespace and a fraction become "<digits>.<decimal>". The column stays text;
# no numeric conversion happens here.
# Note that 2/3 is written as ".66" (truncated, not rounded to ".67").
size_fraction_rules = [
    (r"(\d+)\s*1/3", r"\1.33"),
    (r"(\d+)\s*2/3", r"\1.66"),
    (r"(\d+)\s*1/2", r"\1.5"),
]

size_text = shoes_fact["size"]
for fraction_pattern, decimal_form in size_fraction_rules:
    size_text = size_text.str.replace(fraction_pattern, decimal_form, regex=True)
shoes_fact["size"] = size_text

In [27]:
# For rows whose date contains "-", reverse the "-"-separated parts and re-join
# them with "/". Rows without "-" and missing values (na=False) are left as is.
# NOTE(review): presumably this turns "YYYY-MM-DD" into "DD/MM/YYYY" to match
# the remaining rows — confirm against the raw data.
date_text = shoes_fact["date"]
mask = date_text.str.contains("-", na=False)
reordered = date_text[mask].str.split("-").str[::-1].str.join("/")
shoes_fact.loc[mask, "date"] = reordered

In [28]:
# Display the fact table after the merges and clean-up steps above.
shoes_fact

Unnamed: 0,serial_no,id,price,category,size,availability,date,country_code,name,best_for_wear,gender,dominant_color,sub_color1,sub_color2,currency,shoe_metric
0,63575,HP9426,60.0,sneakers,36,0,07/01/2025,DE,Breaknet 2.0 Schuh,City,U,Cloud White,Core Black,Cloud White,euro,eu
1,63576,HP9426,60.0,sneakers,36.66,0,07/01/2025,DE,Breaknet 2.0 Schuh,City,U,Cloud White,Core Black,Cloud White,euro,eu
2,63577,HP9426,60.0,sneakers,37.33,0,07/01/2025,DE,Breaknet 2.0 Schuh,City,U,Cloud White,Core Black,Cloud White,euro,eu
3,63578,HP9426,60.0,sneakers,38,0,07/01/2025,DE,Breaknet 2.0 Schuh,City,U,Cloud White,Core Black,Cloud White,euro,eu
4,63579,HP9426,60.0,sneakers,38.66,1,07/01/2025,DE,Breaknet 2.0 Schuh,City,U,Cloud White,Core Black,Cloud White,euro,eu
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299151,847278,JI4476,160.0,sneakers,47.33,3,16/01/2025,BE,Adizero Boston 12 Laufschuh,Racing,U,Core White,Dark Blue,Pure Ruby,euro,eu
299152,847279,JI4476,160.0,sneakers,48,0,16/01/2025,BE,Adizero Boston 12 Laufschuh,Racing,U,Core White,Dark Blue,Pure Ruby,euro,eu
299153,847280,JI4476,160.0,sneakers,48.66,0,16/01/2025,BE,Adizero Boston 12 Laufschuh,Racing,U,Core White,Dark Blue,Pure Ruby,euro,eu
299154,847281,JI4476,160.0,sneakers,49.33,2,16/01/2025,BE,Adizero Boston 12 Laufschuh,Racing,U,Core White,Dark Blue,Pure Ruby,euro,eu


In [29]:
# Number of rows for each (price, currency) combination, most frequent first.
shoes_fact.value_counts(subset=["price", "currency"])

price  currency
130.0  euro        11790
80.0   euro        11111
140.0  euro         9839
180.0  euro         9579
120.0  euro         8928
                   ...  
26.0   usd            12
21.6   euro           12
31.0   usd            12
35.4   euro           11
38.0   pounds          8
Name: count, Length: 349, dtype: int64

In [30]:
# List the dtype of every column; "size" and "date" are still object (text)
# columns at this point, as the output below shows.
shoes_fact.dtypes

serial_no           int64
id                 object
price             float64
category           object
size               object
availability        int64
date               object
country_code       object
name               object
best_for_wear      object
gender             object
dominant_color     object
sub_color1         object
sub_color2         object
currency           object
shoe_metric        object
dtype: object

In [31]:
# Number of rows for each availability value.
# NOTE(review): in the output below 15 is the largest value and the second most
# frequent, which suggests the source caps availability at 15 — confirm.
shoes_fact["availability"].value_counts()

availability
0     135262
15     91811
1      17135
2       8827
3       6665
4       5382
5       4741
6       4090
7       4081
8       3803
9       3660
10      3490
11      2713
12      2618
13      2507
14      2371
Name: count, dtype: int64

In [32]:
# Number of rows per country code.
shoes_fact["country_code"].value_counts()

country_code
DE    216202
US     53368
BE     26453
UK      3133
Name: count, dtype: int64

In [33]:
# Number of rows per shoe_metric value.
shoes_fact["shoe_metric"].value_counts()

shoe_metric
eu     242655
usa     53368
uk       3133
Name: count, dtype: int64