In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Seed NumPy's global RNG so every run produces the same synthetic data set.
np.random.seed(420)

Countries = ["PL", "GER", "UK", "USA", "CZ", "IT"]
Product_Category = ["Electronics", "Clothing", "Food"]

# The columns are drawn in the order they are listed here; that order decides
# which random numbers each column receives, so it must not be rearranged.
data = dict(
    Customer_ID=np.arange(100000),
    Country=np.random.choice(Countries, 100000),
    Product_Category=np.random.choice(Product_Category, 100000),
    Total_Spend=np.random.randint(50, 25000, 100000),
    Month=np.random.randint(1, 13, 100000),
    Age=np.random.randint(18, 80, 100000),
)
df = pd.DataFrame(data)

# Make the synthetic spend more realistic: every country gets its own
# (low, high) spend range, with higher ranges for the wealthier countries.
spend_distribution = {
    "USA": (500, 25000),
    "GER": (400, 20000),
    "UK": (300, 15000),
    "PL": (100, 8000),
    "CZ": (50, 6000),
    "IT": (200, 12000),
}


def _draw_spend(country):
    """Draw one random spend amount from the range configured for `country`."""
    low, high = spend_distribution[country]
    return np.random.randint(low, high)


# Overwrites the placeholder Total_Spend drawn above, one draw per row.
df["Total_Spend"] = df["Country"].apply(_draw_spend)

# Customer_ID becomes the country code followed by a 1-based running number
# within that country, zero-padded to at least three digits.
running_number = df.groupby("Country").cumcount() + 1
df["Customer_ID"] = df["Country"] + running_number.astype(str).str.zfill(3)
df.head()

Unnamed: 0,Customer_ID,Country,Product_Category,Total_Spend,Month,Age
0,GER001,GER,Food,18008,3,29
1,IT001,IT,Clothing,10087,10,78
2,UK001,UK,Electronics,12820,9,37
3,UK002,UK,Electronics,1355,2,49
4,IT002,IT,Electronics,579,1,31


In [3]:
def get_segment(amount, low_threshold=2000, high_threshold=10000):
    """Classify a spend amount into a spender segment.

    Parameters
    ----------
    amount : number
        The amount spent.
    low_threshold : number, default 2000
        Amounts strictly below this value are "Low Spender".
    high_threshold : number, default 10000
        Amounts from `low_threshold` up to and including this value are
        "Medium Spender"; anything above is "High Spender".

    Returns
    -------
    str
        "Low Spender", "Medium Spender" or "High Spender".

    Raises
    ------
    ValueError
        If `low_threshold` is greater than `high_threshold`.

    Notes
    -----
    A NaN amount fails both comparisons and is therefore labelled
    "High Spender"; filter missing values out before calling if that matters.
    """
    if low_threshold > high_threshold:
        raise ValueError("low_threshold must not exceed high_threshold")
    if amount < low_threshold:
        return "Low Spender"
    # Reaching this point already implies amount >= low_threshold (or NaN),
    # so only the upper bound needs checking.
    if amount <= high_threshold:
        return "Medium Spender"
    return "High Spender"

# Label every customer with a spender segment derived from Total_Spend.
df["Segment"] = df["Total_Spend"].map(get_segment)
df.head(1)

Unnamed: 0,Customer_ID,Country,Product_Category,Total_Spend,Month,Age,Segment
0,GER001,GER,Food,18008,3,29,High Spender


In [4]:
# Which customer group (Segment) generates the most revenue?

# Total revenue per segment, largest first.
spenders_revenue = (
    df.groupby("Segment")["Total_Spend"]
    .sum()
    .sort_values(ascending=False)
)
top_segment = spenders_revenue.idxmax()
top_segment_revenue = spenders_revenue.max()
print(f"Top spender's segment: {top_segment} with revenue of {top_segment_revenue}")

Top spender's segment: High Spender with revenue of 408619747


In [5]:
# What is the average order value (Total_Spend) in each country?

# Mean spend per country, highest average first.
spend_per_country = (
    df.groupby("Country")["Total_Spend"]
    .mean()
    .sort_values(ascending=False)
)

# One report line per country, in the sorted order.
for country, value in zip(spend_per_country.index, spend_per_country):
    print(f"In {country} avg total spend is: {value}")

In USA avg total spend is: 12715.1772694782
In GER avg total spend is: 10173.27413034304
In UK avg total spend is: 7683.5834582858515
In IT avg total spend is: 6128.495861324376
In PL avg total spend is: 4054.75
In CZ avg total spend is: 3038.790057915058


In [6]:
# Which country and product category generate the most revenue?
# (The data only contains Total_Spend, so "profit" is measured as revenue.)

# Rank countries by summed Total_Spend. The mean spend per customer does not
# measure how much a country sells overall, because it ignores how many
# customers each country has.
revenue_per_country = df.groupby("Country")["Total_Spend"].sum()
print(f"Country with biggest sales: {revenue_per_country.idxmax()}")

# idxmax already returns the label of the largest value, so no sort is needed.
biggest_rev_product = df.groupby("Product_Category")["Total_Spend"].sum().idxmax()
print(f"Product category with biggest revenue: {biggest_rev_product}")


Country with biggest sales: USA
Product category with biggest revenue: Food


In [7]:
# Total revenue per calendar month, largest first.
monthly_sales = (
    df.groupby("Month")["Total_Spend"]
    .sum()
    .sort_values(ascending=False)
)
best_month_revenue = monthly_sales.max()
worst_month_revenue = monthly_sales.min()

# Absolute gap between the strongest and the weakest month.
diff_monthly = best_month_revenue - worst_month_revenue
print(f"Difference between top and lowest revenue in a month: {diff_monthly}")

# The weakest month expressed as a percentage of the strongest month.
pct_lowest_highest = round(worst_month_revenue / best_month_revenue * 100, 2)
print(f"Lowest month's revenue is: {pct_lowest_highest}% of the revenue of the best's month revenue.")
# NOTE(review): this conclusion is a fixed string, not derived from the numbers above.
print("So, it can be said that the months are fairly equal")

Difference between top and lowest revenue in a month: 2873052
Lowest month's revenue is: 95.43% of the revenue of the best's month revenue.
So, it can be said that the months are fairly equal


In [25]:
# Split the distinct ages (ascending) into a lower and an upper half.
# With an odd number of distinct ages the extra one lands in `elders`.
unqiues_ages = sorted(df["Age"].unique())
half = len(unqiues_ages) // 2
youth, elders = unqiues_ages[:half], unqiues_ages[half:]



31

In [27]:
def get_age(age, youth_max_age=48):
    """Assign an age to the "Youth" or "Elder" generation.

    Parameters
    ----------
    age : number
        The customer's age.
    youth_max_age : number, default 48
        Highest age still counted as "Youth". The default of 48 is the top of
        the lower half of the 18-79 age range used in this notebook.

    Returns
    -------
    str
        "Youth" if `age` is at most `youth_max_age`, otherwise "Elder".

    Notes
    -----
    A NaN age fails the comparison and is therefore labelled "Elder".
    """
    return "Youth" if age <= youth_max_age else "Elder"
    
# Label every customer with the generation derived from their age.
df["Generation"] = df["Age"].map(get_age)

In [29]:
# Total spend per generation, largest first.
generation_totals = df.groupby("Generation")["Total_Spend"].sum()
age_comparison_sales = generation_totals.sort_values(ascending=False)
print(age_comparison_sales)
# NOTE(review): this conclusion is a fixed string, not derived from the totals above.
print('Clearly, older half of the spenders - "Elders" - spend more money on shopping.')

Generation
Elder    367659053
Youth    362931639
Name: Total_Spend, dtype: int64

In [32]:
# Mean spend for every individual age, rounded to two decimals.
sales_by_age = df.groupby("Age")["Total_Spend"].mean().round(2)

# One report line per age, in ascending age order (the groupby key order).
for age, spend in zip(sales_by_age.index, sales_by_age):
    print(f"On avarage, people in age of {age} spent {spend} money.")

On avarage, people in age of 18 spent 7157.6 money.
On avarage, people in age of 19 spent 7349.5 money.
On avarage, people in age of 20 spent 7063.15 money.
On avarage, people in age of 21 spent 7144.48 money.
On avarage, people in age of 22 spent 7156.44 money.
On avarage, people in age of 23 spent 7272.3 money.
On avarage, people in age of 24 spent 7650.04 money.
On avarage, people in age of 25 spent 7300.7 money.
On avarage, people in age of 26 spent 7240.49 money.
On avarage, people in age of 27 spent 7523.14 money.
On avarage, people in age of 28 spent 7254.45 money.
On avarage, people in age of 29 spent 7444.89 money.
On avarage, people in age of 30 spent 7321.21 money.
On avarage, people in age of 31 spent 7157.05 money.
On avarage, people in age of 32 spent 7475.53 money.
On avarage, people in age of 33 spent 7358.78 money.
On avarage, people in age of 34 spent 7287.04 money.
On avarage, people in age of 35 spent 7166.14 money.
On avarage, people in age of 36 spent 7332.55 mone