In [2]:
import pandas as pd
import numpy as np


# Sizes of the synthetic tables generated below.
num_transactions = 50_000
num_customers = 5_000
num_products = 200

# Seed NumPy's global generator so that "Restart Kernel -> Run All" rebuilds
# exactly the same dataset; without a seed every run yields different numbers
# in all cells that follow.
SEED = 42
np.random.seed(SEED)


# One row per customer, ids 1..num_customers.
customers = pd.DataFrame({
    'customer_id': range(1, num_customers + 1),
    # randint's upper bound is exclusive, so ages fall in 18..69.
    'age': np.random.randint(18, 70, num_customers),
    'gender': np.random.choice(['Male', 'Female'], num_customers, p=[0.45, 0.55]),
    # Cities are drawn with the given weights (they sum to 1).
    'city': np.random.choice(['Warszawa', 'Kraków', 'Wrocław', 'Poznań', 'Gdańsk'], num_customers, p=[0.3, 0.2, 0.2, 0.15, 0.15])
})


# One row per product, ids 1..num_products, each assigned a uniformly random category.
categories = ['Napoje', 'Przekąski', 'Alkohol', 'Produkty świeże', 'Nabiał', 'Pieczywo', 'Słodycze', 'Mrożonki']
products = pd.DataFrame({
    'product_id': range(1, num_products + 1),
    'category': np.random.choice(categories, num_products),
    'product_name': [f'Produkt_{i}' for i in range(1, num_products + 1)],
    # Unit price drawn uniformly from [1.5, 30) and rounded to 2 decimals.
    'price': np.round(np.random.uniform(1.5, 30, num_products), 2)
})


# One row per transaction: a single product bought by a single customer.
transactions = pd.DataFrame({
    'transaction_id': range(1, num_transactions + 1),
    'customer_id': np.random.choice(customers['customer_id'], num_transactions),
    'product_id': np.random.choice(products['product_id'], num_transactions),
    # Upper bound exclusive: quantities fall in 1..4.
    'quantity': np.random.randint(1, 5, num_transactions),
    # Purchase day sampled (with replacement) from every calendar day of 2024.
    'purchase_date': pd.to_datetime(np.random.choice(pd.date_range('2024-01-01', '2024-12-31'), num_transactions))
})


# Enrich each transaction with its customer and product attributes.
# validate='many_to_one' makes pandas raise if a key is duplicated on the
# lookup side, which would otherwise silently multiply transaction rows.
merged_df = (
    transactions
    .merge(customers, on='customer_id', validate='many_to_one')
    .merge(products, on='product_id', validate='many_to_one')
)

# Every transaction references an existing customer and product, so the
# inner joins must neither drop nor duplicate rows.
assert len(merged_df) == num_transactions


In [3]:
# Tag each transaction with the calendar month number of its purchase date,
# then count transactions per month, ordered by month.
transactions['month'] = transactions['purchase_date'].dt.month
transactions_per_month = (
    transactions
    .groupby('month')['transaction_id']
    .count()
    .sort_index()
)
transactions_per_month


month
1     4159
2     3895
3     4061
4     4146
5     4224
6     4103
7     4237
8     4372
9     4097
10    4315
11    4134
12    4257
Name: transaction_id, dtype: int64

In [4]:
# Revenue of each transaction row = units bought x unit price.
merged_df['total_sales'] = merged_df['quantity'].mul(merged_df['price'])

# Total revenue per product category, highest first.
sales_per_category = (
    merged_df
    .groupby('category')['total_sales']
    .sum()
    .sort_values(ascending=False)
)
sales_per_category


category
Pieczywo           299175.96
Mrożonki           296469.94
Nabiał             296096.99
Alkohol            257649.75
Przekąski          235940.85
Słodycze           220999.09
Produkty świeże    193931.31
Napoje             136540.37
Name: total_sales, dtype: float64

In [5]:
# Mean of the per-transaction revenue totals (requires the 'total_sales' column).
avg_cart_value = (
    merged_df
    .groupby('transaction_id')['total_sales']
    .sum()
    .mean()
)
avg_cart_value


38.7360852

In [6]:
# Name of the product with the largest total number of units sold.
best_selling_product = (
    merged_df
    .groupby('product_name')['quantity']
    .sum()
    .idxmax()
)
best_selling_product


'Produkt_154'

In [7]:
# Mean number of units per transaction: sum quantities within each
# transaction id, then average those sums.
avg_quantity_per_transaction = (
    merged_df
    .groupby('transaction_id')['quantity']
    .sum()
    .mean()
)
avg_quantity_per_transaction


2.50572

In [8]:
# Total revenue per customer city, highest first (requires 'total_sales').
sales_per_city = (
    merged_df
    .groupby('city')['total_sales']
    .sum()
    .sort_values(ascending=False)
)
sales_per_city


city
Warszawa    581174.48
Wrocław     395055.93
Kraków      377702.45
Poznań      315593.01
Gdańsk      267278.39
Name: total_sales, dtype: float64

In [9]:
# Total revenue split by customer gender (requires 'total_sales').
sales_per_gender = (
    merged_df
    .groupby('gender')['total_sales']
    .sum()
)
sales_per_gender


gender
Female    1060883.36
Male       875920.90
Name: total_sales, dtype: float64

In [10]:
# The three products that appear in the most transaction rows.
# Note: value_counts counts rows, not units, so 'quantity' is not taken into account here.
top_3_products = (
    merged_df['product_name']
    .value_counts()
    .head(3)
)
top_3_products


product_name
Produkt_154    300
Produkt_198    296
Produkt_116    289
Name: count, dtype: int64

In [11]:
# Mean catalogue price per category, highest first. Computed on the products
# table, so every product counts once regardless of how often it was sold.
avg_price_per_category = (
    products
    .groupby('category')['price']
    .mean()
    .sort_values(ascending=False)
)
avg_price_per_category


category
Słodycze           18.363500
Pieczywo           16.380000
Alkohol            15.911154
Mrożonki           15.738000
Przekąski          15.176400
Napoje             14.736667
Nabiał             14.030909
Produkty świeże    14.010455
Name: price, dtype: float64

In [12]:
# Label each row with the English weekday name of its purchase date.
merged_df['day_of_week'] = merged_df['purchase_date'].dt.day_name()

# Number of transaction rows per weekday; value_counts orders the result by
# frequency (most common first), not by calendar order.
transactions_per_day = (
    merged_df['day_of_week']
    .value_counts()
)
transactions_per_day


day_of_week
Monday       7288
Tuesday      7257
Thursday     7175
Friday       7104
Wednesday    7093
Saturday     7087
Sunday       6996
Name: count, dtype: int64