In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from datetime import datetime, timedelta

In [3]:

# ---------------------------------------------------------------------------
# Synthetic subscription dataset: one row per user.
#
# Every value is drawn from NumPy's legacy global RNG, which is seeded once
# below. The order of the random calls determines the generated values, so
# keep the sections in this order if the data must stay reproducible.
# ---------------------------------------------------------------------------

# --- Configuration: every tunable of the simulation lives here ---
SEED = 42
num_users = 10000
SIGNUP_START = "2022-01-01"  # first possible signup day (inclusive)
SIGNUP_END = "2024-12-31"    # last possible signup day (inclusive)
# The single "today" of the simulation. It caps the last-activity date and is
# the point from which recency is measured, so both always use the same day.
REFERENCE_DATE = pd.Timestamp("2025-03-30")

np.random.seed(SEED)

# Zero-padded sequential identifiers: "00001", "00002", ...
user_ids = [f"{i:05d}" for i in range(1, num_users + 1)]

# Signup dates: sampled (with replacement) from every day in the signup window.
signup_dates = pd.to_datetime(np.random.choice(
    pd.date_range(start=SIGNUP_START, end=SIGNUP_END), num_users))

# Last activity: 30 to 799 days after signup (randint's upper bound is
# exclusive), never later than REFERENCE_DATE. One random draw per user.
last_active_dates = [
    min(signup + timedelta(days=int(np.random.randint(30, 800))), REFERENCE_DATE)
    for signup in signup_dates
]

# Total number of paid months: whole 30-day periods between signup and last
# activity, with a floor of one month.
total_months_paid = [max(1, int((end - start).days // 30))
                     for start, end in zip(signup_dates, last_active_dates)]

# Active flag: 1 when the days since last activity fall below a per-user random
# threshold drawn from [90, 300), so recent users are more likely to be active.
is_active = [1 if (REFERENCE_DATE - date).days < np.random.randint(90, 300) else 0
             for date in last_active_dates]

# Device types and countries, sampled with fixed category probabilities.
device_types = np.random.choice(['mobile', 'desktop', 'tablet'], num_users, p=[0.6, 0.3, 0.1])
countries = np.random.choice(['Poland', 'Germany', 'France', 'USA', 'UK'], num_users,
                             p=[0.4, 0.2, 0.15, 0.15, 0.1])

# Average exercise minutes per day: normal(30, 10), rounded to one decimal and
# clipped to the range [5, 90].
avg_minutes_per_day = np.round(np.random.normal(loc=30, scale=10, size=num_users), 1)
avg_minutes_per_day = np.clip(avg_minutes_per_day, 5, 90)

# Age: normal(35, 10) converted to integers (astype(int) drops the fractional
# part), then clipped to the range [18, 70].
ages = np.random.normal(loc=35, scale=10, size=num_users).astype(int)
ages = np.clip(ages, 18, 70)


df = pd.DataFrame({
    'user_id': user_ids,
    'signup_date': signup_dates,
    'last_active_date': last_active_dates,
    'total_months_paid': total_months_paid,
    'is_active': is_active,
    'device_type': device_types,
    'country': countries,
    'avg_minutes_per_day': avg_minutes_per_day,
    'age': ages
})

# Cheap invariants so a bad edit to the generator fails here, not downstream.
assert len(df) == num_users and df['user_id'].is_unique
assert (df['last_active_date'] <= REFERENCE_DATE).all()
assert (df['signup_date'] < df['last_active_date']).all()



### Analiza Retencji Klientów w Subskrypcji Online

In [4]:
# Preview the first rows of `df` (head() returns five rows by default) as a
# quick sanity check of the columns and value formats.
df.head()

Unnamed: 0,user_id,signup_date,last_active_date,total_months_paid,is_active,device_type,country,avg_minutes_per_day,age
0,1,2024-05-10,2025-03-30,10,1,mobile,Germany,44.7,35
1,2,2024-12-31,2025-03-30,2,1,desktop,Poland,28.5,45
2,3,2024-11-10,2025-03-30,4,1,mobile,USA,27.8,50
3,4,2022-05-02,2022-08-06,3,0,mobile,Poland,30.7,33
4,5,2023-04-12,2024-09-15,17,0,desktop,Poland,16.6,43
