In [112]:
import pandas as pd
import numpy as np
from datetime import datetime, time

In [113]:
# Load the existing sales CSV from the working directory into `df`.
# `df` is not referenced again in the cells visible here.
# NOTE(review): a later cell writes 'coffe_sales.csv', which differs from this
# path only by letter case. On a case-insensitive filesystem that write would
# replace this input file — confirm that is intended.
df = pd.read_csv('Coffe_sales.csv')

In [122]:
# Number of synthetic rows to generate; every sampling call below uses it as
# the sample size.
n = 67_162

In [123]:
# Seed NumPy's global RNG so this cell, and the later cells that also draw from
# np.random, regenerate the same synthetic dataset on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Categorical vocabularies used when building the synthetic rows.
cash_types = ["Cash", "Credit Card", "Debit Card", "Mobile Payment"]
coffee_names = ["Espresso", "Latte", "Cappuccino", "Mocha", "Americano", "Macchiato"]

# Relative weight of each coffee (positionally aligned with coffee_names),
# normalized so the values sum to 1.
coffee_weights = [30, 25, 20, 10, 10, 5]
coffee_probs = np.array(coffee_weights) / np.sum(coffee_weights)

# Relative weight of each calendar month (index 0 = January ... 11 = December).
# month_probs is the normalized form; it is not referenced in the cells visible
# here, but is kept in case other cells rely on it.
month_weights = [5, 1, 8, 3, 8, 12, 12, 2, 8, 5, 3, 13]
month_probs = np.array(month_weights) / np.sum(month_weights)

# Every calendar day in the window that sale dates are sampled from.
all_dates = pd.date_range(start="2023-01-01", end="2025-12-31", freq="D")

# Month number (1-12) -> weight lookup, kept for any cell that uses it.
weights_by_month = {i+1: w for i, w in enumerate(month_weights)}

# Give each day the weight of its month with a single array lookup
# (month numbers are 1-based, hence the -1), then normalize to probabilities.
date_weights = np.asarray(month_weights)[np.asarray(all_dates.month) - 1]
date_probs = date_weights / date_weights.sum()

# Draw n dates (with replacement) according to the per-day probabilities.
selected_dates = np.random.choice(all_dates, size=n, p=date_probs)

# Generate random times: one vectorized draw of n whole hours in 7..21
# (upper bound 22 is exclusive). Minutes and seconds are left at zero.
# .tolist() yields plain Python ints for datetime.time.
random_hours = np.random.randint(7, 22, size=n)
times = [time(hour=h) for h in random_hours.tolist()]

In [128]:
# Derive the shared inputs once; several columns below reuse them.
order_hours = [t.hour for t in times]
order_dates = pd.Series(selected_dates)
calendar = order_dates.dt  # datetime accessor shared by all calendar columns

# Bucket each hour: [5, 12) -> Morning, [12, 18) -> Afternoon, otherwise Night.
day_parts = [
    "Morning" if 5 <= hr < 12 else ("Afternoon" if 12 <= hr < 18 else "Night")
    for hr in order_hours
]

# Assemble one row per synthetic sale. The random draws run in key order
# (cash_type, money, coffee_name, City), so the sequence of RNG calls is fixed.
synthetic_data = pd.DataFrame({
    "hour_of_day": order_hours,
    # Payment probabilities align positionally with cash_types.
    "cash_type": np.random.choice(cash_types, n, p=[0.15, 0.45, 0.30, 0.10]),
    # Amount drawn uniformly between 2.5 and 15.0, rounded to 2 decimals.
    "money": np.round(np.random.uniform(2.5, 15.0, n), 2),
    "coffee_name": np.random.choice(coffee_names, n, p=coffee_probs),
    "Time_of_Day": day_parts,
    "Weekday": calendar.day_name(),
    "Month_name": calendar.month_name(),
    # Numeric sort keys matching the two name columns above.
    "Weekdaysort": calendar.weekday,
    "Monthsort": calendar.month,
    "Date": calendar.date,
    "Time": times,
    "week_number": calendar.isocalendar().week,
    "day_of_month": calendar.day,
    # New column added: city of the sale, 53% Sydney / 47% Melbourne.
    "City": np.random.choice(['Sydney', 'Melbourne'], n, p=[0.53, 0.47]),
})

In [129]:
# Print the frame's plain-text repr as a quick sanity check of the generated
# columns (pandas truncates the middle rows and wraps wide frames).
print(synthetic_data)

       hour_of_day       cash_type  money coffee_name Time_of_Day    Weekday  \
0               10      Debit Card   3.61  Cappuccino     Morning  Wednesday   
1               13      Debit Card   6.37       Mocha   Afternoon    Tuesday   
2               18            Cash  13.48    Espresso       Night     Friday   
3               14     Credit Card   9.37   Macchiato   Afternoon  Wednesday   
4                8      Debit Card   9.49       Latte     Morning   Thursday   
...            ...             ...    ...         ...         ...        ...   
67157           12  Mobile Payment  12.29       Mocha   Afternoon     Monday   
67158            8      Debit Card  12.14       Latte     Morning  Wednesday   
67159           12  Mobile Payment   8.50  Cappuccino   Afternoon   Saturday   
67160           14     Credit Card   5.51       Mocha   Afternoon   Thursday   
67161           20      Debit Card  10.04       Latte       Night     Friday   

      Month_name  Weekdaysort  Monthsor

In [134]:
# Write the synthetic dataset to CSV in the working directory, omitting the
# DataFrame index.
# NOTE(review): this path differs from the 'Coffe_sales.csv' file read at the
# top of the notebook only by letter case. On a case-insensitive filesystem
# this write would replace that input file — confirm that is intended.
synthetic_data.to_csv('coffe_sales.csv', index=False)

In [133]:
# Count how many synthetic rows were assigned to each city.
city_distribution = synthetic_data['City'].value_counts()
print("Distribuição por cidade:")
print(city_distribution)

# Compute the percentages from the counted rows themselves rather than the
# global `n`, so the summary stays correct even if `n` was changed after the
# frame was built. .get(..., 0) reports 0% instead of raising KeyError when a
# city has no rows.
total_rows = city_distribution.sum()
sydney_pct = city_distribution.get('Sydney', 0) / total_rows * 100
melbourne_pct = city_distribution.get('Melbourne', 0) / total_rows * 100
print(f"\nProporção: {sydney_pct:.2f}% Sydney, {melbourne_pct:.2f}% Melbourne")

Distribuição por cidade:
Sydney       35561
Melbourne    31601
Name: City, dtype: int64

Proporção: 52.95% Sydney, 47.05% Melbourne
