In [9]:
import pandas as pd
import numpy as np
from datetime import datetime, time

In [10]:
df = pd.read_csv('Coffee_sales.csv')

In [11]:
n = 67162

In [35]:
cash_types = ["Cash", "Credit Card", "Debit Card", "Mobile Payment"]
coffee_names = ["Espresso", "Latte", "Cappuccino", "Mocha", "Americano", "Macchiato"]
city = ["Sidney", "Melbourne"]
coffee_weights = [28, 24, 18, 12, 11, 7]  # Pesos mais balanceados
coffee_probs = np.array(coffee_weights) / np.sum(coffee_weights)

month_weights = [6, 4, 7, 5, 9, 11, 13, 6, 8, 7, 5, 12]  # Variação mais suave
month_probs = np.array(month_weights) / np.sum(month_weights)

all_dates = pd.date_range(start="2023-01-01", end="2025-09-19", freq="D")
weights_by_month = {i+1: w for i, w in enumerate(month_weights)}
date_weights = np.array([weights_by_month[d.month] for d in all_dates])
date_probs = date_weights / date_weights.sum()

selected_dates = np.random.choice(all_dates, size=n, p=date_probs)

# CORREÇÃO: Garantir que a soma dos segmentos de horas seja exatamente igual a n
n_morning = int(n * 0.4)
n_afternoon = int(n * 0.35)
n_night = n - n_morning - n_afternoon  # Isso garante que a soma seja exatamente n

# Horários com distribuição mais realista (picos manhã/tarde)
hours = np.concatenate([
    np.random.normal(8.5, 1.2, n_morning),    # Manhã
    np.random.normal(14.5, 1.5, n_afternoon), # Tarde
    np.random.normal(19.0, 1.8, n_night)      # Noite
])
hours = np.clip(np.round(hours), 7, 22).astype(int)

selected_dates_pd = pd.to_datetime(selected_dates)

# Determinar a estação do ano para cada data
seasons = []
for date in selected_dates_pd:
    month = date.month
    if month in [12, 1, 2]:  # Verão no hemisfério sul
        seasons.append('summer')
    elif month in [3, 4, 5]:  # Outono
        seasons.append('autumn')
    elif month in [6, 7, 8]:  # Inverno
        seasons.append('winter')
    else:  # Primavera
        seasons.append('spring')

# Obter o dia da semana (0=segunda, 6=domingo)
weekdays = selected_dates_pd.weekday

# Ajustar a geração de horários com base na cidade e dia da semana
hours = np.zeros(n, dtype=int)
for i in range(n):
    city = cities[i]
    weekday = weekdays[i]
    
    if city == 'Sydney':
        # Sydney: pico por volta das 8h, mas ajustado por dia da semana
        if weekday == 0:  # Segunda-feira - mais fraco
            if np.random.random() < 0.35:  # Manhã reduzida
                hour = int(np.random.normal(8.0, 1.2))
            elif np.random.random() < 0.7:  # Tarde reduzida
                hour = int(np.random.normal(14.5, 1.5))
            else:  # Noite
                hour = int(np.random.normal(19.0, 1.8))
        elif weekday == 4:  # Sexta-feira - fraco especialmente de manhã
            if np.random.random() < 0.3:  # Manhã muito reduzida
                hour = int(np.random.normal(8.0, 1.2))
            elif np.random.random() < 0.65:  # Tarde
                hour = int(np.random.normal(14.5, 1.5))
            else:  # Noite
                hour = int(np.random.normal(19.0, 1.8))
        else:
            if np.random.random() < 0.4:  # Manhã
                hour = int(np.random.normal(8.0, 1.2))
            elif np.random.random() < 0.75:  # Tarde
                hour = int(np.random.normal(14.5, 1.5))
            else:  # Noite
                hour = int(np.random.normal(19.0, 1.8))
    else:  # Melbourne
        # Melbourne: pico por volta das 9h30, mas ajustado por dia da semana
        if weekday == 0:  # Segunda-feira - mais fraco
            if np.random.random() < 0.3:  # Manhã reduzida
                hour = int(np.random.normal(9.5, 1.2))
            elif np.random.random() < 0.6:  # Tarde reduzida
                hour = int(np.random.normal(14.5, 1.5))
            else:  # Noite
                hour = int(np.random.normal(19.0, 1.8))
        elif weekday == 4:  # Sexta-feira - fraco de manhã, mas melhora à tarde/noite
            if np.random.random() < 0.25:  # Manhã muito reduzida
                hour = int(np.random.normal(9.5, 1.2))
            elif np.random.random() < 0.6:  # Tarde normal
                hour = int(np.random.normal(14.5, 1.5))
            else:  # Noite aumentada
                hour = int(np.random.normal(19.0, 1.8))
        else:
            if np.random.random() < 0.4:  # Manhã
                hour = int(np.random.normal(9.5, 1.2))
            elif np.random.random() < 0.75:  # Tarde
                hour = int(np.random.normal(14.5, 1.5))
            else:  # Noite
                hour = int(np.random.normal(19.0, 1.8))
    
    # Garantir que o horário esteja dentro do intervalo válido (7-22)
    hours[i] = np.clip(hour, 7, 22)

# Ajustar preços com base na pesquisa
money_values = []
for i in range(n):
    city = cities[i]
    season = seasons[i]
    weekday = weekdays[i]
    hour = hours[i]
    
    # Preço base por cidade
    if city == 'Sydney':
        base_price = 14.50
    else:  # Melbourne
        base_price = 15.15
    
    # Ajustes sazonais
    if city == 'Sydney' and season == 'winter':
        price = base_price * 1.08  # +8% no inverno em Sydney
    elif city == 'Melbourne' and season == 'summer':
        price = base_price * 1.07  # +7% no verão em Melbourne
    elif season == 'autumn':
        price = base_price * 0.98  # -2% no outono
    else:
        price = base_price
    
    # Ajustes por dia da semana
    if weekday == 0:  # Segunda-feira
        price *= 0.95  # -5% nas segundas-feiras
    elif weekday == 4:  # Sexta-feira
        if city == 'Melbourne' and hour >= 16:  # Sexta à tarde/noite em Melbourne
            price *= 1.05  # +5% devido ao movimento de fim de semana
        else:
            price *= 0.97  # -3% nas sextas-feiras em geral
    
    # Adicionar variação aleatória
    price *= np.random.uniform(0.95, 1.05)
    
    money_values.append(round(price, 2))

In [36]:
synthetic_data = pd.DataFrame({
    "hour_of_day": hours,
    "cash_type": np.random.choice(cash_types, n, p=[0.18, 0.42, 0.32, 0.08]),
    "money": money_values,
    "coffee_name": np.random.choice(coffee_names, n, p=coffee_probs),
    "Time_of_Day": ["Morning" if 5 <= h < 12 else
                    "Afternoon" if 12 <= h < 18 else
                    "Night" for h in hours],
    "Weekday": selected_dates_pd.day_name(),
    "Month_name": selected_dates_pd.month_name(),
    "Weekdaysort": selected_dates_pd.weekday,
    "Monthsort": selected_dates_pd.month,
    "Date": selected_dates_pd.date,
    "Time": [time(hour=int(h)) for h in hours],
    "week_number": selected_dates_pd.isocalendar().week,
    "day_of_month": selected_dates_pd.day,
    "City": cities
})

In [37]:
print(synthetic_data)

            hour_of_day    cash_type  money coffee_name Time_of_Day  \
2024-10-08           16  Credit Card  14.11  Cappuccino   Afternoon   
2023-12-22           10         Cash  15.54       Latte     Morning   
2025-03-17            7   Debit Card  13.47    Espresso     Morning   
2024-06-07           20  Credit Card  14.50   Americano       Night   
2025-01-05           17  Credit Card  16.10   Macchiato   Afternoon   
...                 ...          ...    ...         ...         ...   
2024-10-28           14  Credit Card  13.28  Cappuccino   Afternoon   
2024-03-10            7   Debit Card  13.90    Espresso     Morning   
2024-12-25            8   Debit Card  14.81       Latte     Morning   
2024-06-18            9  Credit Card  15.03   Americano     Morning   
2023-09-15            9  Credit Card  13.96  Cappuccino     Morning   

              Weekday Month_name  Weekdaysort  Monthsort        Date  \
2024-10-08    Tuesday    October            1         10  2024-10-08   
202

In [39]:
synthetic_data.to_csv('coffee_sales.csv', index=False)

In [38]:
city_distribution = synthetic_data['City'].value_counts()
print("Distribuição por cidade:")
print(city_distribution)
print(f"\nProporção: {city_distribution['Sydney']/n*100:.2f}% Sydney, {city_distribution['Melbourne']/n*100:.2f}% Melbourne")

Distribuição por cidade:
Melbourne    35064
Sydney       32098
Name: City, dtype: int64

Proporção: 47.79% Sydney, 52.21% Melbourne
