In [6]:
import pandas as pd
import numpy as np
from datetime import timedelta

In [7]:
def generate_fake_data(real_data: pd.DataFrame, num_days: int) -> pd.DataFrame:
    # Fixed values for traffic_rate
    TRAFFIC_RATE_VALUES = np.array([0.0, 0.1, 0.25, 0.5, 0.75, 1.0])
    
    # Ensure the 'date' column is in datetime format
    real_data["date"] = pd.to_datetime(real_data["date"])

    # Create a date range for the synthetic data
    start_date = real_data["date"].max() + timedelta(days=1)
    end_date = start_date + timedelta(days=num_days - 1)
    date_range = pd.date_range(start=start_date, end=end_date)

    # Initialize lists to hold synthetic data
    synthetic_data = {
        "date": [],
        "general_label": [],
        "traffic_rate": [],
        "total_traffic": [],
    }

    # Get the general label from the real data
    general_label = real_data["general_label"].iloc[0]

    # Calculate mean and standard deviation for traffic_rate and total_traffic
    mean_traffic_rate = real_data["traffic_rate"].mean()
    std_traffic_rate = real_data["traffic_rate"].std()
    mean_total_traffic = real_data["total_traffic"].mean()
    std_total_traffic = real_data["total_traffic"].std()

    # Generate synthetic data
    for date in date_range:
        # Generate traffic rate with some seasonal variation
        seasonal_variation = np.sin(2 * np.pi * (date.dayofyear / 365))
        
        # Generate continuous traffic rate
        continuous_rate = np.clip(
            np.random.normal(mean_traffic_rate + seasonal_variation * 0.1, std_traffic_rate * 0.5),
            0,
            1
        )
        
        # Round to nearest fixed value
        traffic_rate = TRAFFIC_RATE_VALUES[np.abs(TRAFFIC_RATE_VALUES - continuous_rate).argmin()]

        # Generate total traffic based on traffic rate
        if traffic_rate == 0.0:
            total_traffic = 0.0
        else:
            # Generate base traffic value
            base_traffic = np.clip(
                np.random.normal(mean_total_traffic * traffic_rate, std_total_traffic * 0.5),
                1000,  # Minimum value when traffic rate is not 0
                None
            )
            # Round to nearest 10000
            total_traffic = round(base_traffic / 10000) * 10000

        # Append the generated data
        synthetic_data["date"].append(date)
        synthetic_data["general_label"].append(general_label)
        synthetic_data["traffic_rate"].append(traffic_rate)
        synthetic_data["total_traffic"].append(total_traffic)

    # Create a DataFrame from the synthetic data
    synthetic_df = pd.DataFrame(synthetic_data)

    return synthetic_df

In [8]:
path = f"../Classification/output/regions/africa/genral_labeled_data_with_relative_traffic_rates/Entertainment/real_data/Kenya_with_relative_traffic_rates.csv"
df = pd.read_csv(path)

fake_data = generate_fake_data(df, 10)

print("testing generate_fake_data: ", fake_data)

testing generate_fake_data:          date  general_label  traffic_rate  total_traffic
0 2017-05-05  Entertainment          0.25              0
1 2017-05-06  Entertainment          0.10              0
2 2017-05-07  Entertainment          0.10              0
3 2017-05-08  Entertainment          0.25          10000
4 2017-05-09  Entertainment          0.10              0
5 2017-05-10  Entertainment          0.25              0
6 2017-05-11  Entertainment          0.10          10000
7 2017-05-12  Entertainment          0.10              0
8 2017-05-13  Entertainment          0.10          10000
9 2017-05-14  Entertainment          0.10              0
