In [2]:
import csv
import random
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# Helper function to generate random data
def random_data(prefix, n):
    """Return ``prefix`` immediately followed by a random integer in 1..n (inclusive)."""
    return f"{prefix}{random.randint(1, n)}"

# Function to generate the CSV data
def generate_csv_data(num_rows, file_name):
    """Write ``num_rows`` rows of random mock data to ``<file_name>.csv``.

    The output is a UTF-8, semicolon-delimited file with a header row.
    Records are ordered by the 'Exit' label (string comparison) and then by
    the 'Date' string, and 'Total Amount' is the two gains minus the loss.
    """
    def labels(prefix):
        # One "<prefix><1..1000>" label per output row.
        return [random_data(prefix, 1000) for _ in range(num_rows)]

    # Categorical columns and member ids.
    exits = labels("Exit")
    ports = labels("Port")
    groups = labels("UserGroup")
    countries = labels("Country")
    member_ids = [random.randint(1, 10000) for _ in range(num_rows)]

    # Dates: 2000-01-01 plus a random offset of 0..3650 days, already
    # formatted as ISO strings so they sort chronologically as text.
    first_day = datetime(2000, 1, 1)
    date_strings = [
        (first_day + timedelta(days=random.randint(0, 365*10))).strftime('%Y-%m-%d')
        for _ in range(num_rows)
    ]

    # Numeric columns; the total is computed element-wise by NumPy.
    gains_one = np.random.randint(0, 10000, size=num_rows)
    gains_two = np.random.randint(0, 10000, size=num_rows)
    losses = np.random.randint(0, 1000, size=num_rows)
    totals = gains_one + gains_two - losses

    column_names = [
        'Exit',
        'Port',
        'User Group',
        'Country',
        'Member ID',
        'Date',
        'Gain Amount One',
        'Gain Amount Two',
        'Loss Amount',
        'Total Amount',
    ]

    # Assemble one list per record by walking all columns in lockstep.
    rows = [
        list(record)
        for record in zip(
            exits, ports, groups, countries, member_ids,
            date_strings, gains_one, gains_two, losses, totals,
        )
    ]

    # Index 0 holds 'Exit' and index 5 holds 'Date'.
    rows.sort(key=lambda record: (record[0], record[5]))

    # newline='' lets the csv module control line endings itself.
    with open(f'{file_name}.csv', 'w', newline='', encoding='utf-8') as out:
        writer = csv.writer(out, delimiter=';')
        writer.writerow(column_names)
        writer.writerows(rows)

    print(f"CSV file with {num_rows} rows generated!")

# Function to generate the XLSX data
def generate_xlsx_data(num_rows, file_name):
    """Write ``num_rows`` rows of random mock data to ``<file_name>.xlsx``.

    Columns written:
      * 'Period'            - date string ``dd.mm.yyyy.``, in ascending order,
                              drawn from 2000-01-01 plus 0..3650 days.
      * 'Exit'              - random label ``Exit<1..1000>``.
      * 'Gain Amount Three' - uniform random value in [100, 10000), truncated
                              to an integer.
      * 'Time'              - timestamp string ``dd.mm.yyyy. HH:MM:SS`` lying
                              between 14 days 23:05:11 and 15 days 23:05:11
                              after the row's period date.

    The sheet is written with ``startrow=2``, i.e. two empty rows precede the
    header row.
    """
    # Random 'Exit' labels, one per row.
    exit2 = [random_data("Exit", 1000) for _ in range(num_rows)]

    # Period dates over a ~10-year window, sorted chronologically. The
    # datetime objects are kept so the 'Time' column can be derived from them
    # directly instead of re-parsing the formatted strings row by row.
    start_date = datetime(2000, 1, 1)
    period_dates = sorted(
        start_date + timedelta(days=random.randint(0, 365*10))
        for _ in range(num_rows)
    )
    periods = [period.strftime('%d.%m.%Y.') for period in period_dates]

    # Every period date is at midnight, so adding this offset is the same as
    # moving 14 days ahead and setting the clock to 23:05:11.
    base_offset = timedelta(days=14, hours=23, minutes=5, seconds=11)
    times = []
    for period in period_dates:
        # Up to one extra day of random jitter on top of the base offset.
        jitter = timedelta(seconds=random.randint(0, 86400))
        times.append((period + base_offset + jitter).strftime('%d.%m.%Y. %H:%M:%S'))

    # Random numeric column (floats truncated to ints).
    gain_amount_three = np.random.uniform(100, 10000, size=num_rows).astype(int)

    df_xlsx = pd.DataFrame({
        'Period': periods,
        'Exit': exit2,
        'Gain Amount Three': gain_amount_three,
        'Time': times
    })

    # startrow=2 leaves the first two sheet rows empty; the header lands on
    # the third row.
    with pd.ExcelWriter(f'{file_name}.xlsx', engine='xlsxwriter') as writer:
        df_xlsx.to_excel(writer, index=False, header=True, startrow=2)

    print(f"XLSX file with {num_rows} rows generated!")

# Produce a CSV/XLSX pair for each dataset size, smaller dataset first.
for row_count, base_name in (
    (100000, "Mock_Data_100k_rows"),
    (1000000, "Mock_Data_1M_rows"),
):
    generate_csv_data(row_count, base_name)
    generate_xlsx_data(row_count, base_name)

print("Files have been generated successfully!")

CSV file with 100000 rows generated!
XLSX file with 100000 rows generated!
CSV file with 1000000 rows generated!
XLSX file with 1000000 rows generated!
Files have been generated successfully!
