In [6]:
import pandas as pd
import numpy as np
import random
from faker import Faker

# Initialize Faker
fake = Faker()

# Seed every random source this cell draws from so that Restart & Run All
# regenerates the same dataset. Faker keeps its own shared generator, separate
# from the stdlib `random` module, so both have to be seeded.
random_seed = 42
random.seed(random_seed)
Faker.seed(random_seed)

# Constants
num_customers = 689
num_products = 20
num_orders = 1255
categories = ['Electronics', 'Furniture', 'Clothing', 'Toys', 'Books']
states = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware",
          "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana",
          "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana",
          "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina",
          "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota",
          "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"]

# Five synthetic city names per state; an order's city is always drawn from
# the list belonging to its state.
cities = {state: [fake.city() for _ in range(5)] for state in states}

# Each product belongs to exactly one category. Drawing the category per order
# would let the same product_id appear under several categories.
product_category_map = {
    product_id: random.choice(categories)
    for product_id in range(1, num_products + 1)
}

# Date window for the generated orders (loop-invariant, so built once).
date_start = pd.to_datetime("2024-01-01")
date_end = pd.to_datetime("2024-12-31")

# Generate data
data = []

# Shuffled permutation of 1..num_orders: unique order IDs in random row order.
order_ids = random.sample(range(1, num_orders + 1), num_orders)

for order_id in order_ids:
    customer_id = random.randint(1, num_customers)
    product_id = random.randint(1, num_products)
    state = random.choice(states)
    city = random.choice(cities[state])
    date = fake.date_between_dates(date_start=date_start, date_end=date_end)
    category = product_category_map[product_id]
    sales = round(random.uniform(100.0, 1000.0), 2)
    quantity_sold = random.randint(1, 10)

    data.append([customer_id, product_id, order_id, city, state, date, category, sales, quantity_sold])

# Create DataFrame
columns = ["customer_id", "product_id", "order_id", "city", "state", "date", "product_category", "sales", "quantity_sold"]
df = pd.DataFrame(data, columns=columns)

# Sanity checks on the invariants the generator is supposed to guarantee.
assert len(df) == num_orders
assert df["order_id"].is_unique
assert (df.groupby("product_id")["product_category"].nunique() == 1).all()

# Display the DataFrame
print(df.head())

# Save to CSV
# NOTE(review): a later cell in this notebook writes to the same
# 'orders_dataset.csv' path and will overwrite this file.
df.to_csv('orders_dataset.csv', index=False)

   customer_id  product_id  order_id           city         state        date  \
0           81           3        35    Natashafurt      Virginia  2024-05-31   
1          165          17       738      East Joan  North Dakota  2024-06-20   
2          459           5        94  New Kellyport      Michigan  2024-04-26   
3           41          13       667    Jasminestad        Kansas  2024-06-29   
4          581          16       910     Walshburgh   Connecticut  2024-04-23   

  product_category   sales  quantity_sold  
0        Furniture  826.73              3  
1            Books  937.20              6  
2         Clothing  661.12              7  
3            Books  646.19              3  
4        Furniture  789.69              5  


In [10]:
import pandas as pd
import numpy as np
import random

# Set random seeds for reproducibility.
# This cell draws from BOTH the stdlib `random` module (IDs, city, state,
# category, sales, quantity) and `np.random` (dates); seeding only NumPy would
# leave almost every column different on each run.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

# Constants
num_customers = 346
num_orders = 923
num_products = 20
num_categories = 5

# Generate customer IDs (C001, C002, ...)
customer_ids = [f'C{str(i).zfill(3)}' for i in range(1, num_customers + 1)]

# Generate product IDs (P001, P002, ...) and categories
product_ids = [f'P{str(i).zfill(3)}' for i in range(1, num_products + 1)]
categories = ['Electronics', 'Furniture', 'Clothing', 'Toys', 'Books']
assert len(categories) == num_categories, "num_categories is out of sync with the categories list"

# Create a mapping of products to categories so that a given product always
# reports the same category on every order.
product_category_map = {product_id: random.choice(categories) for product_id in product_ids}

# Generate states (sample data)
states = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware",
          "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana",
          "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana",
          "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina",
          "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota",
          "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"]

# Generate cities (sample data)
# NOTE(review): city and state are drawn independently below, so rows can pair
# a city with an unrelated state (e.g. "San Jose" / "Texas"). Confirm whether
# downstream analysis needs geographically consistent pairs.
cities = ["New York", "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose"]

# Generate unique order IDs (O001, O002, ...)
order_ids = [f'O{str(i).zfill(3)}' for i in range(1, num_orders + 1)]

# Every calendar day of 2023; built once instead of on each loop iteration.
order_dates = pd.date_range(start='2023-01-01', end='2023-12-31')

# Generate order data
data = []
for i in range(num_orders):
    customer_id = random.choice(customer_ids)
    product_id = random.choice(product_ids)
    order_id = order_ids[i]
    city = random.choice(cities)
    state = random.choice(states)
    date = np.random.choice(order_dates)
    category = product_category_map[product_id]
    sales = round(random.uniform(100, 1000), 2)
    quantity_sold = random.randint(1, 10)

    data.append([customer_id, product_id, order_id, city, state, date, category, sales, quantity_sold])

# Create DataFrame
df = pd.DataFrame(data, columns=['customer_id', 'product_id', 'order_id', 'City', 'State', 'Date', 'Product_Category', 'Sales', 'Quantity_Sold'])

# Ensure the Date column is in datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Sanity checks on the invariants the generator is supposed to guarantee.
assert len(df) == num_orders
assert df['order_id'].is_unique

# Display the first few rows of the DataFrame
print(df.head())

# Save to a CSV file
# NOTE(review): this is the same 'orders_dataset.csv' path an earlier cell in
# this notebook writes to, so that earlier file is overwritten here.
df.to_csv('orders_dataset.csv', index=False)

  customer_id product_id order_id      City     State       Date  \
0        C280       P014     O001  San Jose     Texas 2023-04-13   
1        C298       P013     O002  San Jose  Oklahoma 2023-12-15   
2        C196       P019     O003   Phoenix   Vermont 2023-09-28   
3        C168       P011     O004  San Jose  Kentucky 2023-04-17   
4        C057       P004     O005  San Jose   Florida 2023-03-13   

  Product_Category   Sales  Quantity_Sold  
0        Furniture  584.56              4  
1      Electronics  496.31              9  
2             Toys  903.27              2  
3             Toys  981.01              1  
4            Books  750.14              3  
