In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from faker import Faker
import random

# Build a deliberately messy synthetic sales/logistics dataset and write it to
# CSV: inconsistent casing/whitespace in categorical values, two different date
# formats, currency-prefixed revenue strings, blank and negative inventory
# levels, missing delivery statuses and periodic duplicate rows.

SEED = 42              # single seed shared by every random source below
N_ORDERS = 250         # number of distinct orders to generate
DUPLICATE_EVERY = 30   # a duplicate row is injected every this many orders

fake = Faker()

# Three independent random sources are used in the loop: the stdlib `random`
# module (product, region, warehouse, shipping method), NumPy's legacy global
# generator (delays, status, revenue, inventory) and Faker (order dates).
# All three must be seeded or the dataset changes on every run.
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# Category pools; the stray spaces and mixed casing are intentional noise.
products = ['Sensor A', 'Sensor B', ' Sensor C ', 'module x', 'MODULE Y']
regions = ['Berlin', 'berlin', 'MUNICH', 'Warsaw', ' Krakow ']
warehouses = ['Leipzig', 'Poznan', 'Munich']
shipping_methods = ['Standard', 'Express', ' Economy']
statuses = ['Delivered', 'Late', ' Cancelled']

data = []

for i in range(N_ORDERS):
    order_id = f"ORD{1000 + i}"
    product = random.choice(products)
    region = random.choice(regions)
    # The date window is relative to the day the cell is executed, so absolute
    # dates still shift between runs on different days even with a fixed seed.
    order_date = fake.date_between(start_date='-6M', end_date='today')
    ship_delay = np.random.randint(2, 12)  # 2..11 days between order and shipment
    ship_date = order_date + pd.Timedelta(days=ship_delay)

    # Randomly remove some values
    # ~5% of rows get no delivery status at all.
    delivery_status = np.random.choice(statuses + [None], p=[0.7, 0.15, 0.1, 0.05])
    # ~5% of rows get no revenue; ~3% get a blank inventory string. The lower
    # bound of -5 lets a few negative inventory levels through on purpose.
    revenue = round(np.random.uniform(50, 2000), 2) if np.random.rand() > 0.05 else None
    inventory = np.random.randint(-5, 120) if np.random.rand() > 0.03 else ''
    warehouse = random.choice(warehouses)
    shipping_method = random.choice(shipping_methods)

    # Introduce a duplicate every ~30 rows
    if i % DUPLICATE_EVERY == 0:
        if data:
            # Append a copy so the duplicate is an independent record rather
            # than a second reference to the same dict.
            data.append(data[-1].copy())
        else:
            # First iteration: nothing to duplicate yet, so an empty record is
            # appended instead; it becomes an all-NaN row in the DataFrame.
            # NOTE(review): kept to preserve the existing dataset shape -
            # confirm the blank leading row is wanted.
            data.append({})

    data.append({
        'Order ID': order_id,
        'Product': product,
        'Customer Region': region,
        'Order Date': order_date.strftime('%d/%m/%Y'),  # messy date format
        'Ship Date': ship_date.strftime('%Y-%m-%d'),     # another date format
        'Delivery Status': delivery_status,
        'Revenue (€)': f"€{revenue}" if revenue is not None else '',
        'Inventory Level': inventory,
        'Warehouse': warehouse,
        'Shipping Method': shipping_method
    })

# One DataFrame construction from the list of records, then persist the raw
# (uncleaned) data without the positional index.
df = pd.DataFrame(data)
df.to_csv("raw_sales_logistics_data.csv", index=False)

In [2]:
# Preview the first five rows to eyeball the raw formatting problems
# (blank row, mixed date formats, currency-prefixed revenue strings).
df.head()

Unnamed: 0,Order ID,Product,Customer Region,Order Date,Ship Date,Delivery Status,Revenue (€),Inventory Level,Warehouse,Shipping Method
0,,,,,,,,,,
1,ORD1000,Sensor B,Berlin,15/03/2025,2025-03-23,Late,€1570.4,116.0,Leipzig,Express
2,ORD1001,Sensor C,MUNICH,08/03/2025,2025-03-12,Delivered,€700.73,-3.0,Leipzig,Express
3,ORD1002,Sensor A,MUNICH,06/04/2025,2025-04-13,Delivered,€1880.18,,Poznan,Economy
4,ORD1003,Sensor B,Warsaw,14/06/2025,2025-06-20,Delivered,€63.78,,Poznan,Express


In [3]:
# Summarise column dtypes and non-null counts; every column is still stored
# as a generic object at this stage, so nothing has been parsed yet.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 259 entries, 0 to 258
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Order ID         258 non-null    object
 1   Product          258 non-null    object
 2   Customer Region  258 non-null    object
 3   Order Date       258 non-null    object
 4   Ship Date        258 non-null    object
 5   Delivery Status  249 non-null    object
 6   Revenue (€)      258 non-null    object
 7   Inventory Level  258 non-null    object
 8   Warehouse        258 non-null    object
 9   Shipping Method  258 non-null    object
dtypes: object(10)
memory usage: 20.4+ KB
