In [1]:
# Let's import the packages we need.
import random
from datetime import datetime
import csv

# Names of the perfume products.
product_names = ["Aqua", "Belle", "Ciel", "Dusk", "Elixir"]

# Possible social-media sentiment labels for a product.
social_media_sentiments = ["Positive", "Neutral", "Negative"]

# Product categories.
product_category = ["eau de cologne", "eau de toilette", "eau de parfum"]

# The audience each product is made for.
user_types = ["Men", "Women", "Unisex"]

# City -> country of the perfume offices.
# Key order matters: it is paired positionally with sampling weights.
cities = {
    "New York": "USA",
    "London": "UK",
    "Paris": "France",
    "Dubai": "UAE",
    "Tokyo": "Japan",
    "Sydney": "Australia",
}

# Customer age band -> probability of that band occurring in the dataset.
age_ranges = {
    "18-23": 0.05,
    "24-45": 0.6,
    "46-65": 0.35,
}

# Bottle size -> probability of that size occurring in the dataset.
bottle_sizes = {
    "30ml": 0.1,
    "50ml": 0.3,
    "75ml": 0.2,
    "100ml": 0.3,
    "200ml": 0.1,
}

# Bottle size -> (lowest, highest) price a bottle of that size can have.
price_ranges = {
    "30ml": (50, 150),
    "50ml": (70, 200),
    "75ml": (100, 300),
    "100ml": (150, 400),
    "200ml": (200, 600),
}

# Bottle size -> factor the selling price is multiplied by to obtain the
# cost price for that size.
cost_price_factors = {
    "30ml": 0.6,
    "50ml": 0.7,
    "75ml": 0.8,
    "100ml": 0.9,
    "200ml": 1.2,
}

# Let's define a function that picks a random date between the start of one year and the end of another
# and returns it as a string formatted as YYYY/mm/dd.
def generate_random_date(start_year, end_year):
    """Return a random calendar date as a ``'YYYY/mm/dd'`` string.

    The date is drawn uniformly from every day between January 1st of
    ``start_year`` and December 31st of ``end_year``, both inclusive.

    Args:
        start_year: First year of the range (int).
        end_year: Last year of the range (int).

    Returns:
        The chosen date formatted with ``'%Y/%m/%d'``.

    Raises:
        ValueError: If ``end_year`` is earlier than ``start_year`` (the
            sampling range is empty).
    """
    # Sample whole days via proleptic Gregorian ordinals rather than POSIX
    # timestamps: sampling seconds up to midnight of Dec 31 practically never
    # yields the last day, and local-time timestamps are skewed by DST.
    first_day = datetime(start_year, 1, 1).toordinal()
    last_day = datetime(end_year, 12, 31).toordinal()
    random_day = random.randint(first_day, last_day)
    return datetime.fromordinal(random_day).strftime('%Y/%m/%d')

# We want to generate the dataset randomly, using weights (probabilities) to control how frequently each value appears.


def generate_dataset(num_rows):
    """Generate ``num_rows`` synthetic perfume sales records.

    Each record is a list with the following positional fields:
    customer id, sales id, product name, sentiment, category, user type,
    gender, age, price, cost price, quantity, payment method, city, country,
    bottle size, discount, purchase date, rating, oil content percentage.

    Values are sampled with ``random`` from the module-level lookup tables
    (``product_names``, ``cities``, ``bottle_sizes``, ``price_ranges`` ...).
    A few deliberate anomalies are part of the generation: roughly 1% of
    ages are set to 150, and the rating pool contains the values 10 and 12.

    Args:
        num_rows: Number of records to generate.

    Returns:
        A list of ``num_rows`` records (lists); empty when ``num_rows`` is 0.
    """
    # Loop-invariant option/weight lists, built once instead of per row.
    city_options = list(cities)
    size_options = list(bottle_sizes)
    size_weights = list(bottle_sizes.values())
    age_options = list(age_ranges)
    age_weights = list(age_ranges.values())

    dataset = []
    for i in range(num_rows):
        product_name = random.choices(product_names, cum_weights=[2, 4, 6, 8, 10])[0]
        sentiment = random.choices(social_media_sentiments, cum_weights=[7, 8, 10])[0]
        category = random.choices(product_category, cum_weights=[8, 9, 10])[0]
        user_type = random.choices(user_types, cum_weights=[2, 7, 10])[0]

        # Pick the bottle size first and price THAT size, so the price and
        # cost price of a row always agree with the bottle size it records.
        bottle_size = random.choices(size_options, weights=size_weights)[0]
        price = random.uniform(*price_ranges[bottle_size])
        cost_price = price * cost_price_factors[bottle_size]

        quantity = random.randint(1, 10)
        payment_method = random.choices(['Card', 'Cash', 'Bank Transfer'], cum_weights=[7, 9, 10])[0]
        city = random.choices(city_options, cum_weights=[2, 4, 6, 8, 9, 10])[0]
        country = cities[city]
        discount = round(random.uniform(0, 100), 2)
        purchase_date = generate_random_date(2017, 2022)
        customer_id = f"cid{random.randint(1, 50000)}"
        sales_id = f"sid{i + 1}"
        rating = random.choices([1, 2, 3, 4, 5, 10, 12], cum_weights=[7, 8, 9, 10, 11, 12, 13])[0]

        # age_ranges stores per-band probabilities, not cumulative ones, so
        # they must be passed as `weights`; passed as `cum_weights` the last
        # band could never be selected.
        age_band = random.choices(age_options, weights=age_weights)[0]
        if random.random() < 0.01:
            age = 150  # deliberate outlier
        else:
            youngest, oldest = (int(bound) for bound in age_band.split('-'))
            age = random.randint(youngest, oldest)

        gender = 'M' if random.random() < 0.6 else 'F'

        # Oil concentration band depends on the product category.
        if category == 'eau de cologne':
            oil_content_perc = f"{random.randint(2, 4)}%"
        elif category == 'eau de toilette':
            oil_content_perc = f"{random.randint(5, 15)}%"
        else:
            oil_content_perc = f"{random.randint(16, 20)}%"

        dataset.append([
            customer_id,
            sales_id,
            product_name,
            sentiment,
            category,
            user_type,
            gender,
            age,
            price,
            cost_price,
            quantity,
            payment_method,
            city,
            country,
            bottle_size,
            discount,
            purchase_date,
            rating,
            oil_content_perc,
        ])
    return dataset

dataset = generate_dataset(num_rows=200000)

header = ['Customer ID', 'Sales ID', 'Product Name', 'Sentiment', 'Category', 'User Type', 'Gender', 'Age',
          'Price', 'Cost Price', 'Quantity', 'Payment Method', 'City', 'Country', 'Bottle Size', 'Discount',
          'Purchase Date', 'Rating', 'Oil Content']

# Column name -> position in a row. Addressing columns by name keeps the
# corruption rules below aligned with the header; hand-counted indexes had
# drifted (purchase date / rating / discount were hitting the wrong columns).
column_index = {name: position for position, name in enumerate(header)}

# Make the dataset mirror real-world data by injecting missing values,
# outliers and inconsistent formatting on fixed row strides.
for i, row in enumerate(dataset):
    if i % 10 == 0:
        row[column_index['Customer ID']] = ''  # Missing customer ID
    if i % 5 == 0:
        # Re-drawn independently of the rest of the row, so these fields may
        # no longer agree with related columns (e.g. category vs oil content).
        row[column_index['Product Name']] = random.choice(product_names)
        row[column_index['Category']] = random.choice(product_category)
        row[column_index['User Type']] = random.choice(['man', 'woman', 'uni'])  # Inconsistent user type
        row[column_index['Gender']] = random.choice(['m', 'f'])  # Inconsistent gender
    if i % 3 == 0:
        row[column_index['Age']] = 150  # Age outlier
        row[column_index['Price']] = random.uniform(5, 1000)  # Inconsistent price
    if i % 7 == 0:
        row[column_index['Quantity']] = ''  # Missing quantity
        row[column_index['Payment Method']] = random.choice(
            ['Card', 'Cash', 'Bank Transfer', '', 'PayPal'])  # Inconsistent payment method
    if i % 11 == 0:
        row[column_index['Country']] = ''  # Missing country
        row[column_index['Bottle Size']] = random.choice(
            ['30ml', '50ml', '75ml', '100ml', '200ml', ''])  # Inconsistent bottle size
    if i % 13 == 0:
        row[column_index['Purchase Date']] = ''  # Missing purchase date
        row[column_index['Rating']] = random.choice([1, 2, 3, 4, 5, 11, 12])  # Inconsistent rating
    if i % 17 == 0:
        row[column_index['Oil Content']] = ''  # Missing oil content percentage
    if i % 19 == 0:
        # Two-digit-year date strings; applied after the "missing date" rule,
        # so rows matching both strides end up with the malformed date.
        row[column_index['Purchase Date']] = random.choice(['21/05/11', '22/06/12'])

filename = 'messy_dataset.csv'
# newline='' lets the csv module control line endings; the explicit encoding
# keeps the output independent of the platform default.
with open(filename, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(dataset)

print(f"Dataset with {len(dataset)} rows generated and saved to {filename}.")


Dataset with 200000 rows generated and saved to messy_dataset.csv.
