<a href="https://colab.research.google.com/github/LEROYNORONHA/RetailAnalysis/blob/main/Datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
import os
from google.colab import drive
import requests
import csv
import random
from datetime import datetime, timedelta
import pandas as pd

In [None]:
# Mount Google Drive only when nothing is mounted at the target path yet
mount_point = '/content/drive'
already_mounted = os.path.ismount(mount_point)
if not already_mounted:
    drive.mount(mount_point)

In [7]:
# Load the raw product table; low_memory=False reads the file in one pass
df = pd.read_csv(
    filepath_or_buffer='amazon_products.csv',
    low_memory=False,
)

In [None]:
# Discard the columns that are not exported, then switch the remaining
# source column names to the project's naming scheme.
df = (
    df
    .drop(columns=['listPrice', 'isBestSeller', 'imgUrl', 'productURL', 'boughtInLastMonth'])
    .rename(columns={
        'asin': 'Product_ID',
        'title': 'Product_Desc',
        'stars': 'Product_Rating',
        'reviews': 'Product_Reviews',
        'price': 'Product_Price',
        'category_id': 'Category_ID',
    })
)

In [None]:
# Keep only the products whose price is not zero
has_price = df['Product_Price'].ne(0)
df = df.loc[has_price]

In [None]:
# Export the cleaned product table without the index column
df.to_csv(path_or_buf='products.csv', index=False)

In [None]:
# Load the raw category table (this rebinds `df`, replacing the product frame)
df = pd.read_csv(
    filepath_or_buffer='amazon_categories.csv',
    low_memory=False,
)

In [None]:
# Switch the category columns to the project's naming scheme
category_column_names = {
    'id': 'Category_ID',
    'category_name': 'Category_Desc',
}
df = df.rename(columns=category_column_names)

In [None]:
# Export the renamed category table without the index column
df.to_csv(path_or_buf='categories.csv', index=False)

In [None]:
# Pools used by custom_domain() to build replacement e-mail domains.
# Each pool is written as whitespace-separated text and split into a list.

# Complete mailbox domains, used as-is in 'free' mode
fake_domains = """
    gmail.com yahoo.com hotmail.com aol.com protonmail.com
    mail.com icloud.com outlook.com live.com zoho.com
    gmx.com fastmail.com tutanota.com yandex.com hushmail.com
    inbox.com email.com startmail.com mailfence.com runbox.com
    mail.ru web.de laposte.net bigpond.com netzero.net
    rediffmail.com rocketmail.com msn.com me.com usa.com
    optusnet.com.au btinternet.com shaw.ca verizon.net trashmail.com
    tempmail.com 10minutemail.com tiscali.co.uk orange.fr sympatico.ca
    juno.com bellsouth.net freemail.hu netcourrier.com telus.net
    uk2.net cox.net earthlink.net safe-mail.net mail2world.com
""".split()

# Company name stems, combined with a TLD in 'company' and 'country' modes
companies = """
    techcorp globex dynalabs futurebiz infinisoft
    skyforge quantix zenbyte nexora coretech
    infranix codevio bytecraft datapulse metadash
    infocrest xentrix verivue tekspire cyberflux
    novalink bluepixel graygate bitbridge hypercore
    synpulse netspire mindwave aetherium lumidyn
    zenova orbitex sparkline avionyx axonify
    dexatek uplinx corevise brightleaf intellisys
    miraplex infogenix cortexon cybernova stackbright
    pathwave aegistron voxelworks intellivue alphaqubit
""".split()

# Domain suffixes appended to a company stem in 'country' mode
country_tlds = """
    com co.uk com.au ca co.in co.nz de fr it es
    nl se no fi pl be ch at pt ie
    cz sk ru ua ro bg gr dk hu lt
    lv ee tr hk sg my ph th vn id
    kr jp cn za ng br ar mx cl pe
""".split()

# Email domain customizer
def custom_domain(original_email, mode='mixed'):
    """Return `original_email` with its domain swapped for a generated one.

    The text before the first '@' is kept as the mailbox name.

    Modes:
        'free'    -- a random entry of `fake_domains`
        'company' -- a random entry of `companies` followed by '.com'
        'country' -- a random entry of `companies` followed by a random
                     entry of `country_tlds`
        any other value (including the default 'mixed') -- one of the three
        modes above is chosen at random and applied.
    """
    local_part = original_email.partition('@')[0]

    if mode == 'free':
        return f"{local_part}@{random.choice(fake_domains)}"

    if mode == 'company':
        return f"{local_part}@{random.choice(companies)}.com"

    if mode == 'country':
        # The company stem is drawn before the suffix
        stem = random.choice(companies)
        suffix = random.choice(country_tlds)
        return f"{local_part}@{stem}.{suffix}"

    # Unrecognised mode: draw a concrete mode and start over with it
    picked_mode = random.choice(['free', 'company', 'country'])
    return custom_domain(original_email, mode=picked_mode)

# Fetch & create fake customers
def get_customers(num_customers=10):
    """Fetch random US profiles from randomuser.me and flatten them into rows.

    Args:
        num_customers: Number of profiles requested in a single API call.

    Returns:
        A list with one list per profile, in this field order:
        [first_name, last_name, gender, address, city, state, postcode,
         email, customer_type, dob].
        `gender` is capitalized, `address` is "<street number> <street name>",
        `email` has its domain replaced by custom_domain(..., mode='mixed'),
        `customer_type` is randomly 'Regular' or 'Premium', and `dob` is
        formatted as MM/DD/YYYY. Rows are plain lists so callers can append
        further fields.

    Raises:
        requests.exceptions.HTTPError: the API answered with an error status.
        requests.exceptions.Timeout: the API did not answer in time.
    """
    url = f'https://randomuser.me/api/?results={num_customers}&nat=us'
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=60)
    # Surface HTTP errors here instead of failing later while parsing the body.
    response.raise_for_status()

    customers = []
    for person in response.json()['results']:
        location = person['location']
        street = location['street']

        # The API delivers an ISO timestamp with milliseconds and a 'Z' suffix
        dob = datetime.strptime(
            person['dob']['date'], '%Y-%m-%dT%H:%M:%S.%fZ'
        ).strftime('%m/%d/%Y')
        email = custom_domain(person['email'], mode='mixed')
        customer_type = random.choice(['Regular', 'Premium'])

        customers.append([
            person['name']['first'],
            person['name']['last'],
            person['gender'].capitalize(),
            f"{street['number']} {street['name']}",
            location['city'],
            location['state'],
            location['postcode'],
            email,
            customer_type,
            dob,
        ])
    return customers

# Generate 98,765 customer rows (5000 x 19 + 3765)
# Nineteen batches of 5000 followed by one batch of 3765, fetched in that order
batch_sizes = [5000] * 19 + [3765]
all_customers = []
for batch_size in batch_sizes:
    all_customers.extend(get_customers(batch_size))

In [48]:
# Youngest Customer
# Read the dates of birth straight from the generated rows: at this point `df`
# has no 'Birthday (mm/dd/yyyy)' column, so looking it up raises KeyError on a
# fresh run. Index 9 is the MM/DD/YYYY date of birth in each get_customers() row.
pd.to_datetime([customer[9] for customer in all_customers], format='%m/%d/%Y').max()

Timestamp('2001-05-28 00:00:00')

In [47]:
# Oldest Customer
# Read the dates of birth straight from the generated rows: at this point `df`
# has no 'Birthday (mm/dd/yyyy)' column, so looking it up raises KeyError on a
# fresh run. Index 9 is the MM/DD/YYYY date of birth in each get_customers() row.
pd.to_datetime([customer[9] for customer in all_customers], format='%m/%d/%Y').min()

Timestamp('1944-08-25 00:00:00')

In [63]:
# Bounds of the registration-date window: first day of 2012 to last day of 2022
start_date = datetime(year=2012, month=1, day=1)
end_date = datetime(year=2022, month=12, day=31)

# Pick a random date between two datetimes (both ends included)
def random_date(start, end):
    """Return a random date from `start` to `end` as an 'MM/DD/YYYY' string.

    A whole number of days between 0 and the day span of the two datetimes
    (inclusive) is drawn with random.randint and added to `start`.
    """
    span_in_days = (end - start).days
    offset = timedelta(days=random.randint(0, span_in_days))
    picked = start + offset
    return picked.strftime('%m/%d/%Y')

# Append a random registration date to each customer.
# get_customers() builds rows of 10 fields. Trimming each row back to that
# width first keeps this cell safe to re-run: otherwise every extra run would
# add one more date per row and the 11-column DataFrame build would fail.
for customer in all_customers:
    del customer[10:]  # no-op on the first run
    reg_date = random_date(start_date, end_date)
    customer.append(reg_date)

In [64]:
# Field order of each row exactly as get_customers() builds it, followed by the
# registration date appended afterwards. The labels must follow this order;
# otherwise values land under the wrong headers (e.g. gender under 'DOB' and
# the street address under 'Email', which would also make the e-mail
# de-duplication below compare addresses instead of e-mails).
row_fields = [
    'First_Name', 'Last_Name', 'Gender', 'Address', 'City', 'State',
    'Postcode', 'Email', 'Customer_Type', 'DOB', 'Registration_Date'
]

# Column order of the exported file
columns = [
    'First_Name', 'Last_Name', 'DOB', 'Email', 'Gender', 'Address',
    'City', 'State', 'Postcode', 'Customer_Type', 'Registration_Date'
]

# Convert the list of lists into a DataFrame, then arrange it for export
df = pd.DataFrame(all_customers, columns=row_fields)[columns]

# Remove duplicates based on the Email column
df = df.drop_duplicates(subset='Email', keep='first')

# Save to CSV
df.to_csv('customers.csv', index=False)