<a href="https://colab.research.google.com/github/LEROYNORONHA/RetailAnalysis/blob/main/Datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from google.colab import drive
import requests
import csv
import random
from datetime import datetime, timedelta
import pandas as pd

In [4]:
# Mount Google Drive only when nothing is mounted at the target path yet
mount_point = '/content/drive'
already_mounted = os.path.ismount(mount_point)
if not already_mounted:
    drive.mount(mount_point)

In [None]:
# Load the raw product listing. low_memory=False makes pandas parse the file
# in a single pass instead of in chunks, which avoids mixed-dtype columns.
df = pd.read_csv('amazon_products.csv', low_memory=False)

In [None]:
# Source columns that are removed from the product table
unused_columns = ['listPrice', 'isBestSeller', 'imgUrl', 'productURL', 'boughtInLastMonth']

# Source column name -> output column name
product_column_names = {
    'asin': 'Product_ID',
    'title': 'Product_Desc',
    'stars': 'Product_Rating',
    'reviews': 'Product_Reviews',
    'price': 'Product_Price',
    'category_id': 'Category_ID'
}

# Drop first, then rename, as one chained expression
df = df.drop(columns=unused_columns).rename(columns=product_column_names)

In [None]:
# Keep only the products whose price is not exactly 0
has_price = df['Product_Price'].ne(0)
df = df.loc[has_price]

In [None]:
# Write the cleaned product table; index=False keeps the pandas row index out of the file
df.to_csv('products.csv', index=False)

In [None]:
# Load the raw category table. This rebinds `df`, so the products frame is no
# longer reachable under that name from here on.
df = pd.read_csv('amazon_categories.csv', low_memory=False)

In [None]:
# Source column name -> output column name for the category table
category_column_names = {
    'id': 'Category_ID',
    'category_name': 'Category_Desc'
}
df = df.rename(columns=category_column_names)

In [None]:
# Write the renamed category table; index=False keeps the pandas row index out of the file
df.to_csv('categories.csv', index=False)

In [None]:
# Pools that custom_domain() draws from when it builds a replacement email domain.

# Complete domains, used as-is for mode='free'.
# NOTE(review): despite the name, many of these look like real public mail
# providers, so a generated address could coincide with a real mailbox —
# confirm that is acceptable for this dataset.
fake_domains = [
    'gmail.com', 'yahoo.com', 'hotmail.com', 'aol.com', 'protonmail.com',
    'mail.com', 'icloud.com', 'outlook.com', 'live.com', 'zoho.com',
    'gmx.com', 'fastmail.com', 'tutanota.com', 'yandex.com', 'hushmail.com',
    'inbox.com', 'email.com', 'startmail.com', 'mailfence.com', 'runbox.com',
    'mail.ru', 'web.de', 'laposte.net', 'bigpond.com', 'netzero.net',
    'rediffmail.com', 'rocketmail.com', 'msn.com', 'me.com', 'usa.com',
    'optusnet.com.au', 'btinternet.com', 'shaw.ca', 'verizon.net', 'trashmail.com',
    'tempmail.com', '10minutemail.com', 'tiscali.co.uk', 'orange.fr', 'sympatico.ca',
    'juno.com', 'bellsouth.net', 'freemail.hu', 'netcourrier.com', 'telus.net',
    'uk2.net', 'cox.net', 'earthlink.net', 'safe-mail.net', 'mail2world.com'
]

# Second-level names; combined with '.com' (mode='company') or with an entry
# of country_tlds (mode='country').
companies = [
    'techcorp', 'globex', 'dynalabs', 'futurebiz', 'infinisoft',
    'skyforge', 'quantix', 'zenbyte', 'nexora', 'coretech',
    'infranix', 'codevio', 'bytecraft', 'datapulse', 'metadash',
    'infocrest', 'xentrix', 'verivue', 'tekspire', 'cyberflux',
    'novalink', 'bluepixel', 'graygate', 'bitbridge', 'hypercore',
    'synpulse', 'netspire', 'mindwave', 'aetherium', 'lumidyn',
    'zenova', 'orbitex', 'sparkline', 'avionyx', 'axonify',
    'dexatek', 'uplinx', 'corevise', 'brightleaf', 'intellisys',
    'miraplex', 'infogenix', 'cortexon', 'cybernova', 'stackbright',
    'pathwave', 'aegistron', 'voxelworks', 'intellivue', 'alphaqubit'
]

# Domain suffixes (written without the leading dot) appended to a company name
# for mode='country'. The list also contains the generic 'com'.
country_tlds = [
    'com', 'co.uk', 'com.au', 'ca', 'co.in', 'co.nz', 'de', 'fr', 'it', 'es',
    'nl', 'se', 'no', 'fi', 'pl', 'be', 'ch', 'at', 'pt', 'ie',
    'cz', 'sk', 'ru', 'ua', 'ro', 'bg', 'gr', 'dk', 'hu', 'lt',
    'lv', 'ee', 'tr', 'hk', 'sg', 'my', 'ph', 'th', 'vn', 'id',
    'kr', 'jp', 'cn', 'za', 'ng', 'br', 'ar', 'mx', 'cl', 'pe'
]

# Email domain customizer
def custom_domain(original_email, mode='mixed'):
    """Return ``original_email`` with its domain swapped for a random one.

    Args:
        original_email: Address whose part before the first '@' is kept.
        mode: 'free' picks an entry of ``fake_domains``; 'company' builds
            ``<company>.com``; 'country' builds ``<company>.<tld>``. Any other
            value (including the default 'mixed') picks one of those three
            modes at random.

    Returns:
        The new address as ``<username>@<domain>``.
    """
    if mode not in ('free', 'company', 'country'):
        # Unrecognised mode: choose a concrete one and start over.
        chosen_mode = random.choice(['free', 'company', 'country'])
        return custom_domain(original_email, mode=chosen_mode)

    local_part = original_email.split('@')[0]

    if mode == 'free':
        new_domain = random.choice(fake_domains)
    elif mode == 'company':
        new_domain = f"{random.choice(companies)}.com"
    else:
        # mode == 'country': draw the company first, then the suffix.
        company_name = random.choice(companies)
        suffix = random.choice(country_tlds)
        new_domain = f"{company_name}.{suffix}"

    return f"{local_part}@{new_domain}"

# Fetch & create fake customers
def get_customers(num_customers=10):
    """Fetch random US profiles from randomuser.me and convert them to customer rows.

    Args:
        num_customers: Number of profiles requested in this single API call.

    Returns:
        A list of rows, each laid out as
        ``[first_name, last_name, dob, email, gender, state, customer_type]``,
        where ``dob`` is formatted mm/dd/YYYY, ``email`` has had its domain
        replaced by ``custom_domain`` and ``customer_type`` is 'Regular' or
        'Premium', chosen at random.

    Raises:
        requests.exceptions.Timeout: The API did not answer within the timeout.
        requests.exceptions.HTTPError: The API answered with a 4xx/5xx status.
    """
    url = f'https://randomuser.me/api/?results={num_customers}&nat=us'
    # requests has no default timeout, so without one a stalled connection
    # would block the notebook forever.
    response = requests.get(url, timeout=60)
    # Fail here with the HTTP status instead of later with an opaque
    # JSON/KeyError when the API rejects or throttles the request.
    response.raise_for_status()
    data = response.json()['results']

    customers = []
    for person in data:
        first_name = person['name']['first']
        last_name = person['name']['last']
        # The API date looks like 1990-01-31T12:34:56.789Z; keep the calendar date only.
        dob = datetime.strptime(person['dob']['date'], '%Y-%m-%dT%H:%M:%S.%fZ').strftime('%m/%d/%Y')
        email = custom_domain(person['email'], mode='mixed')
        gender = person['gender'].capitalize()
        state = person['location']['state']
        customer_type = random.choice(['Regular', 'Premium'])

        customers.append([
            first_name, last_name, dob, email, gender, state, customer_type
        ])
    return customers

# Generate 98,765 customer rows: 19 requests of 5000 followed by one of 3765
batch_sizes = [5000] * 19 + [3765]

all_customers = []
for batch_size in batch_sizes:
    all_customers.extend(get_customers(batch_size))

In [None]:
# Youngest Customer
# Read the DOB straight from the generated rows: at this point `df` holds the
# categories table and has no birthday column. Each row is still
# [first, last, dob, ...] here, so DOB is at index 2 (run this before the
# customer ID is inserted at position 0).
pd.to_datetime([customer[2] for customer in all_customers], format='%m/%d/%Y').max()

In [None]:
# Oldest Customer
# Read the DOB straight from the generated rows: at this point `df` holds the
# categories table and has no birthday column. Each row is still
# [first, last, dob, ...] here, so DOB is at index 2 (run this before the
# customer ID is inserted at position 0).
pd.to_datetime([customer[2] for customer in all_customers], format='%m/%d/%Y').min()

In [None]:
# Inclusive bounds of the registration-date window: 1 Jan 2012 .. 31 Dec 2022
start_date = datetime(2012, 1, 1)
end_date = datetime(2022, 12, 31)

# Pick a random calendar date between two datetimes
def random_date(start, end):
    """Return a random date between ``start`` and ``end`` as an mm/dd/YYYY string.

    Both bounds are inclusive: the day offset is drawn uniformly from
    0 .. (end - start).days.
    """
    span_days = (end - start).days
    offset = timedelta(days=random.randint(0, span_days))
    picked = start + offset
    return picked.strftime('%m/%d/%Y')

# Add a random registration date as the last field of every customer row.
# Rows are extended in place, so re-running this cell appends a second date.
for customer in all_customers:
    customer.append(random_date(start_date, end_date))

In [None]:
# Put a sequential ID (starting at 1, zero-padded to 10 characters) at the
# front of every row. Rows are changed in place, so re-running this cell
# prepends a second ID.
for position, customer in enumerate(all_customers, start=1):
    customer.insert(0, str(position).zfill(10))

In [27]:
# all_customers = []

# with open('/content/drive/MyDrive/Colab Notebooks/customers.csv', mode='r', encoding='utf-8') as file:
#     reader = csv.reader(file)
#     next(reader)
#     for row in reader:
#         all_customers.append(row)

In [28]:
# Build the State_Desc -> State_ID lookup from states.csv
# ('utf-8-sig' strips a leading byte-order mark if the file has one)
with open('states.csv', mode='r', encoding='utf-8-sig') as states_file:
    state_id_map = {
        record['State_Desc']: record['State_ID']
        for record in csv.DictReader(states_file)
    }

# The state name sits at index 6 of each row; its ID goes right behind it at
# index 7. Names missing from the lookup get an empty string.
for customer in all_customers:
    customer.insert(7, state_id_map.get(customer[6], ''))

In [30]:
# Group the City_IDs from cities.csv by their State_ID
city_map = {}
with open('cities.csv', mode='r', encoding='utf-8-sig') as cities_file:
    for record in csv.DictReader(cities_file):
        city_map.setdefault(record['State_ID'], []).append(record['City_ID'])

updated_customers = []

for customer in all_customers:
    # Index 7 holds the State_ID; index 6 holds the state name, which is
    # overwritten by a random City_ID of that state (None when the state
    # has no known cities).
    candidate_cities = city_map.get(customer[7], [])
    customer[6] = random.choice(candidate_cities) if candidate_cities else None
    updated_customers.append(customer)

all_customers = updated_customers

In [31]:
# Spot-check one assembled row (index 5, i.e. the sixth customer)
all_customers[5]

['0000000006',
 'Sara',
 'Miller',
 '05/15/1995',
 'sara.miller@orbitex.it',
 'Female',
 '0066',
 '03',
 'Regular',
 '10/24/2015']

In [32]:
# Column names, in the same order as the fields of each customer row
columns = [
    'Customer_ID',
    'First_Name',
    'Last_Name',
    'DOB',
    'Email',
    'Gender',
    'City_ID',
    'State_ID',
    'Customer_Type',
    'Registration_Date',
]

# Build the frame from the row lists and keep only the first row seen for
# each email address
df = (
    pd.DataFrame(all_customers, columns=columns)
    .drop_duplicates(subset='Email', keep='first')
)

# Write the result without the pandas row index
df.to_csv('customers.csv', index=False)