In [None]:

import pandas as pd
from faker import Faker
import random
import numpy as np
import matplotlib.pyplot as plt

# Single Faker instance shared by all synthetic-data generation in this cell.
fake = Faker()

# Generate fake data for 2000 customers: one dict per customer, with the
# fields produced in the same order as the keys are listed.
customers = [
    {
        'CustomerID': fake.uuid4(),
        'Name': fake.name(),
        'Email': fake.email(),
        'Age': random.randint(18, 70),
        'Gender': random.choice(['Male', 'Female']),
        'City': fake.city(),
        'Country': fake.country()
    }
    for _ in range(2000)
]

# One row per customer record, columns taken from the dict keys.
customers_df = pd.DataFrame(customers)

# Generate fake data for transactions.
# The customer IDs are materialised as a plain list once, outside the loop:
# random.choice() picks an element by position, whereas integer indexing on a
# pandas Series goes through the index labels. A list removes that dependence
# on customers_df keeping its default index and avoids a Series lookup on
# every one of the 5000 draws.
customer_ids = customers_df['CustomerID'].tolist()

transactions = []
for _ in range(5000):
    transactions.append({
        'TransactionID': fake.uuid4(),
        # Each transaction is attributed to a uniformly chosen existing customer.
        'CustomerID': random.choice(customer_ids),
        'ProductCategory': fake.random_element(elements=('Electronics', 'Clothing', 'Home', 'Beauty', 'Sports')),
        # Amount between 10 and 500, rounded to two decimal places.
        'Amount': round(random.uniform(10, 500), 2),
        'TransactionDateTime': fake.date_time_this_year()
    })

transactions_df = pd.DataFrame(transactions)



# NOTE: Everything between the triple quotes below is a bare string literal,
# not executed code. The KPI calculations, charts and printed metrics it
# contains are therefore disabled, and columns such as `TransactionTime` and
# `Hour` are never added to `transactions_df` while the block stays quoted.
# NOTE(review): points to confirm before re-enabling this block:
#   - `groupby(...).sum()` / `.mean()` aggregate every column (including the ID
#     and datetime columns) before 'Amount' is selected; verify this works on
#     the installed pandas version.
#   - `retention_rate` raises the share of customers that have no transaction
#     to the power of `retention_period`; confirm this is the intended metric.
'''
# Extract Transaction Time
transactions_df['TransactionTime'] = transactions_df['TransactionDateTime'].dt.time

# Monthly Sales
monthly_sales = transactions_df.groupby(transactions_df['TransactionDateTime'].dt.to_period('M')).sum()['Amount']

# Average Transaction Value
avg_transaction_value = transactions_df.groupby(transactions_df['TransactionDateTime'].dt.to_period('M')).mean()['Amount']

# Customer Retention Rate (Assuming 3 months as retention period)
retention_period = 3
retention_rate = (1 - (len(transactions_df['CustomerID'].unique()) / len(customers_df))) ** retention_period

# New Customers Acquired
new_customers_acquired = len(transactions_df[transactions_df['TransactionDateTime'].dt.to_period('M') == transactions_df['TransactionDateTime'].dt.to_period('M').min()]['CustomerID'].unique())

# Monthly Sales Trend Chart
plt.figure(figsize=(10, 6))
plt.plot(monthly_sales.index.to_timestamp(), monthly_sales.values, marker='o')
plt.title('Monthly Sales Trend')
plt.xlabel('Month')
plt.ylabel('Sales Amount ($)')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Product Category Sales Chart
product_category_sales = transactions_df.groupby('ProductCategory').sum()['Amount']
product_category_sales.plot(kind='bar', figsize=(10, 6))
plt.title('Product Category Sales')
plt.xlabel('Product Category')
plt.ylabel('Sales Amount ($)')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()

# Customer Demographics
plt.figure(figsize=(10, 6))
customers_df['Age'].plot(kind='hist', bins=20, edgecolor='black')
plt.title('Customer Age Distribution')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.grid(True)
plt.tight_layout()
plt.show()

plt.figure(figsize=(6, 6))
customers_df['Gender'].value_counts().plot(kind='pie', autopct='%1.1f%%')
plt.title('Customer Gender Distribution')
plt.ylabel('')
plt.tight_layout()
plt.show()

# Peak Shopping Hours (Assuming peak shopping hours between 9 AM to 9 PM)
transactions_df['Hour'] = transactions_df['TransactionDateTime'].dt.hour
peak_hours = transactions_df[(transactions_df['Hour'] >= 9) & (transactions_df['Hour'] <= 21)].groupby('Hour').size()
peak_hours.plot(kind='bar', figsize=(10, 6))
plt.title('Peak Shopping Hours')
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Transactions')
plt.grid(axis='y')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

# Output KPI Metrics
print("KPI Metrics:")
print("Monthly Sales:", monthly_sales)
print("Average Transaction Value:", avg_transaction_value)
print("Customer Retention Rate:", retention_rate)
print("New Customers Acquired:", new_customers_acquired)'''

# Export data to CSV files; the DataFrame row index is left out of both files.
for csv_name, frame in (
    ('customers_data.csv', customers_df),
    ('transactions_data.csv', transactions_df),
):
    frame.to_csv(csv_name, index=False)

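# This cell exports the generated customer and transaction data to CSV files named `'customers_data.csv'` and `'transactions_data.csv'`, respectively.
# The transaction time column (`TransactionTime`) is only added by the analysis code inside the triple-quoted string above, which does not run, so the exported transactions do not include it.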

In [2]:
import pandas as pd
from faker import Faker
import random
import numpy as np
import matplotlib.pyplot as plt

# Single Faker instance shared by all synthetic-data generation in this cell.
fake = Faker()

# Generate fake data for 2000 customers: one dict per customer, with the
# fields produced in the same order as the keys are listed. The two purchase
# date fields start out as None placeholders.
customers = [
    {
        'CustomerID': fake.uuid4(),
        'Name': fake.name(),
        'Email': fake.email(),
        'Age': random.randint(18, 70),
        'Gender': random.choice(['Male', 'Female']),
        'City': fake.city(),
        'Country': fake.country(),
        'FirstPurchaseDate': None,
        'LastPurchaseDate': None
    }
    for _ in range(2000)
]

# One row per customer record, columns taken from the dict keys.
customers_df = pd.DataFrame(customers)

# Generate fake data for 5000 transactions.
# The customer IDs are materialised as a plain list once, outside the loop:
# random.choice() picks an element by position, whereas integer indexing on a
# pandas Series goes through the index labels. A list removes that dependence
# on customers_df keeping its default index and avoids a Series lookup on
# every one of the 5000 draws.
customer_ids = customers_df['CustomerID'].tolist()

transactions = []
for _ in range(5000):
    # The timestamp is drawn first, before the other fields, so the order of
    # random draws per transaction stays the same as before.
    transaction_date = fake.date_time_this_year()
    transactions.append({
        'TransactionID': fake.uuid4(),
        # Each transaction is attributed to a uniformly chosen existing customer.
        'CustomerID': random.choice(customer_ids),
        'ProductCategory': fake.random_element(elements=('Electronics', 'Clothing', 'Home', 'Beauty', 'Sports')),
        # Amount between 10 and 500, rounded to two decimal places.
        'Amount': round(random.uniform(10, 500), 2),
        'TransactionDateTime': transaction_date
    })

transactions_df = pd.DataFrame(transactions)

# Update customers with their first and last purchase dates.
# A single grouped pass over the transactions replaces the previous
# per-customer scan of the whole transactions frame (customers x transactions
# comparisons); the values written for each customer are the same.
purchase_bounds = transactions_df.groupby('CustomerID')['TransactionDateTime'].agg(['min', 'max'])
first_purchase_by_id = purchase_bounds['min'].to_dict()
last_purchase_by_id = purchase_bounds['max'].to_dict()

# dict.get() returns None for customers without any transaction, which matches
# the None placeholder these columns are created with in this cell.
# dtype=object keeps those None values (and the Timestamp values) as-is
# instead of letting pandas convert the column to datetime64 with NaT.
customers_df['FirstPurchaseDate'] = pd.Series(
    [first_purchase_by_id.get(customer_id) for customer_id in customers_df['CustomerID']],
    index=customers_df.index,
    dtype=object,
)
customers_df['LastPurchaseDate'] = pd.Series(
    [last_purchase_by_id.get(customer_id) for customer_id in customers_df['CustomerID']],
    index=customers_df.index,
    dtype=object,
)
# NOTE: Everything between the triple quotes below is a bare string literal,
# not executed code. The extra date columns, KPI calculations, charts and
# printed metrics it contains are therefore disabled, and
# `calculate_retention_rate` is never actually defined.
# NOTE(review): points to resolve before re-enabling this block:
#   - `first_month` is read when computing `new_customers_acquired` but is not
#     assigned anywhere in this cell.
#   - `calculate_retention_rate` compares a datetime difference with a
#     `pd.DateOffset` using `<=`; confirm pandas supports that comparison.
#   - `groupby(...).sum()` / `.mean()` aggregate every column (including the ID
#     and datetime columns) before 'Amount' is selected; verify this works on
#     the installed pandas version.
'''
# Extract additional columns
transactions_df['TransactionDate'] = transactions_df['TransactionDateTime'].dt.date
transactions_df['TransactionTime'] = transactions_df['TransactionDateTime'].dt.time
transactions_df['Month'] = transactions_df['TransactionDateTime'].dt.to_period('M')

# Monthly Sales
monthly_sales = transactions_df.groupby('Month').sum()['Amount']

# Average Transaction Value
avg_transaction_value = transactions_df.groupby('Month').mean()['Amount']

# Customer Retention Rate
# Define a function to calculate retention rate
def calculate_retention_rate(transactions_df, customers_df, period_months=3):
    retained_customers = 0
    total_customers = len(customers_df)
    current_date = transactions_df['TransactionDateTime'].max()
    retention_period = pd.DateOffset(months=period_months)
    
    for _, customer in customers_df.iterrows():
        if customer['LastPurchaseDate'] and customer['FirstPurchaseDate']:
            if (current_date - customer['LastPurchaseDate']) <= retention_period:
                retained_customers += 1
    
    return retained_customers / total_customers

retention_rate = calculate_retention_rate(transactions_df, customers_df)

# New Customers Acquired
new_customers_acquired = transactions_df[transactions_df['Month'] == first_month]['CustomerID'].nunique()

# Monthly Sales Trend Chart
plt.figure(figsize=(10, 6))
monthly_sales.plot(marker='o')
plt.title('Monthly Sales Trend')
plt.xlabel('Month')
plt.ylabel('Sales Amount ($)')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Product Category Sales Chart
product_category_sales = transactions_df.groupby('ProductCategory').sum()['Amount']
product_category_sales.plot(kind='bar', figsize=(10, 6))
plt.title('Product Category Sales')
plt.xlabel('Product Category')
plt.ylabel('Sales Amount ($)')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()

# Customer Demographics
plt.figure(figsize=(10, 6))
customers_df['Age'].plot(kind='hist', bins=20, edgecolor='black')
plt.title('Customer Age Distribution')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.grid(True)
plt.tight_layout()
plt.show()

plt.figure(figsize=(6, 6))
customers_df['Gender'].value_counts().plot(kind='pie', autopct='%1.1f%%')
plt.title('Customer Gender Distribution')
plt.ylabel('')
plt.tight_layout()
plt.show()

# Peak Shopping Hours (Assuming peak shopping hours between 9 AM to 9 PM)
transactions_df['Hour'] = transactions_df['TransactionDateTime'].dt.hour
peak_hours = transactions_df[(transactions_df['Hour'] >= 9) & (transactions_df['Hour'] <= 21)].groupby('Hour').size()
peak_hours.plot(kind='bar', figsize=(10, 6))
plt.title('Peak Shopping Hours')
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Transactions')
plt.grid(axis='y')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

# Output KPI Metrics
print("KPI Metrics:")
print("Monthly Sales:")
print(monthly_sales)
print("\nAverage Transaction Value:")
print(avg_transaction_value)
print("\nCustomer Retention Rate:", retention_rate)
print("\nNew Customers Acquired:", new_customers_acquired)'''

# Export data to CSV files; the DataFrame row index is left out of both files.
for csv_name, frame in (
    ('customers_data_v1.csv', customers_df),
    ('transactions_data_v1.csv', transactions_df),
):
    frame.to_csv(csv_name, index=False)
