## Dataset Generation

In [17]:
import numpy as np
import pandas as pd

# Build a synthetic customer-churn dataset and write it to
# 'synthetic_customer_data.csv' in the current working directory.

# Number of synthetic customers; every per-customer array below has this length.
N_CUSTOMERS = 5000

# Seed NumPy's global RNG for reproducibility. The generated values depend on
# the ORDER of the np.random.* calls below, so do not reorder them casually.
np.random.seed(42)

# CustomerID: sequential integers 1..N_CUSTOMERS
customer_id = np.arange(1, N_CUSTOMERS + 1)

# Age: Normal(mean=45, std=15), clipped to [18, 80], then truncated to int
age = np.clip(np.random.normal(45, 15, N_CUSTOMERS), 18, 80).astype(int)

# Gender: 50% Male, 50% Female
gender = np.random.choice(['Male', 'Female'], size=N_CUSTOMERS, p=[0.5, 0.5])

# ContractType: 60% Month-to-month, 25% One year, 15% Two year
contract_type = np.random.choice(
    ['Month-to-month', 'One year', 'Two year'],
    size=N_CUSTOMERS,
    p=[0.6, 0.25, 0.15],
)

# Boolean masks per contract type, computed once and reused below.
is_month_to_month = contract_type == 'Month-to-month'
is_one_year = contract_type == 'One year'
is_two_year = contract_type == 'Two year'

# MonthlyCharges: Normal(mean=70, std=30), clipped to [20, 150].
# NOTE: charges are drawn independently of InternetService; there is no
# fiber-optic uplift in this generator.
monthly_charges = np.random.normal(70, 30, N_CUSTOMERS)
monthly_charges = np.clip(monthly_charges, 20, 150)

# Tenure in months, tied to contract type (randint's upper bound is exclusive):
#   Month-to-month: 0-72, One year: 12-23, Two year: 24-72
tenure = np.random.randint(0, 73, N_CUSTOMERS)
tenure[is_one_year] = np.random.randint(12, 24, is_one_year.sum())
tenure[is_two_year] = np.random.randint(24, 73, is_two_year.sum())

# TotalCharges: tenure * monthly charges plus Normal(0, 100) noise,
# floored at 0 so the noise cannot produce a negative total.
total_charges = tenure * monthly_charges + np.random.normal(0, 100, N_CUSTOMERS)
total_charges = np.clip(total_charges, 0, None)

# TechSupport: 30% 'Yes' baseline; every Two-year customer is forced to 'Yes'.
tech_support = np.random.choice(['Yes', 'No'], size=N_CUSTOMERS, p=[0.3, 0.7])
tech_support[is_two_year] = 'Yes'

# InternetService: 30% DSL, 50% Fiber optic, 20% No
types = ['DSL', 'Fiber optic', 'No']
internet_service = np.random.choice(types, size=N_CUSTOMERS, p=[0.3, 0.5, 0.2])

# PaperlessBilling: 70% 'Yes' baseline; every Month-to-month customer is
# forced to 'Yes'.
paperless_billing = np.random.choice(['Yes', 'No'], size=N_CUSTOMERS, p=[0.7, 0.3])
paperless_billing[is_month_to_month] = 'Yes'

# PaymentMethod: 40% Electronic check, 20% each for the other three
payment_method = np.random.choice(
    ['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card'],
    size=N_CUSTOMERS,
    p=[0.4, 0.2, 0.2, 0.2],
)

# Churn: per-customer Bernoulli draw with probability by contract type
# (Month-to-month 0.40, One year 0.15, Two year 0.05).
churn_prob = np.select([is_month_to_month, is_one_year], [0.4, 0.15], default=0.05)
churn = np.random.binomial(1, churn_prob)  # 1 = Yes, 0 = No
churn = np.where(churn == 1, 'Yes', 'No')

# Derived features.
# The denominator is tenure + 1 so customers with tenure 0 do not divide by zero.
average_monthly_charges = np.round(total_charges / (tenure + 1), 2)
# Lifetime value: total charges scaled up by a random 10%-50% margin.
customer_lifetime_value = total_charges * (1 + np.random.uniform(0.1, 0.5, N_CUSTOMERS))
customer_lifetime_value = np.round(customer_lifetime_value, 2)

# Assemble the DataFrame (monetary columns rounded to 2 decimals)
df = pd.DataFrame({
    'CustomerID': customer_id,
    'Age': age,
    'Gender': gender,
    'ContractType': contract_type,
    'MonthlyCharges': np.round(monthly_charges, 2),
    'TotalCharges': np.round(total_charges, 2),
    'TechSupport': tech_support,
    'InternetService': internet_service,
    'Tenure': tenure,
    'PaperlessBilling': paperless_billing,
    'PaymentMethod': payment_method,
    'Churn': churn,
    'AverageMonthlyCharges': average_monthly_charges,
    'CustomerLifetimeValue': customer_lifetime_value,
})

# Report the realized churn rate (informational only; nothing is adjusted).
actual_churn_rate = (df['Churn'] == 'Yes').mean()
print(f'Actual Churn Rate: {actual_churn_rate * 100:.2f}%')

# Save to CSV without the pandas index column
df.to_csv('synthetic_customer_data.csv', index=False)


Actual Churn Rate: 29.56%


In [5]:
# Write the dataset to a second CSV file, omitting the pandas index column.
# NOTE: the destination is an absolute, user-specific Windows path.
csv_path = r'C:\Users\MANOJKUMAR\OneDrive\Desktop\Documents\Thinkhumble\customer_data.csv'
df.to_csv(csv_path, index=False)
