In [35]:
import pandas as pd

import numpy as np

In [36]:
# Generate mock customer data

np.random.seed(42)

num_customers = 2000



data = {

    'CustomerID': [f'CUST{1000+i}' for i in range(num_customers)],

    'Gender': np.random.choice(['Male', 'Female'], num_customers, p=[0.5, 0.5]),

    'SeniorCitizen': np.random.choice([0, 1], num_customers, p=[0.84, 0.16]),

    'Partner': np.random.choice(['Yes', 'No'], num_customers, p=[0.48, 0.52]),

    'Dependents': np.random.choice(['Yes', 'No'], num_customers, p=[0.3, 0.7]),

    'Tenure': np.random.randint(1, 73, num_customers), # Months

    'PhoneService': np.random.choice(['Yes', 'No'], num_customers, p=[0.9, 0.1]),

    'MultipleLines': np.random.choice(['Yes', 'No', 'No phone service'], num_customers, p=[0.42, 0.48, 0.1]),

    'InternetService': np.random.choice(['DSL', 'Fiber optic', 'No'], num_customers, p=[0.34, 0.44, 0.22]),

    'OnlineSecurity': np.random.choice(['Yes', 'No', 'No internet service'], num_customers, p=[0.28, 0.50, 0.22]),

    'OnlineBackup': np.random.choice(['Yes', 'No', 'No internet service'], num_customers, p=[0.34, 0.44, 0.22]),

    'DeviceProtection': np.random.choice(['Yes', 'No', 'No internet service'], num_customers, p=[0.34, 0.44, 0.22]),

    'TechSupport': np.random.choice(['Yes', 'No', 'No internet service'], num_customers, p=[0.29, 0.49, 0.22]),

    'StreamingTV': np.random.choice(['Yes', 'No', 'No internet service'], num_customers, p=[0.38, 0.40, 0.22]),

    'StreamingMovies': np.random.choice(['Yes', 'No', 'No internet service'], num_customers, p=[0.39, 0.39, 0.22]),

    'Contract': np.random.choice(['Month-to-month', 'One year', 'Two year'], num_customers, p=[0.55, 0.24, 0.21]),

    'PaperlessBilling': np.random.choice(['Yes', 'No'], num_customers, p=[0.59, 0.41]),

    'PaymentMethod': np.random.choice(['Electronic check', 'Mailed check', 'Bank transfer (automatic)', 'Credit card (automatic)'], num_customers, p=[0.34, 0.23, 0.22, 0.21]),

    'MonthlyCharges': np.random.normal(loc=65, scale=30, size=num_customers).clip(18, 120).round(2),

}

df_customers = pd.DataFrame(data)

In [37]:
# Generate TotalCharges based on Tenure and MonthlyCharges with some noise

df_customers['TotalCharges'] = (df_customers['Tenure'] * df_customers['MonthlyCharges'] * np.random.uniform(0.95, 1.05, num_customers)).round(2)


In [38]:
# Make some TotalCharges empty for realism (e.g., new customers with 0 tenure)

# df_customers.loc[df_customers['Tenure'] == 1, 'TotalCharges'] = df_customers['MonthlyCharges']

# df_customers.loc[np.random.choice(df_customers.index, size=int(num_customers*0.01), replace=False) & (df_customers['Tenure'] < 3), 'TotalCharges'] = np.nan


& operator in above code is attempting to combine two arrays of different shapes, which leads to the error.
We first filter the indices based on conditions separately and then use those indices to set TotalCharges to NaN in the next cell.

In [39]:
# Make some TotalCharges empty for realism (e.g., new customers with 0 tenure)
# For customers with tenure == 1, set TotalCharges equal to MonthlyCharges
df_customers.loc[df_customers['Tenure'] == 1, 'TotalCharges'] = df_customers['MonthlyCharges']

# Select a random sample of indices where Tenure is less than 3
indices_to_nan = np.random.choice(df_customers[df_customers['Tenure'] < 3].index,
                                  size=int(len(df_customers) * 0.01),
                                  replace=False)

# Set 'TotalCharges' to NaN for the selected indices
df_customers.loc[indices_to_nan, 'TotalCharges'] = np.nan

In [40]:
# Simulate Churn (more likely for month-to-month, higher charges, lower tenure)

churn_probability = 0.1 \

+ 0.15 * (df_customers['Contract'] == 'Month-to-month') \

+ 0.1 * (df_customers['InternetService'] == 'Fiber optic') \

+ 0.001 * (df_customers['MonthlyCharges'] - 65) \

- 0.002 * (df_customers['Tenure'] - 36) \

+ 0.1 * (df_customers['OnlineSecurity'] == 'No') \

+ 0.1 * (df_customers['TechSupport'] == 'No')



churn_probability = np.clip(churn_probability, 0.01, 0.99)

df_customers['Churn'] = np.random.binomial(1, churn_probability, num_customers).astype(str)

df_customers['Churn'] = df_customers['Churn'].replace({'1': 'Yes', '0': 'No'})


In [41]:
# Replace 'No phone service' and 'No internet service' for consistency

for col in ['MultipleLines']:

    df_customers[col] = df_customers.apply(lambda row: 'No' if row['PhoneService'] == 'No' else row[col], axis=1)



for col in ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']:

    df_customers[col] = df_customers.apply(lambda row: 'No' if row['InternetService'] == 'No' else row[col], axis=1)


In [42]:
# Save to CSV

df_customers.to_csv('telecom_churn_mock_data.csv', index=False)

print("Mock telecom churn data generated: telecom_churn_mock_data.csv")

print(df_customers.head())

print(f"\nChurn distribution:\n{df_customers['Churn'].value_counts(normalize=True)}")

Mock telecom churn data generated: telecom_churn_mock_data.csv
  CustomerID  Gender  SeniorCitizen Partner Dependents  Tenure PhoneService  \
0   CUST1000    Male              0      No         No      30          Yes   
1   CUST1001  Female              0      No        Yes      11          Yes   
2   CUST1002  Female              1      No         No      17           No   
3   CUST1003  Female              0     Yes         No      26          Yes   
4   CUST1004    Male              0     Yes        Yes      23          Yes   

  MultipleLines InternetService       OnlineSecurity  ... DeviceProtection  \
0           Yes              No                   No  ...               No   
1           Yes     Fiber optic  No internet service  ...               No   
2            No     Fiber optic                   No  ...               No   
3            No              No                   No  ...               No   
4            No     Fiber optic  No internet service  ...               