In [21]:
import pandas as pd
import numpy as np

# Pin NumPy's global RNG so the synthetic customers are identical on every run
np.random.seed(42)

# Number of synthetic customer rows to generate
data_size = 1000

# Draw the features one by one. The order of these calls is significant:
# they all consume the same seeded global random stream.
retention_calls = np.random.poisson(lam=1.5, size=data_size)
ages = np.random.randint(18, 70, size=data_size)
incomes = np.random.randint(3000, 20000, size=data_size)
regions = np.random.choice(['North', 'South', 'East', 'West'], size=data_size)
baseline_churn = np.random.choice([1, 0], size=data_size, p=[0.3, 0.7])

# Customers with more than two retention calls get a fresh churn draw with a
# 70% churn probability; everyone else keeps the 30% baseline label.
adjusted_churn = []
for call_count, baseline_label in zip(retention_calls, baseline_churn):
    if call_count > 2:
        adjusted_churn.append(np.random.choice([1, 0], p=[0.7, 0.3]))
    else:
        adjusted_churn.append(baseline_label)

data = {
    'Retention Call Count': retention_calls,
    'Age': ages,
    'Income': incomes,
    'Region': regions,
    'Churn': adjusted_churn,
}

df = pd.DataFrame(data)

# Preview the top rows of the generated frame
df.head()

Unnamed: 0,Retention Call Count,Age,Income,Region,Churn
0,3,26,8119,South,1
1,0,25,8694,East,1
2,0,69,7164,North,0
3,0,62,8039,South,0
4,3,34,17631,South,0


In [22]:
# Build a second synthetic dataset with the same columns as the first, but
# with roughly 10% missing values injected into every feature column.
# Relies on `np`, `pd` and `data_size` defined in an earlier cell.
data2 = {
    # Numeric features are cast to float so they can hold NaN
    'Retention Call Count': np.random.poisson(lam=1.5, size=data_size).astype(float),
    'Age': np.random.randint(18, 70, size=data_size).astype(float),
    'Income': np.random.randint(3000, 20000, size=data_size).astype(float),
    # Cast to object dtype: a fixed-width NumPy string array would silently
    # turn an assigned None into the literal string 'None'
    'Region': np.random.choice(['North', 'South', 'East', 'West'], size=data_size).astype(object),
    'Churn': np.random.choice([1, 0], size=data_size, p=[0.3, 0.7])
}

# Number of entries to blank out per column (10% of the rows)
null_count = int(data_size * 0.1)

# Introduce nulls into the numeric columns
columns_with_nulls = ['Retention Call Count', 'Age', 'Income']
for column in columns_with_nulls:
    # Pick a fresh random 10% of row positions for each column
    indices = np.random.choice(data_size, size=null_count, replace=False)
    data2[column][indices] = np.nan  # NaN is the missing marker for float columns

# Adjust churn rate based on retention calls. Rows whose call count is missing
# (NaN) keep their baseline churn label; rows with more than two calls get a
# fresh draw with a 70% churn probability.
data2['Churn'] = [
    np.random.choice([1, 0], p=[0.7, 0.3]) if not np.isnan(x) and x > 2 else data2['Churn'][i]
    for i, x in enumerate(data2['Retention Call Count'])
]

# Region is categorical, so its missing marker is None rather than NaN
region_indices = np.random.choice(data_size, size=null_count, replace=False)
data2['Region'][region_indices] = None

df_null = pd.DataFrame(data2)

# Show the first few rows of the dataframe
print(df_null.head())


   Retention Call Count   Age   Income Region  Churn
0                   1.0  33.0  18924.0   East      0
1                   3.0  36.0  19236.0   East      0
2                   NaN   NaN  19924.0   East      0
3                   0.0  42.0   4415.0  South      0
4                   2.0  48.0  14030.0   West      0
