In [10]:
import pandas as pd
import numpy as np

# Demonstrates mode imputation plus a missing-value indicator on synthetic data
# in which marital_status is missing exactly for the unmarried rows.

# Seed the global NumPy RNG so the synthetic data is reproducible.
np.random.seed(42)

# Generate synthetic data of 10K samples.
# NOTE: all three columns draw from the same global RNG, so the order of the
# entries in this dict literal determines the values each column receives.
n = 10000
data = pd.DataFrame({
    'loan_approved': np.random.choice(['Yes', 'No'], size=n, p=[0.5, 0.5]),
    'marital_satisfaction': np.random.normal(loc=0, scale=1, size=n),
    'married': np.random.choice([0, 1], size=n, p=[0.3, 0.7]),  # 30% unmarried, 70% married
})

# Create marital_status column: "Married" where married == 1, NaN otherwise.
# Series.map leaves every value that is not a key of the dict as NaN, which
# avoids calling a Python lambda once per row.
data['marital_status'] = data['married'].map({1: 'Married'})

# Compute the missing mask once; it is reused for the indicator column and
# for selecting the sample rows below.
is_missing = data['marital_status'].isna()

# Create missing value indicator (1 = value was missing, 0 = observed).
data['marital_status_missing'] = is_missing.astype(int)

# Replace missing values in marital_status with the mode (most frequent value).
# mode() ignores NaN and returns an empty Series when nothing is observed, so
# fail with a clear message instead of an opaque lookup error in that case.
observed_modes = data['marital_status'].mode()
if observed_modes.empty:
    raise ValueError(
        "marital_status has no observed values; cannot impute with the mode"
    )
mode_marital_status = observed_modes.iloc[0]
data['marital_status_mode_imputation'] = data['marital_status'].fillna(mode_marital_status)

# Build a small preview: the first 2 rows with a missing marital_status
# followed by the first 3 rows where it is present.
missing_values = data[is_missing].head(2)
non_missing_values = data[~is_missing].head(3)
sample_data = pd.concat([missing_values, non_missing_values])

# Bare expression: rendered as the cell output when run in a notebook.
sample_data[['loan_approved', 'marital_status', 'marital_status_mode_imputation', 'marital_status_missing']]

Unnamed: 0,loan_approved,marital_status,marital_status_mode_imputation,marital_status_missing
0,Yes,,Married,1
1,No,,Married,1
4,Yes,Married,Married,0
5,Yes,Married,Married,0
6,Yes,Married,Married,0
