In [2]:
import numpy as np
import pandas as pd

# For reproducibility
np.random.seed(42)

# Number of samples
n = 500

# --- AGE (with some outliers & NaNs) ---
ages = np.random.randint(18, 81, size=n).astype(float)
ages[np.random.choice(n, 5, replace=False)] = np.random.choice([5, 120, np.nan], 5)

# --- GENDER (inconsistent categories) ---
genders = np.random.choice(['Male', 'Female', 'male', 'FEMALE', 'M', 'F'], size=n)

# --- POLICY TYPE (with NaNs and typos) ---
policy_types = np.random.choice(['Basic', 'Standard', 'Premium', 'basics', np.nan], size=n)

# --- MEDICAL HISTORY (with NaNs and inconsistencies) ---
medical_histories = np.random.choice(['None', 'Diabetes', 'Heart', 'Chronic', 'NONE', np.nan], size=n)

# --- PREMIUM (with outliers & NaNs) ---
base_premium = np.random.randint(2000, 5000, size=n).astype(float)
premium = base_premium + (np.nan_to_num(ages) * 10)
premium += np.where(policy_types == 'Premium', 3000, 0)
premium += np.where(policy_types == 'Standard', 1500, 0)
premium[np.random.choice(n, 5, replace=False)] = np.random.choice([-500, 1000000, np.nan], 5)

# --- CLAIM (rule-based + some NaNs) ---
claim_probs = (
    (np.nan_to_num(ages) > 50) * 0.3 +
    (pd.Series(medical_histories).fillna('None') != 'None') * 0.4 +
    (pd.Series(policy_types).fillna('Basic') == 'Premium') * 0.1
)
claims = np.random.binomial(1, p=np.clip(claim_probs, 0, 1)).astype(float)  # float allows NaN
claims[np.random.choice(n, 5, replace=False)] = np.nan  # inject missing claims

# --- BUILD DATAFRAME ---
df = pd.DataFrame({
    'Age': ages,
    'Gender': genders,
    'Policy_Type': policy_types,
    'Medical_History': medical_histories,
    'Premium': premium,
    'Claim': claims
})

# Add some duplicates
df = pd.concat([df, df.sample(10, random_state=42)], ignore_index=True)

# Save to CSV
df.to_csv("synthetic_insurance_dirty.csv", index=False)

# --- QUICK INSPECTION ---
print("Dataset shape:", df.shape)
print("\nMissing values:\n", df.isnull().sum())
print("\nFirst 10 rows:\n", df.head(10))


Dataset shape: (510, 6)

Missing values:
 Age                1
Gender             0
Policy_Type        0
Medical_History    0
Premium            2
Claim              5
dtype: int64

First 10 rows:
     Age  Gender Policy_Type Medical_History    Premium  Claim
0  56.0       M    Standard            NONE     6845.0    1.0
1  69.0       M         nan            None     5246.0    0.0
2  46.0  Female         nan            NONE     2940.0    1.0
3  32.0  FEMALE    Standard           Heart     5506.0    0.0
4  60.0  Female         nan        Diabetes     4761.0    1.0
5  25.0       M     Premium           Heart     8046.0    1.0
6  78.0       F    Standard             nan     5881.0    0.0
7  38.0       M         nan         Chronic     4788.0    0.0
8  56.0       F    Standard            NONE     4400.0    1.0
9  75.0    Male      basics            NONE  1000000.0    1.0
