Predicting claim amounts using XGBoost regression.

Data generation

In [None]:
!pip install faker

Collecting faker
  Downloading Faker-28.4.1-py3-none-any.whl.metadata (15 kB)
Downloading Faker-28.4.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-28.4.1


In [None]:
import pandas as pd
import numpy as np
from faker import Faker
import random

In [None]:


# One seed shared by every random source so the synthetic dataset is reproducible.
SEED = 42

fake = Faker()


def generate_synthetic_data(n=1000, seed=None):
    """Build a synthetic insurance-claims dataset.

    Each row holds a fake member name, demographic attributes, a cover amount,
    a claim date within the last five years and a claim amount. The claim
    amount is drawn from a normal distribution (mean 5000, std 2000) plus 10%
    of the cover amount, so it is strongly driven by ``Cover_Amount``.

    Args:
        n: Number of rows to generate.
        seed: Optional integer. When given, the ``random`` module, NumPy's
            global generator and the module-level ``fake`` instance are all
            re-seeded so repeated calls yield the same random draws. When
            ``None`` (the default) no seeding is done.

    Returns:
        A ``pandas.DataFrame`` with ``n`` rows and the columns
        ``Main_Member_Name``, ``gender``, ``age``, ``Cover_Amount``,
        ``relationship``, ``claim_date``, ``claim_amount``, ``occupation``
        and ``payment_status``.

    Note:
        Claim dates are drawn relative to ``'today'``, so they still depend on
        the day the code is run even when a seed is supplied.
    """
    if seed is not None:
        # Three independent random sources are used below; seed each of them.
        random.seed(seed)
        np.random.seed(seed)
        fake.seed_instance(seed)

    data = []
    for _ in range(n):
        main_member_name = fake.name()
        gender = random.choice(['Male', 'Female'])
        age = random.randint(18, 80)
        cover_amount = random.randint(10000, 500000)
        relationship = random.choice(['Main Member', 'Spouse', 'Child', 'Other'])
        claim_date = fake.date_between(start_date='-5y', end_date='today')
        # Gaussian noise around 5000 plus 10% of the cover amount
        claim_amount = np.random.normal(loc=5000, scale=2000) + 0.1 * cover_amount
        occupation = random.choice(['Employed', 'Unemployed', 'Retired', 'Student'])
        payment_status = random.choice(['Paid', 'Unpaid', 'Irregular'])

        # Append the generated row (order must match `columns` below)
        data.append([main_member_name, gender, age, cover_amount, relationship, claim_date, claim_amount, occupation, payment_status])

    columns = ['Main_Member_Name', 'gender', 'age', 'Cover_Amount', 'relationship', 'claim_date', 'claim_amount', 'occupation', 'payment_status']
    return pd.DataFrame(data, columns=columns)


df = generate_synthetic_data(1000, seed=SEED)

In [None]:
# Preview the first five records of the synthetic dataset
print(df.head(n=5))


  Main_Member_Name  gender  age  Cover_Amount relationship  claim_date  \
0       Laura Mack    Male   46        189467        Other  2021-06-23   
1  Samantha Harris  Female   71         23231        Other  2020-01-18   
2      Rhonda Luna    Male   49         73362        Child  2023-06-21   
3     Peter Taylor    Male   28        485633  Main Member  2024-08-12   
4        Justin Wu    Male   62        481908  Main Member  2023-02-04   

   claim_amount  occupation payment_status  
0  24410.425008  Unemployed           Paid  
1  11001.807977    Employed      Irregular  
2  14789.451318     Student      Irregular  
3  57180.952704  Unemployed      Irregular  
4  54544.691460     Student         Unpaid  


Data preprocessing for the XGBoost model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Replace each categorical text column with integer codes.
# One fitted LabelEncoder is kept per column so the same mapping can be
# applied to new data later.
categorical_columns = ['gender', 'relationship', 'occupation', 'payment_status']
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

In [None]:
# Convert claim_date to a numeric feature: whole days since the Unix epoch.
# The guard makes the cell safe to re-run: feeding the already-converted integer
# day counts back through pd.to_datetime would interpret them as nanoseconds
# and silently collapse every value to day 0.
if not pd.api.types.is_integer_dtype(df['claim_date']):
    claim_dates = pd.to_datetime(df['claim_date'])
    df['claim_date'] = (claim_dates - pd.Timestamp("1970-01-01")) // pd.Timedelta('1D')

In [None]:
# Target: the claim amount the model should predict
y = df['claim_amount']

# Features: every remaining column except the free-text member name
excluded_columns = ['Main_Member_Name', 'claim_amount']
X = df.drop(columns=excluded_columns)


In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Standardise the features: learn the scaling statistics from the training
# rows only, then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

XGBoost Model

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Set up the XGBoost regressor with a squared-error objective
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
)


In [None]:
# Fit the regressor on the scaled training features
xg_reg.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict claim amounts for the held-out test rows
y_pred = xg_reg.predict(X=X_test_scaled)

In [None]:

# Score the test-set predictions: mean squared error and coefficient of determination
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Report both metrics, one per line
print(f"Mean Squared Error: {mse:.2f}", f"R^2 Score: {r2:.2f}", sep="\n")

Mean Squared Error: 5466196.33
R^2 Score: 0.97


Predict Claim Amount for New Data

In [None]:
# Single-row feature frame for a hypothetical new client, with the same columns
# (and order) as the training features.
# LabelEncoder.transform returns a 1-element array, so take element [0] to store
# a plain integer code. Without it each categorical cell holds a nested array
# (object dtype), which only works through NumPy's deprecated implicit
# array-to-scalar conversion.
new_client_data = pd.DataFrame({
    'gender': [label_encoders['gender'].transform(['Male'])[0]],
    'age': [45],
    'Cover_Amount': [250000],
    'relationship': [label_encoders['relationship'].transform(['Main Member'])[0]],
    # Same encoding as the training data: whole days since the Unix epoch
    'claim_date': [(pd.Timestamp('2024-01-01') - pd.Timestamp("1970-01-01")) // pd.Timedelta('1D')],
    'occupation': [label_encoders['occupation'].transform(['Employed'])[0]],
    'payment_status': [label_encoders['payment_status'].transform(['Paid'])[0]]
})


In [None]:
# Apply the scaler fitted on the training data to the new client's features
new_client_data_scaled = scaler.transform(X=new_client_data)

In [None]:
# Run the trained model on the scaled single-row input and report the estimate.
# NOTE(review): this prints a "P" currency prefix, but the stored cell output
# shows "R" — confirm which symbol is intended.
predicted_claim_amount = xg_reg.predict(X=new_client_data_scaled)
print(f"Predicted Claim Amount: P{predicted_claim_amount[0]:.2f}")

Predicted Claim Amount: R31626.57
