In [19]:
# importing the libraries

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score,confusion_matrix


In [20]:
# Load the data

# Read the training CSV (path is relative to the notebook's working directory)
# and preview the first five rows.
train_path = 'Train_data.csv'
train_ds = pd.read_csv(train_path)
train_ds.head()

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.,1053,0
1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4,1053,0
2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.,1053,0
3,H3834,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,.,1053,0
4,H5053,2014,1.0,0,V,N,O,R,680.0,1,1800.0,3,1053,0


In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train_ds.describe()


Unnamed: 0,YearOfObservation,Insured_Period,Residential,Building Dimension,Building_Type,Date_of_Occupancy,Claim
count,7160.0,7160.0,7160.0,7054.0,7160.0,6652.0,7160.0
mean,2013.669553,0.909758,0.305447,1883.72753,2.186034,1964.456404,0.228212
std,1.383769,0.239756,0.460629,2278.157745,0.940632,36.002014,0.419709
min,2012.0,0.0,0.0,1.0,1.0,1545.0,0.0
25%,2012.0,0.997268,0.0,528.0,2.0,1960.0,0.0
50%,2013.0,1.0,0.0,1083.0,2.0,1970.0,0.0
75%,2015.0,1.0,1.0,2289.75,3.0,1980.0,0.0
max,2016.0,1.0,1.0,20940.0,4.0,2016.0,1.0


In [22]:
# Column dtypes and non-null counts; used to spot columns with missing values.
train_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Customer Id         7160 non-null   object 
 1   YearOfObservation   7160 non-null   int64  
 2   Insured_Period      7160 non-null   float64
 3   Residential         7160 non-null   int64  
 4   Building_Painted    7160 non-null   object 
 5   Building_Fenced     7160 non-null   object 
 6   Garden              7153 non-null   object 
 7   Settlement          7160 non-null   object 
 8   Building Dimension  7054 non-null   float64
 9   Building_Type       7160 non-null   int64  
 10  Date_of_Occupancy   6652 non-null   float64
 11  NumberOfWindows     7160 non-null   object 
 12  Geo_Code            7058 non-null   object 
 13  Claim               7160 non-null   int64  
dtypes: float64(3), int64(4), object(7)
memory usage: 783.3+ KB


1. Data Cleaning & Preprocessing

Based on the `info()` summary above, several columns need specific attention:
 * NumberOfWindows contains '.' values that represent missing data.
 * Building Dimension, Date_of_Occupancy, Geo_Code and Garden contain null values.
 * Categorical variables need encoding (Building_Painted, Building_Fenced, Garden, Settlement and Geo_Code).

In [24]:
# Handling missing values
# NumberOfWindows uses the literal string '.' as its "missing" marker; swap that
# placeholder for NaN so the column can be treated as having missing data.
# NOTE(review): only an exact '.' is matched here -- confirm the raw CSV has no
# padded variants (e.g. ' .') that would slip through this step.

windows_raw = train_ds['NumberOfWindows']
train_ds['NumberOfWindows'] = windows_raw.replace({'.': np.nan})

In [25]:
# Convert NumberOfWindows to a numeric dtype so it can be imputed like the
# other numeric columns.
# NOTE(review): errors='coerce' turns *every* non-numeric token into NaN, not
# only the '.' placeholder. If the raw column holds other text values they are
# silently median-imputed below -- check train_ds['NumberOfWindows'].unique()
# on the raw data to confirm.
train_ds['NumberOfWindows'] = pd.to_numeric(train_ds['NumberOfWindows'], errors='coerce')

# Impute numerical missing values with each column's median.
# A separate imputer is fitted per column and kept in `num_imputers`, so the
# learned medians can be re-applied to unseen data with
# num_imputers[col].transform(...). Re-fitting one shared imputer would discard
# the medians of every column but the last.
# NOTE(review): the medians are computed on the full frame, before the
# train/validation split, so validation rows influence the imputed values.
num_cols = ['Building Dimension', 'Date_of_Occupancy', 'NumberOfWindows']
num_imputers = {}
for col in num_cols:
    num_imputer = SimpleImputer(strategy='median')
    # fit_transform returns an (n, 1) array; flatten it so a plain 1-D column
    # is assigned back.
    train_ds[col] = num_imputer.fit_transform(train_ds[[col]]).ravel()
    num_imputers[col] = num_imputer
# `num_imputer` is left bound to the last fitted imputer (NumberOfWindows).

# Impute categorical missing values (Geo_Code) with the most frequent code.
geo_code_mode = train_ds['Geo_Code'].mode()[0]
train_ds['Geo_Code'] = train_ds['Geo_Code'].fillna(geo_code_mode)

In [26]:
# --- 2. Feature Engineering ---

# Building age at the time of observation: observation year minus the year of
# first occupancy.
# NOTE(review): describe() shows Date_of_Occupancy going up to 2016 while
# YearOfObservation starts at 2012, so negative ages are possible -- confirm
# whether they should be clipped or treated as data errors.
train_ds['Building_Age'] = train_ds['YearOfObservation'] - train_ds['Date_of_Occupancy']

# Drop Customer Id (an identifier, non-predictive).
# Reassigning (instead of inplace=True) and ignoring an already-missing column
# keeps this cell idempotent: re-running it no longer raises KeyError.
train_ds = train_ds.drop(columns=['Customer Id'], errors='ignore')


In [27]:
# --- 3. Encoding Categorical Variables ---

# Integer-encode the categorical columns. Each column is cast to str first, so
# any remaining NaN (Garden has a few nulls) becomes its own 'nan' category.
# These are not all binary: Geo_Code in particular has many distinct values, and
# the resulting integer codes carry no meaningful order.
cat_cols = ['Building_Painted', 'Building_Fenced', 'Garden', 'Settlement', 'Geo_Code']

# One fitted encoder per column, so the same mapping can later be applied to
# unseen data via label_encoders[col].transform(...). Re-fitting a single shared
# encoder would discard every mapping except the last column's.
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    train_ds[col] = le.fit_transform(train_ds[col].astype(str))
    label_encoders[col] = le
# `le` is left bound to the last fitted encoder (Geo_Code).


**Model Training**

In [28]:
# Separate the target column from the predictor columns
target_col = 'Claim'
y = train_ds[target_col]
X = train_ds.drop(columns=[target_col])

# Hold out 20% of the rows for validation; stratifying on y keeps the class
# ratio the same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)



In [29]:
# Configure the random forest: 100 trees, class weights balanced to offset the
# minority positive class, and a fixed seed for repeatable results.
rf_params = dict(n_estimators=100, class_weight='balanced', random_state=42)
model = RandomForestClassifier(**rf_params)

# Fit on the training partition
model.fit(X_train, y_train)

# Validation outputs: hard class labels, plus the probability of the second
# class in model.classes_ (Claim == 1) for threshold-free metrics.
preds = model.predict(X_val)
probs = model.predict_proba(X_val)[:, 1]


In [30]:
# Validation metrics: ROC-AUC from the predicted probabilities, then the
# per-class precision / recall / F1 from the hard predictions.
val_auc = roc_auc_score(y_val, probs)
print(f"ROC-AUC Score: {val_auc:.4f}")

report_text = classification_report(y_val, preds)
print("\nClassification Report:")
print(report_text)


ROC-AUC Score: 0.6633

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.93      0.86      1105
           1       0.49      0.23      0.31       327

    accuracy                           0.77      1432
   macro avg       0.65      0.58      0.59      1432
weighted avg       0.73      0.77      0.74      1432

