# VeriClaim — XGBoost Fraud Classifier Training

No GPU needed. CPU runtime is fine for this notebook.

Run cells one by one in order.

## Cell 1 — Install dependencies

In [1]:
!pip install xgboost scikit-learn imbalanced-learn shap pandas joblib -q

import xgboost, sklearn, shap, pandas
print(f'xgboost  : {xgboost.__version__}')
print(f'sklearn  : {sklearn.__version__}')
print(f'shap     : {shap.__version__}')
print(f'pandas   : {pandas.__version__}')
print('Dependencies ready.')

xgboost  : 3.2.0
sklearn  : 1.6.1
shap     : 0.50.0
pandas   : 2.2.2
Dependencies ready.


## Cell 2 — Upload kaggle.json

In [2]:
from google.colab import files
import os, json

print('Upload your kaggle.json when the file picker appears.')
uploaded = files.upload()

if 'kaggle.json' not in uploaded:
    raise Exception('kaggle.json not uploaded. Re-run this cell.')

os.makedirs(os.path.expanduser('~/.kaggle'), exist_ok=True)
with open(os.path.expanduser('~/.kaggle/kaggle.json'), 'w') as f:
    f.write(uploaded['kaggle.json'].decode('utf-8'))
os.chmod(os.path.expanduser('~/.kaggle/kaggle.json'), 0o600)

with open(os.path.expanduser('~/.kaggle/kaggle.json')) as f:
    creds = json.load(f)
print(f'Kaggle credentials configured for user: {creds["username"]}')

Upload your kaggle.json when the file picker appears.


Saving kaggle.json to kaggle.json
Kaggle credentials configured for user: datascienceindee


## Cell 3 — Download insurance claims dataset

In [3]:
import os

os.makedirs('/content/data', exist_ok=True)

print('Downloading vehicle claim fraud dataset...')
!kaggle datasets download -d shivamb/vehicle-claim-fraud-detection -p /content/data --unzip

# Find the CSV
csv_path = None
for root, dirs, files in os.walk('/content/data'):
    for f in files:
        if f.endswith('.csv'):
            csv_path = os.path.join(root, f)
            print(f'Found CSV: {csv_path}')

if csv_path is None:
    raise Exception('No CSV found after download. Check Kaggle credentials.')

import pandas as pd
df = pd.read_csv(csv_path)
print(f'\nRows    : {len(df)}')
print(f'Columns : {df.columns.tolist()}')
print(f'\nFraud column check:')
if 'fraud_reported' in df.columns:
    print(f'  fraud_reported found')
    print(f'  Fraud rate: {(df["fraud_reported"] == "Y").mean():.2%}')
else:
    print('  WARNING: fraud_reported column not found')
    print('  Available columns:', df.columns.tolist())

Downloading vehicle claim fraud dataset...
Dataset URL: https://www.kaggle.com/datasets/shivamb/vehicle-claim-fraud-detection
License(s): CC0-1.0
Downloading vehicle-claim-fraud-detection.zip to /content/data
  0% 0.00/348k [00:00<?, ?B/s]
100% 348k/348k [00:00<00:00, 697MB/s]
Found CSV: /content/data/fraud_oracle.csv

Rows    : 15420
Columns : ['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 'Sex', 'MaritalStatus', 'Age', 'Fault', 'PolicyType', 'VehicleCategory', 'VehiclePrice', 'FraudFound_P', 'PolicyNumber', 'RepNumber', 'Deductible', 'DriverRating', 'Days_Policy_Accident', 'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType', 'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'Year', 'BasePolicy']

Fraud column check:
  Available columns: ['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed', 'Mont

## Cell 4 — Feature engineering

In [4]:
import pandas as pd
import numpy as np

TIER1_CITIES = {
    'mumbai', 'delhi', 'bangalore', 'bengaluru', 'hyderabad',
    'chennai', 'kolkata', 'pune', 'ahmedabad'
}
TIER2_CITIES = {
    'jaipur', 'lucknow', 'surat', 'nagpur', 'indore',
    'bhopal', 'visakhapatnam', 'patna', 'vadodara'
}
FESTIVAL_MONTHS = {10, 11}


def get_city_tier(city):
    c = str(city).lower().strip()
    if c in TIER1_CITIES: return 1
    if c in TIER2_CITIES: return 2
    return 3


def get_hour_bin(hour):
    try:
        h = int(hour)
    except (ValueError, TypeError):
        return 2
    if 2 <= h <= 4:  return 0
    if h >= 22 or h <= 1: return 1
    return 2


def engineer_features(df):
    df = df.copy()

    if 'incident_city' in df.columns:
        df['city_tier'] = df['incident_city'].apply(get_city_tier)

    if 'incident_date' in df.columns:
        df['_idt'] = pd.to_datetime(df['incident_date'], errors='coerce')
        df['incident_month']    = df['_idt'].dt.month
        df['is_festival_season'] = df['incident_month'].isin(FESTIVAL_MONTHS).astype(int)
        df.drop(columns=['_idt'], inplace=True)

    if 'policy_bind_date' in df.columns and 'incident_date' in df.columns:
        bind_dt     = pd.to_datetime(df['policy_bind_date'], errors='coerce')
        incident_dt = pd.to_datetime(df['incident_date'],    errors='coerce')
        df['policy_age_days'] = (incident_dt - bind_dt).dt.days.fillna(365)

    if 'incident_hour_of_day' in df.columns:
        df['incident_hour_bin'] = df['incident_hour_of_day'].apply(get_hour_bin)

    if 'total_claim_amount' in df.columns and 'vehicle_claim' in df.columns:
        df['claim_to_value_ratio'] = (
            df['total_claim_amount'] / (df['vehicle_claim'] + 1)
        ).clip(0, 10)

    return df


df = engineer_features(df)
print('Feature engineering done.')
print(f'Columns now: {df.columns.tolist()}')

Feature engineering done.
Columns now: ['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 'Sex', 'MaritalStatus', 'Age', 'Fault', 'PolicyType', 'VehicleCategory', 'VehiclePrice', 'FraudFound_P', 'PolicyNumber', 'RepNumber', 'Deductible', 'DriverRating', 'Days_Policy_Accident', 'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType', 'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'Year', 'BasePolicy']


In [6]:
# Run this BEFORE Cell 5 — fixes the fraud column name

print("All columns:")
for col in df.columns:
    print(f"  {col}")

fraud_col_candidates = [
    'fraud_reported', 'fraud', 'FraudFound_P', 'FraudFound',
    'fraud_found', 'is_fraud', 'Fraud', 'FRAUD', 'target', 'label'
]

found = None
for candidate in fraud_col_candidates:
    if candidate in df.columns:
        found = candidate
        print(f"\nFound fraud column: '{found}'")
        print(df[found].value_counts())
        break

if found is None:
    print("\nNot found automatically. Checking binary columns...")
    for col in df.columns:
        if df[col].nunique() <= 3:
            print(f"  Possible: '{col}' — values: {df[col].unique()}")
else:
    if found != 'fraud_reported':
        df.rename(columns={found: 'fraud_reported'}, inplace=True)
        print(f"\nRenamed '{found}' to 'fraud_reported'")

    vals = df['fraud_reported'].unique()
    if set(vals).issubset({0, 1, '0', '1'}):
        df['fraud_reported'] = df['fraud_reported'].map(
            {1: 'Y', 0: 'N', '1': 'Y', '0': 'N'}
        )
        print("Converted 0/1 values to Y/N")

    print(f"\nFraud rate: {(df['fraud_reported'] == 'Y').mean():.2%}")
    print("Column ready. Now run Cell 5.")

All columns:
  Month
  WeekOfMonth
  DayOfWeek
  Make
  AccidentArea
  DayOfWeekClaimed
  MonthClaimed
  WeekOfMonthClaimed
  Sex
  MaritalStatus
  Age
  Fault
  PolicyType
  VehicleCategory
  VehiclePrice
  FraudFound_P
  PolicyNumber
  RepNumber
  Deductible
  DriverRating
  Days_Policy_Accident
  Days_Policy_Claim
  PastNumberOfClaims
  AgeOfVehicle
  AgeOfPolicyHolder
  PoliceReportFiled
  WitnessPresent
  AgentType
  NumberOfSuppliments
  AddressChange_Claim
  NumberOfCars
  Year
  BasePolicy

Found fraud column: 'FraudFound_P'
FraudFound_P
0    14497
1      923
Name: count, dtype: int64

Renamed 'FraudFound_P' to 'fraud_reported'
Converted 0/1 values to Y/N

Fraud rate: 5.99%
Column ready. Now run Cell 5.


## Cell 5 — Prepare features and labels

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# This dataset has different columns — map them correctly
# fraud column is 'label' (0=no fraud, 1=fraud)
print(f"Fraud rate: {df['label'].mean():.2%}")

# Encode all object columns to numeric
df_encoded = df.copy()
le = LabelEncoder()

object_cols = df_encoded.select_dtypes(include='object').columns.tolist()
print(f"\nEncoding {len(object_cols)} categorical columns:")
for col in object_cols:
    if col == 'fraud_reported':
        continue
    df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))
    print(f"  {col}")

# Feature columns — use everything except identifiers and target
DROP_COLS = ['PolicyNumber', 'label', 'fraud_reported']
FEATURE_COLS = [
    c for c in df_encoded.columns
    if c not in DROP_COLS
    and df_encoded[c].dtype in ['int64', 'float64']
]

print(f"\nUsing {len(FEATURE_COLS)} features:")
for f in FEATURE_COLS:
    print(f"  {f}")

X = df_encoded[FEATURE_COLS].fillna(0)
y = df['label']

print(f"\nX shape : {X.shape}")
print(f"y shape : {y.shape}")
print(f"Fraud   : {y.sum()} ({y.mean():.2%})")
print("Ready for SMOTE.")

Fraud rate: 5.99%

Encoding 25 categorical columns:
  Month
  DayOfWeek
  Make
  AccidentArea
  DayOfWeekClaimed
  MonthClaimed
  Sex
  MaritalStatus
  Fault
  PolicyType
  VehicleCategory
  VehiclePrice
  Days_Policy_Accident
  Days_Policy_Claim
  PastNumberOfClaims
  AgeOfVehicle
  AgeOfPolicyHolder
  PoliceReportFiled
  WitnessPresent
  AgentType
  NumberOfSuppliments
  AddressChange_Claim
  NumberOfCars
  BasePolicy

Using 31 features:
  Month
  WeekOfMonth
  DayOfWeek
  Make
  AccidentArea
  DayOfWeekClaimed
  MonthClaimed
  WeekOfMonthClaimed
  Sex
  MaritalStatus
  Age
  Fault
  PolicyType
  VehicleCategory
  VehiclePrice
  RepNumber
  Deductible
  DriverRating
  Days_Policy_Accident
  Days_Policy_Claim
  PastNumberOfClaims
  AgeOfVehicle
  AgeOfPolicyHolder
  PoliceReportFiled
  WitnessPresent
  AgentType
  NumberOfSuppliments
  AddressChange_Claim
  NumberOfCars
  Year
  BasePolicy

X shape : (15420, 31)
y shape : (15420,)
Fraud   : 923 (5.99%)
Ready for SMOTE.


In [14]:
# Run this BEFORE Cell 6 — cleans up X so SMOTE works

import numpy as np

# Drop any columns that are all NaN
X = X.dropna(axis=1, how='all')

# Convert everything to numeric, coerce strings to NaN then fill with 0
X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

# Drop any columns that are still object type
obj_cols = X.select_dtypes(include='object').columns.tolist()
if obj_cols:
    print(f'Dropping object columns: {obj_cols}')
    X = X.drop(columns=obj_cols)

# Update FEATURE_COLS to match what survived
FEATURE_COLS = X.columns.tolist()

print(f'X shape after cleanup: {X.shape}')
print(f'Dtypes: {X.dtypes.value_counts().to_dict()}')
print(f'Any NaN remaining: {X.isna().any().any()}')
print(f'Features: {FEATURE_COLS}')
print('Ready for SMOTE.')

X shape after cleanup: (15420, 31)
Dtypes: {dtype('int64'): 31}
Any NaN remaining: False
Features: ['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 'Sex', 'MaritalStatus', 'Age', 'Fault', 'PolicyType', 'VehicleCategory', 'VehiclePrice', 'RepNumber', 'Deductible', 'DriverRating', 'Days_Policy_Accident', 'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType', 'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'Year', 'BasePolicy']
Ready for SMOTE.


In [15]:
import pandas as pd
import numpy as np

print("X shape:", X.shape)
print("X columns:", X.columns.tolist())
print()

# Check original df for the candidate columns
print("Checking which candidate features exist in df:")
CANDIDATE_FEATURES = [
    'months_as_customer', 'age', 'policy_annual_premium',
    'umbrella_limit', 'capital-gains', 'capital-loss',
    'incident_hour_of_day', 'number_of_vehicles_involved',
    'bodily_injuries', 'witnesses',
    'injury_claim', 'property_claim', 'vehicle_claim', 'total_claim_amount',
    'city_tier', 'is_festival_season', 'policy_age_days',
    'incident_hour_bin', 'claim_to_value_ratio', 'incident_month'
]
for col in CANDIDATE_FEATURES:
    if col in df.columns:
        print(f"  EXISTS   {col} — dtype: {df[col].dtype} — sample: {df[col].iloc[0]}")
    else:
        print(f"  MISSING  {col}")

X shape: (15420, 31)
X columns: ['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 'Sex', 'MaritalStatus', 'Age', 'Fault', 'PolicyType', 'VehicleCategory', 'VehiclePrice', 'RepNumber', 'Deductible', 'DriverRating', 'Days_Policy_Accident', 'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType', 'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'Year', 'BasePolicy']

Checking which candidate features exist in df:
  MISSING  months_as_customer
  MISSING  age
  MISSING  policy_annual_premium
  MISSING  umbrella_limit
  MISSING  capital-gains
  MISSING  capital-loss
  MISSING  incident_hour_of_day
  MISSING  number_of_vehicles_involved
  MISSING  bodily_injuries
  MISSING  witnesses
  MISSING  injury_claim
  MISSING  property_claim
  MISSING  vehicle_claim
  MISSING  total_claim_amount
  MISSING  city_tier
  MISSING  is_festival_season


In [16]:
print("Actual columns in this dataset:")
for col in df.columns:
    print(f"  '{col}'  —  dtype: {df[col].dtype}  —  sample: {df[col].iloc[0]}")

Actual columns in this dataset:
  'Month'  —  dtype: object  —  sample: Dec
  'WeekOfMonth'  —  dtype: int64  —  sample: 5
  'DayOfWeek'  —  dtype: object  —  sample: Wednesday
  'Make'  —  dtype: object  —  sample: Honda
  'AccidentArea'  —  dtype: object  —  sample: Urban
  'DayOfWeekClaimed'  —  dtype: object  —  sample: Tuesday
  'MonthClaimed'  —  dtype: object  —  sample: Jan
  'WeekOfMonthClaimed'  —  dtype: int64  —  sample: 1
  'Sex'  —  dtype: object  —  sample: Female
  'MaritalStatus'  —  dtype: object  —  sample: Single
  'Age'  —  dtype: int64  —  sample: 21
  'Fault'  —  dtype: object  —  sample: Policy Holder
  'PolicyType'  —  dtype: object  —  sample: Sport - Liability
  'VehicleCategory'  —  dtype: object  —  sample: Sport
  'VehiclePrice'  —  dtype: object  —  sample: more than 69000
  'fraud_reported'  —  dtype: object  —  sample: N
  'PolicyNumber'  —  dtype: int64  —  sample: 1
  'RepNumber'  —  dtype: int64  —  sample: 12
  'Deductible'  —  dtype: int64  —  samp

## Cell 6 — Handle class imbalance with SMOTE

In [17]:
from imblearn.over_sampling import SMOTE

print(f'Before SMOTE — fraud: {y.sum()}, non-fraud: {(y==0).sum()}')

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

print(f'After SMOTE  — fraud: {y_res.sum()}, non-fraud: {(y_res==0).sum()}')
print(f'New fraud rate: {y_res.mean():.2%}')

Before SMOTE — fraud: 923, non-fraud: 14497
After SMOTE  — fraud: 14497, non-fraud: 14497
New fraud rate: 50.00%


## Cell 7 — Train XGBoost with cross-validation

In [18]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, cross_val_score

model = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='auc',
    random_state=42,
    n_jobs=-1
)

# Cross-validate on original (non-SMOTE) data for honest estimate
print('Running 5-fold cross-validation on original data...')
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
print(f'CV AUC: {scores.mean():.4f} +/- {scores.std():.4f}')

# Train final model on SMOTE-balanced data
print('\nTraining final model on SMOTE-balanced data...')
model.fit(X_res, y_res)
print('Training complete.')

Running 5-fold cross-validation on original data...
CV AUC: 0.8464 +/- 0.0079

Training final model on SMOTE-balanced data...
Training complete.


## Cell 8 — Save model as .pkl

In [19]:
import joblib, os

SAVE_PATH = '/content/xgb_fraud_model.pkl'

# Save model AND feature column names together
# The local predict.py needs both to run inference
artifact = {
    'model':        model,
    'feature_cols': FEATURE_COLS
}

joblib.dump(artifact, SAVE_PATH)

size_mb = os.path.getsize(SAVE_PATH) / 1024 / 1024
print(f'Saved to  : {SAVE_PATH}')
print(f'Size      : {size_mb:.2f} MB')
print(f'Features  : {FEATURE_COLS}')

Saved to  : /content/xgb_fraud_model.pkl
Size      : 0.98 MB
Features  : ['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 'Sex', 'MaritalStatus', 'Age', 'Fault', 'PolicyType', 'VehicleCategory', 'VehiclePrice', 'RepNumber', 'Deductible', 'DriverRating', 'Days_Policy_Accident', 'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType', 'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'Year', 'BasePolicy']


## Cell 9 — Verify the saved model works

In [20]:
import joblib, pandas as pd

loaded = joblib.load(SAVE_PATH)
loaded_model = loaded['model']
loaded_cols  = loaded['feature_cols']

# Test with one row
test_row = X.iloc[[0]][loaded_cols].fillna(0)
prob = float(loaded_model.predict_proba(test_row)[0][1])

print(f'Test prediction: fraud_probability = {prob:.4f}')
assert 0.0 <= prob <= 1.0
print('Model verified. .pkl file is working correctly.')

Test prediction: fraud_probability = 0.0041
Model verified. .pkl file is working correctly.


## Cell 10 — Save to Google Drive and download to computer

In [21]:
import shutil, os
from google.colab import drive, files

# Save to Google Drive as backup
drive.mount('/content/drive')
os.makedirs('/content/drive/MyDrive/VeriClaim', exist_ok=True)
drive_dest = '/content/drive/MyDrive/VeriClaim/xgb_fraud_model.pkl'
shutil.copy2(SAVE_PATH, drive_dest)
print(f'Saved to Google Drive: {drive_dest}')

# Download directly to computer
print('\nStarting download to your computer...')
print('Check your browser for the download prompt.')
files.download(SAVE_PATH)

print()
print('AFTER DOWNLOAD:')
print('Place xgb_fraud_model.pkl at:')
print('  VeriClaim/models/fraud_classifier/xgb_fraud_model.pkl')
print()
print(f'CV AUC achieved: {scores.mean():.4f}')
print('XGBoost training notebook complete.')

Mounted at /content/drive
Saved to Google Drive: /content/drive/MyDrive/VeriClaim/xgb_fraud_model.pkl

Starting download to your computer...
Check your browser for the download prompt.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


AFTER DOWNLOAD:
Place xgb_fraud_model.pkl at:
  VeriClaim/models/fraud_classifier/xgb_fraud_model.pkl

CV AUC achieved: 0.8464
XGBoost training notebook complete.
