# Data Preprocessing

To access claim data via an API in a production-like scenario, you’d typically:

 - Authenticate with the API (via API key or OAuth).

 - Send a GET or POST request to fetch claim data.

 - Transform and use the data in your model.

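Below is a Python example that simulates this workflow. In the cells shown here no request is actually sent (e.g. with `requests`): the insurance claim data is read from a local CSV file (`../data/insurance_claims.csv`) that stands in for the API response.

For illustration, the local file acts as the placeholder for the API—you can plug in the real endpoint and auth method when available.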

In [1]:
# Install required packages
# (setup step, left commented out: uncomment the %pip line below to install the
#  dependencies listed in ./requirements.txt into the running kernel's environment)

# %pip install -r ./requirements.txt

In [2]:
import os

# os.listdir() returns entries in arbitrary, platform-dependent order; sort
# case-insensitively so the printed listing is reproducible from run to run.
print(sorted(os.listdir('../data'), key=str.lower))

['insurance_claims.csv', 'preprocessor.pkl', 'text_data.csv', 'X_structured.csv', 'y.csv']


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the raw insurance-claims table into `df`; the cells below work from this frame.
df = pd.read_csv(filepath_or_buffer='../data/insurance_claims.csv')


In [4]:
# Create a binary fraud label ('Y' -> 1, 'N' -> 0).
# The identity entries (1 -> 1, 0 -> 0) keep this cell safe to re-run: without
# them a second execution would map the already-encoded values to NaN.
df['fraud_reported'] = df['fraud_reported'].map({'Y': 1, 'N': 0, 1: 1, 0: 0})

In [5]:
# Preview the first five rows. A bare last expression gets the notebook's rich
# table rendering instead of the plain-text dump that print() produces.
df.head()

   months_as_customer  age  policy_number policy_bind_date policy_state  \
0                 328   48         521585       2014-10-17           OH   
1                 228   42         342868       2006-06-27           IN   
2                 134   29         687698       2000-09-06           OH   
3                 256   41         227811       1990-05-25           IL   
4                 228   44         367455       2014-06-06           IL   

  policy_csl  policy_deductable  policy_annual_premium  umbrella_limit  \
0    250/500               1000                1406.91               0   
1    250/500               2000                1197.22         5000000   
2    100/300               2000                1413.14         5000000   
3    250/500               2000                1415.74         6000000   
4   500/1000               1000                1583.91         6000000   

   insured_zip  ... police_report_available total_claim_amount injury_claim  \
0       466132  ...      

In [6]:
# Column types and nulls. DataFrame.info() writes its report itself and returns
# None, so wrapping it in print() only appended a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 40 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   months_as_customer           1000 non-null   int64  
 1   age                          1000 non-null   int64  
 2   policy_number                1000 non-null   int64  
 3   policy_bind_date             1000 non-null   object 
 4   policy_state                 1000 non-null   object 
 5   policy_csl                   1000 non-null   object 
 6   policy_deductable            1000 non-null   int64  
 7   policy_annual_premium        1000 non-null   float64
 8   umbrella_limit               1000 non-null   int64  
 9   insured_zip                  1000 non-null   int64  
 10  insured_sex                  1000 non-null   object 
 11  insured_education_level      1000 non-null   object 
 12  insured_occupation           1000 non-null   object 
 13  insured_hobbies    

In [7]:
# Summary stats, shown through the notebook's rich display rather than print()
df.describe()

       months_as_customer          age  policy_number  policy_deductable  \
count         1000.000000  1000.000000    1000.000000        1000.000000   
mean           203.954000    38.948000  546238.648000        1136.000000   
std            115.113174     9.140287  257063.005276         611.864673   
min              0.000000    19.000000  100804.000000         500.000000   
25%            115.750000    32.000000  335980.250000         500.000000   
50%            199.500000    38.000000  533135.000000        1000.000000   
75%            276.250000    44.000000  759099.750000        2000.000000   
max            479.000000    64.000000  999435.000000        2000.000000   

       policy_annual_premium  umbrella_limit    insured_zip  capital-gains  \
count            1000.000000    1.000000e+03    1000.000000    1000.000000   
mean             1256.406150    1.101000e+06  501214.488000   25126.100000   
std               244.167395    2.297407e+06   71701.610941   27872.187708   
min

In [8]:
# Count the missing entries in each column (sum of the null mask down the rows)
print(df.isnull().sum(axis=0))

months_as_customer                0
age                               0
policy_number                     0
policy_bind_date                  0
policy_state                      0
policy_csl                        0
policy_deductable                 0
policy_annual_premium             0
umbrella_limit                    0
insured_zip                       0
insured_sex                       0
insured_education_level           0
insured_occupation                0
insured_hobbies                   0
insured_relationship              0
capital-gains                     0
capital-loss                      0
incident_date                     0
incident_type                     0
collision_type                    0
incident_severity                 0
authorities_contacted            91
incident_state                    0
incident_city                     0
incident_location                 0
incident_hour_of_the_day          0
number_of_vehicles_involved       0
property_damage             

In [9]:
# Simulate a free-text field of the form
# "<incident_type> reported on <incident_date>. <incident_severity>"
df['claim_description'] = (
    df['incident_type']
    .add(' reported on ')
    .add(df['incident_date'])
    .add('. ')
    .add(df['incident_severity'])
)

In [10]:
# Split the frame into the three pieces that are saved at the end of the notebook
y = df['fraud_reported']               # binary target
text_data = df['claim_description']    # simulated free-text field
# Structured features: every column except the target, the text field and '_c39'
X_structured = df.drop(['fraud_reported', 'claim_description', '_c39'], axis=1)

In [11]:
# Show the first five rows again, now including the claim_description column
display(df.head(n=5))

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported,_c39,claim_description
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,71610,6510,13020,52080,Saab,92x,2004,1,,Single Vehicle Collision reported on 2015-01-2...
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,5070,780,780,3510,Mercedes,E400,2007,1,,Vehicle Theft reported on 2015-01-21. Minor Da...
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,34650,7700,3850,23100,Dodge,RAM,2007,0,,Multi-vehicle Collision reported on 2015-02-22...
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,63400,6340,6340,50720,Chevrolet,Tahoe,2014,1,,Single Vehicle Collision reported on 2015-01-1...
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,6500,1300,650,4550,Accura,RSX,2009,0,,Vehicle Theft reported on 2015-02-17. Minor Da...


In [12]:
# Persist the three splits for later use (written without the index column)
for frame, destination in (
    (X_structured, '../data/X_structured.csv'),  # structured data
    (y, '../data/y.csv'),                        # target labels
    (text_data, '../data/text_data.csv'),        # text data
):
    frame.to_csv(destination, index=False)