In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [12]:
# Load the dataset
file_path = 'Automobile_insurance_fraud.csv'
df = pd.read_csv(file_path)

In [13]:
# Display the first few rows of the dataset
print("Dataset loaded successfully.")
print(df.head())

Dataset loaded successfully.
   months_as_customer  age  policy_number policy_bind_date policy_state  \
0                 328   48         521585       17-10-2014           OH   
1                 228   42         342868       27-06-2006           IN   
2                 134   29         687698       06-09-2000           OH   
3                 256   41         227811       25-05-1990           IL   
4                 228   44         367455       06-06-2014           IL   

  policy_csl  policy_deductable  policy_annual_premium  umbrella_limit  \
0    250/500               1000                1406.91               0   
1    250/500               2000                1197.22         5000000   
2    100/300               2000                1413.14         5000000   
3    250/500               2000                1415.74         6000000   
4   500/1000               1000                1583.91         6000000   

   insured_zip  ... police_report_available total_claim_amount injury_claim

In [14]:
# Drop the colum '_c39' as it appears to be irrelevant
df = df.drop(columns=['_c39'], errors='ignore')

# Check for missing values
print("Missing values in each column:\n", df.isnull().sum())

# Fill missing values for categorical columns with mode and numeric columns with median

df.fillna({
    'police_report_available': 'UNKNOWN','total_claim_amount': df['total_claim_amount'].median(),
}, inplace=True)

# Convert categorical columns to numerical using label Encoding
label_encoders = {}
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if col != 'fraud_reported': 
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le
        
# Separate features and target variable
X = df.drop(columns=['fraud_reported'])
y = df['fraud_reported']

# Ensure that feature and target variables are correctly selected
print("Features shape:", X.shape)
print("Target shape:", y.shape)
print("Features head:\n", X.head())
print("Target head:\n", y.head())


Missing values in each column:
 months_as_customer              0
age                             0
policy_number                   0
policy_bind_date                0
policy_state                    0
policy_csl                      0
policy_deductable               0
policy_annual_premium           0
umbrella_limit                  0
insured_zip                     0
insured_sex                     0
insured_education_level         0
insured_occupation              0
insured_hobbies                 0
insured_relationship            0
capital-gains                   0
capital-loss                    0
incident_date                   0
incident_type                   0
collision_type                  0
incident_severity               0
authorities_contacted          91
incident_state                  0
incident_city                   0
incident_location               0
incident_hour_of_the_day        0
number_of_vehicles_involved     0
property_damage                 0
bodily_injuries 

In [15]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[200  20]
 [ 58  22]]
Classification Report:
               precision    recall  f1-score   support

           N       0.78      0.91      0.84       220
           Y       0.52      0.28      0.36        80

    accuracy                           0.74       300
   macro avg       0.65      0.59      0.60       300
weighted avg       0.71      0.74      0.71       300

