In [2]:
import pandas as pd

# load with low_memory=False to avoid dtype warning
df = pd.read_csv("dataset.csv", low_memory=False)
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30100 entries, 0 to 30099
Data columns (total 44 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   policy_id                         30100 non-null  object 
 1   policy_tenure                     30100 non-null  float64
 2   age_of_car                        30100 non-null  float64
 3   age_of_policyholder               30100 non-null  float64
 4   area_cluster                      30100 non-null  object 
 5   population_density                30100 non-null  object 
 6   make                              30100 non-null  int64  
 7   is_claim                          30100 non-null  int64  
 8   segment                           30100 non-null  object 
 9   model                             30100 non-null  object 
 10  fuel_type                         30100 non-null  object 
 11  max_torque                        30100 non-null  object 
 12  max_

In [3]:
# Suppose column 'age_of_car' is mixed
df['age_of_car'] = pd.to_numeric(df['age_of_car'], errors='coerce')


In [5]:
numeric_cols = df.select_dtypes(include=['int64','float64']).columns.tolist()

for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())



In [6]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])


In [7]:
binary_cols = [col for col in df.columns if df[col].nunique()==2 and df[col].dtype=='object']
for col in binary_cols:
    df[col] = df[col].map({'Yes':1,'No':0})


In [8]:
X = df.drop('is_claim', axis=1)
y = df['is_claim']

X = pd.get_dummies(X)
dummy_columns = X.columns.tolist()


In [10]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_train, y_train = rus.fit_resample(X_train, y_train)



In [11]:
num_cols = X_train.select_dtypes(include=['int64','float64']).columns.tolist()
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])



  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [12]:
# ===============================
# 9. TRAIN MODEL
# ===============================
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)

# ===============================
# 10. EVALUATE MODEL
# ===============================
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("Classification Report:\n", classification_report(y_test, y_pred))

# ===============================
# 11. SAVE MODEL & PREPROCESSING OBJECTS
# ===============================
joblib.dump(model, "model.pkl")
joblib.dump(dummy_columns, "dummy_columns.pkl")
joblib.dump(num_cols, "num_cols.pkl")
joblib.dump(scaler, "scaler.pkl")

print("\n✅ Model & preprocessing objects saved successfully!")

Accuracy: 0.531063122923588
ROC AUC: 0.5704431918717041
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.53      0.68      5634
           1       0.07      0.54      0.13       386

    accuracy                           0.53      6020
   macro avg       0.51      0.54      0.40      6020
weighted avg       0.89      0.53      0.64      6020


✅ Model & preprocessing objects saved successfully!
