1. Importing Required Libraries

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# ML preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# ML models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib


 2. Loading the Dataset

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train.shape, test.shape

((1200000, 21), (800000, 20))

3. Exploratory Data Analysis (EDA)

In [3]:
train.head()
train.info()
train.describe()

# Check missing values
train.isnull().sum().sort_values(ascending=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  int64  
 1   Age                   1181295 non-null  float64
 2   Gender                1200000 non-null  object 
 3   Annual Income         1155051 non-null  float64
 4   Marital Status        1181471 non-null  object 
 5   Number of Dependents  1090328 non-null  float64
 6   Education Level       1200000 non-null  object 
 7   Occupation            841925 non-null   object 
 8   Health Score          1125924 non-null  float64
 9   Location              1200000 non-null  object 
 10  Policy Type           1200000 non-null  object 
 11  Previous Claims       835971 non-null   float64
 12  Vehicle Age           1199994 non-null  float64
 13  Credit Score          1062118 non-null  float64
 14  Insurance Duration    1199999 non-

Unnamed: 0,0
Previous Claims,364029
Occupation,358075
Credit Score,137882
Number of Dependents,109672
Customer Feedback,77824
Health Score,74076
Annual Income,44949
Age,18705
Marital Status,18529
Vehicle Age,6


4. Data Cleaning & Preprocessing

In [None]:
#Drop Unnecessary Columns

In [4]:
drop_cols = ['Customer Feedback', 'Policy Start Date']
train.drop(columns=drop_cols, inplace=True, errors='ignore')
test.drop(columns=drop_cols, inplace=True, errors='ignore')

In [None]:
#Train-Test Split

In [5]:
X = train.drop('Premium Amount', axis=1)
y = train['Premium Amount']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#Handling Missing Values

In [6]:
# Numeric columns
for col in X_train.select_dtypes(include='number').columns:
    median = X_train[col].median()
    X_train[col].fillna(median, inplace=True)
    X_valid[col].fillna(median, inplace=True)
    test[col].fillna(median, inplace=True)

# Categorical columns
for col in X_train.select_dtypes(include='object').columns:
    mode = X_train[col].mode()[0]
    X_train[col].fillna(mode, inplace=True)
    X_valid[col].fillna(mode, inplace=True)
    test[col].fillna(mode, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_valid[col].fillna(median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always be

In [None]:
# Label Encoding

In [7]:
le = LabelEncoder()
for col in X_train.select_dtypes(include='object').columns:
    X_train[col] = le.fit_transform(X_train[col])
    X_valid[col] = le.transform(X_valid[col])
    test[col] = le.transform(test[col])


In [None]:
#Feature Scaling

In [8]:
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_valid[num_cols] = scaler.transform(X_valid[num_cols])
test[num_cols] = scaler.transform(test[num_cols])


 5. Model Training & Evaluation

In [None]:
#Evaluation Function

In [9]:
def evaluate_model(model, X_valid, y_valid, model_name="Model"):
    y_pred = model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    mae = mean_absolute_error(y_valid, y_pred)
    r2 = r2_score(y_valid, y_pred)

    print(f" {model_name} Performance:")
    print(f" RMSE: {rmse:.2f}")
    print(f" MAE: {mae:.2f}")
    print(f" R² Score: {r2:.4f}")
    print("-" * 40)

In [10]:
#Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
evaluate_model(lr, X_valid, y_valid, "Linear Regression")

📊 Linear Regression Performance:
🔸 RMSE: 863.34
🔸 MAE: 667.33
🔸 R² Score: 0.0026
----------------------------------------


In [11]:
#Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
evaluate_model(rf, X_valid, y_valid, "Random Forest")

📊 Random Forest Performance:
🔸 RMSE: 858.13
🔸 MAE: 662.51
🔸 R² Score: 0.0146
----------------------------------------


In [12]:
#XGBoost
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1)
xgb.fit(X_train, y_train)
evaluate_model(xgb, X_valid, y_valid, "XGBoost")

📊 XGBoost Performance:
🔸 RMSE: 847.99
🔸 MAE: 648.14
🔸 R² Score: 0.0377
----------------------------------------


6. Final Model Saving

In [13]:
joblib.dump(xgb, 'xgboost_model.pkl')
print(" Model saved as 'xgboost_model.pkl'")

✅ Model saved as 'xgboost_model.pkl'


###  Conclusion:
- XGBoost performed the best with the lowest RMSE and highest R².
- This model will be used for deployment via a Streamlit app.
- Further improvement possible with feature engineering and hyperparameter tuning.