<a href="https://colab.research.google.com/github/Luc1lfer/CCADMACL_EXERCISES_COM222ML-/blob/main/Exercise2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Exercise 2: Use Gradient Boost for Regression

Instructions:

- Use the Dataset File to train your model
- Use the Test File to generate your results
- Use the Sample Submission file to generate the same format

Submit your results to:
https://www.kaggle.com/competitions/playground-series-s4e12/overview



In [182]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

## Dataset
The train, test, and sample submission files can be found at this link:
https://www.kaggle.com/competitions/playground-series-s4e12/data

## 1. Load the Data

In [183]:
import pandas as pd

# Read the competition CSVs from the working directory; change the file names
# below if the train/test files live somewhere else.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

# Preview the first rows of the training frame to see its columns and values.
train_data.head()


Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


## 2. Perform Data preprocessing

In [184]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# 'Premium Amount' is the regression target; every other column is a feature.
y = train_data['Premium Amount']
X = train_data.drop(columns=['Premium Amount'])

# Object-dtype columns are treated as categorical and replaced in place by
# ordinal codes. Categories not seen during fit are mapped to -1 at transform
# time.
categorical_columns = X.select_dtypes(include=['object']).columns
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X[categorical_columns] = encoder.fit_transform(X[categorical_columns])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## 3. Create a Pipeline

In [185]:
# Build a preprocessing + XGBoost pipeline.
#
# `X` was ordinal-encoded in place during preprocessing, so by now every
# column has a numeric dtype. Selecting numeric columns by dtype would
# therefore also return the categorical columns and route them through BOTH
# transformers, duplicating those features. Derive the numeric columns by
# excluding the known categorical ones so each column goes to exactly one
# transformer.
categorical_column_set = set(categorical_columns)
numeric_columns = [col for col in X.columns if col not in categorical_column_set]

preprocessor = ColumnTransformer(
    transformers=[
        # Standardise the numeric features.
        ('num', StandardScaler(), numeric_columns),
        # Ordinal-encode the categorical features; unseen categories become -1.
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_columns),
    ])

# Full pipeline: preprocessing followed by an XGBoost regressor.
# The pipeline is only constructed here; it is not fitted in this cell.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
])


## 4. Train the Model

In [186]:
# Hand-picked hyperparameters (adjust based on validation results).
params = {
    'n_estimators': 250,  # Number of boosting rounds; raised to pair with the low learning rate
    'learning_rate': 0.05,  # Shrinkage applied to each tree's contribution
    'max_depth': 4,  # Maximum tree depth; shallow trees limit model complexity
    'subsample': 0.85,  # Fraction of training rows sampled for each tree
    'colsample_bytree': 0.85,  # Fraction of feature columns sampled for each tree
    'gamma': 0.5,  # Minimum loss reduction required to make a further split
    'reg_alpha': 0.1,  # L1 regularization on leaf weights
    'reg_lambda': 0.1,  # L2 regularization on leaf weights
    'objective': 'reg:squarederror',  # Squared-error regression objective
    'random_state': 42  # For reproducibility
}

# Initialize the XGBRegressor with the defined parameters
xgb_model = xgb.XGBRegressor(**params)

# Fit the model on the training split
xgb_model.fit(X_train, y_train)

# Predict on the validation split
y_val_pred = xgb_model.predict(X_val)

# Mean Squared Error on the raw predictions
mse = mean_squared_error(y_val, y_val_pred)
print(f'Validation MSE: {mse}')

# Root Mean Squared Logarithmic Error.
# mean_squared_log_error raises ValueError on negative inputs, and a
# squared-error objective does not constrain predictions to be non-negative,
# so clamp at 0 for this metric only (a no-op when no prediction is negative).
rmsle = np.sqrt(mean_squared_log_error(y_val, np.clip(y_val_pred, 0, None)))
print(f'Validation RMSLE: {rmsle}')


Validation MSE: 710794.9051443897
Validation RMSLE: 1.1450616895764358


## 5. Evaluate the Model

In [187]:
# Make predictions on the validation set
y_val_pred = xgb_model.predict(X_val)

# Mean Squared Error (MSE)
mse = mean_squared_error(y_val, y_val_pred)
print(f'Validation MSE: {mse}')

# Root Mean Squared Error (RMSE), in the same units as the target
rmse = np.sqrt(mse)
print(f'Validation RMSE: {rmse}')

# Root Mean Squared Logarithmic Error (RMSLE).
# mean_squared_log_error raises ValueError on negative inputs, so clamp the
# predictions at 0 for this metric only; the clamp changes nothing when every
# prediction is already non-negative.
rmsle = np.sqrt(mean_squared_log_error(y_val, np.clip(y_val_pred, 0, None)))
print(f'Validation RMSLE: {rmsle}')


Validation MSE: 710794.9051443897
Validation RMSE: 843.0865347901066
Validation RMSLE: 1.1450616895764358


## Generate Submission File

Choose the model that has the best performance to generate a submission file.

In [188]:
# Preview the first five rows of the raw test frame.
test_data.head(n=5)


Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,1200000,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1,1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
2,1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
3,1200003,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
4,1200004,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House


In [189]:

# Encode any object-dtype (categorical) columns that are still present in X.
#
# The preprocessing step already ordinal-encodes X in place, so normally no
# object columns remain here. Re-fitting unconditionally would then overwrite
# `categorical_columns` with an empty index and `encoder` with an encoder
# fitted on no columns, so only encode when there is something left to encode.
object_columns = X.select_dtypes(include=['object']).columns

if len(object_columns) > 0:
    categorical_columns = object_columns
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    X[categorical_columns] = encoder.fit_transform(X[categorical_columns])


In [190]:
# Build the encoded test feature frame.
# Work on a copy so the raw `test_data` frame stays untouched (its 'id' column
# is needed later for the submission file).
X_test = test_data.copy()

# Object-dtype columns of the test frame are the categorical features.
categorical_columns = X_test.select_dtypes(include=['object']).columns

# Fit the encoder on the full training features so the category -> code mapping
# is learned from the training data, then apply that mapping to the test frame.
# Categories that never appear in training are encoded as -1.
# A separate local frame is used for fitting so the X_train/X_val split from
# the preprocessing step is not overwritten.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train_features = train_data.drop(columns=['Premium Amount'])
encoder.fit(train_features[categorical_columns])

X_test[categorical_columns] = encoder.transform(X_test[categorical_columns])

# Select the test columns in the same order as the training feature columns.
X_test_cleaned = X_test[train_features.columns]


In [192]:
# Predict on the test features whose columns were aligned to the training
# feature order (X_test_cleaned), rather than on the unaligned frame.
y_test_pred = xgb_model.predict(X_test_cleaned)

# A squared-error objective does not constrain predictions to be non-negative;
# clamp at 0 so a log-based metric such as RMSLE stays defined for every row.
y_test_pred = np.clip(y_test_pred, 0, None)

# Sanity check: exactly one prediction per test row.
assert len(y_test_pred) == len(test_data), "prediction count does not match test rows"

# Create a submission DataFrame
submission_df = pd.DataFrame({
    'id': test_data['id'],
    'Premium Amount': y_test_pred
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission_file.csv', index=False)
print("Submission file created: submission_file.csv")

Submission file created: submission_file.csv
