In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import warnings 
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

# Load Data

In [4]:
train = pd.read_csv(r"C:\Users\JeremiasRyser\OneDrive - beyondBIM\Desktop\constructor\final project\data\train.csv")
test = pd.read_csv(r"C:\Users\JeremiasRyser\OneDrive - beyondBIM\Desktop\constructor\final project\data\test.csv")
print(f'Train: {train.shape}, Test: {test.shape}')

Train: (11504798, 12), Test: (7669866, 11)


In [5]:
test_id = test['id']

In [6]:
train.drop(['id'], axis=1, inplace=True)
test.drop(['id'], axis=1, inplace=True)

In [7]:
train.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


In [8]:
test.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,Female,20,1,47.0,0,< 1 Year,No,2630.0,160.0,228
1,Male,47,1,28.0,0,1-2 Year,Yes,37483.0,124.0,123
2,Male,47,1,43.0,0,1-2 Year,Yes,2630.0,26.0,271
3,Female,22,1,47.0,1,< 1 Year,No,24502.0,152.0,115
4,Male,51,1,19.0,0,1-2 Year,No,34115.0,124.0,148


# Modify  Dtype

In [9]:
def converting_datatypes(df):
    df = df.copy()
    try:
        # Converting data types
        df['Gender'] = df['Gender'].astype('category')
        df['Vehicle_Age'] = df['Vehicle_Age'].astype('category')
        df['Vehicle_Damage'] = df['Vehicle_Damage'].astype('category')
        df['Age'] = df['Age'].astype('int8')
        df['Driving_License'] = df['Driving_License'].astype('int8')
        df['Region_Code'] = df['Region_Code'].astype('int8')
        df['Previously_Insured'] = df['Previously_Insured'].astype('int8')
        df['Annual_Premium'] = df['Annual_Premium'].astype('int32')
        df['Policy_Sales_Channel'] = df['Policy_Sales_Channel'].astype('int16')
        df['Vintage'] = df['Vintage'].astype('int16')
        df['Response'] = df['Response'].astype('int8')
        print(df.info(memory_usage='deep'))
    except KeyError as e:
        print(f"Error: {e} not found in DataFrame")
    except Exception as e:
        print(f"An error occurred: {e}")
    return df

In [10]:
train = converting_datatypes(train)
test = converting_datatypes(test)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 11 columns):
 #   Column                Dtype   
---  ------                -----   
 0   Gender                category
 1   Age                   int8    
 2   Driving_License       int8    
 3   Region_Code           int8    
 4   Previously_Insured    int8    
 5   Vehicle_Age           category
 6   Vehicle_Damage        category
 7   Annual_Premium        int32   
 8   Policy_Sales_Channel  int16   
 9   Vintage               int16   
 10  Response              int8    
dtypes: category(3), int16(2), int32(1), int8(5)
memory usage: 175.6 MB
None
Error: 'Response' not found in DataFrame


# Define and  Transform Features

In [11]:
# Define categorical columns to be one-hot encoded
categorical_columns = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']

# Apply one-hot encoding directly with pandas
train_encoded = pd.get_dummies(train, columns=categorical_columns, drop_first=True, dtype=int) 


In [12]:
# Separate features (X) and target variable (y)
X = train_encoded.loc[:, train_encoded.columns != "Response"]
y = train_encoded['Response']

In [13]:
# Standardize the features using StandardScaler
scaler = StandardScaler()

# Fit the scaler to the data
scaler.fit(X)

# Transform the data
X_scaled = scaler.transform(X)

In [14]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=101)

In [17]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.0-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 435.7 kB/s eta 0:04:47
   ---------------------------------------- 0.1/124.9 MB 880.9 kB/s eta 0:02:22
   ---------------------------------------- 0.3/124.9 MB 2.1 MB/s eta 0:01:00
   ---------------------------------------- 0.6/124.9 MB 2.9 MB/s eta 0:00:44
   ---------------------------------------- 0.9/124.9 MB 3.5 MB/s eta 0:00:36
   ---------------------------------------- 1.2/124.9 MB 3.9 MB/s eta 0:00:32
   ---------------------------------------- 1.5/124.9 MB 4.5 MB/s eta 0:00:28
    --------------------------------------- 1.8/124.9 MB 4.9 MB/s eta 0:00:26
    --------------------------------------- 2.2/124.9 MB 5.2 MB/s eta 0:00:24
    

In [18]:
from xgboost import XGBClassifier


In [23]:
xgb_params = {    
        'max_depth': 13, 
        'min_child_weight': 5,
        'learning_rate': 0.02,
        'colsample_bytree': 0.6,         
        'max_bin': 3000, 
        'n_estimators': 1500 
}

# Calculate the ratio of negative class to positive class
ratio = float(y_train.value_counts()[0]) / y_train.value_counts()[1]

# Initialize the XGBoost classifier with specified hyperparameters
model = XGBClassifier(**xgb_params, scale_pos_weight=ratio)

# Fit the classifier to the training data
XGB_model = model.fit(X_train, y_train)

# Make predictions on the test data
predictions = XGB_model.predict_proba(X_test)[:,1]

# Print the validation area under the curve
print("Validation Area Under the Curve (AUC): ", roc_auc_score(y_test, predictions))

Validation Area Under the Curve (AUC):  0.8853438162061094


it took 24 min

# Prediction

In [24]:
# Apply one-hot encoding directly with pandas
test_encoded = pd.get_dummies(test, columns=categorical_columns, drop_first=True, dtype=int)  

# Transform the `test` data using the same scaler as `train`
test_scaled = scaler.transform(test_encoded)

# Make predictions on the test data
predictions_test = XGB_model.predict_proba(test_scaled)[:,1]

In [25]:
predictions_test.shape

(7669866,)

In [26]:
new_data = pd.DataFrame()

In [27]:
new_data['id'] = test_id
new_data['Response'] = predictions_test

In [28]:
new_data.head()

Unnamed: 0,id,Response
0,11504798,0.024593
1,11504799,0.873064
2,11504800,0.691927
3,11504801,0.000401
4,11504802,0.347007


In [29]:
new_data.to_csv("xgb.csv", index=False)