Import necessary libraries

In [26]:
import pandas as pd
import numpy as np
from joblib import dump

from xgboost import XGBRegressor
from xgboost import plot_importance

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

Import the dataset

In [27]:
# Load the raw CSV (path is relative to the notebook's working directory)
# into the DataFrame every later cell reads from and modifies.
Dataset = pd.read_csv('../Dataset/dataset.csv')

In [28]:
# Print column names, non-null counts and dtypes of the freshly loaded frame.
Dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10108 entries, 0 to 10107
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Client_Num               10108 non-null  int64 
 1   Customer_Age             10108 non-null  int64 
 2   Gender                   10108 non-null  object
 3   Dependent_Count          10108 non-null  int64 
 4   Education_Level          10108 non-null  object
 5   Marital_Status           10108 non-null  object
 6   state_cd                 10108 non-null  object
 7   Zipcode                  10108 non-null  int64 
 8   Car_Owner                10108 non-null  object
 9   House_Owner              10108 non-null  object
 10  Personal_loan            10108 non-null  object
 11  contact                  10108 non-null  object
 12  Customer_Job             10108 non-null  object
 13  Income                   10108 non-null  int64 
 14  Cust_Satisfaction_Score  10108 non-nul

Checking each column for unknown values

In [29]:
# For every column, count the cells whose text form equals 'unknown'
# (case-insensitive). Values are stringified first so numeric columns can be
# scanned with the same expression.
hits_per_column = {
    name: Dataset[name].astype(str).str.lower().eq('unknown').sum()
    for name in Dataset.columns
}

# Keep only the columns that actually contain at least one 'unknown';
# later cells iterate over the keys of this dict.
unknown_columns = {name: hits for name, hits in hits_per_column.items() if hits > 0}

for name, hits in unknown_columns.items():
    print(f"Column '{name}' has {hits} 'Unknown' values.")

Column 'Education_Level' has 1515 'Unknown' values.
Column 'Marital_Status' has 744 'Unknown' values.
Column 'contact' has 1947 'Unknown' values.


Replace the unknown values with proper values

In [30]:
# Turn every 'unknown' entry (any casing) in the affected columns into NaN.
# NOTE: the whole column is converted to str and lowercased first, so all
# remaining labels in these columns end up lowercase as well.
for column in unknown_columns:
    lowered = Dataset[column].astype(str).str.lower()
    Dataset[column] = lowered.replace('unknown', np.nan)

In [31]:
# Impute the NaNs introduced above with each column's most frequent value.
for column in unknown_columns:
    series = Dataset[column]
    if series.isnull().any():
        # mode() may return several values on a tie; the first one is used.
        most_frequent = series.mode()[0]
        Dataset[column] = series.fillna(most_frequent)

In [32]:
# Per-column count of missing values.
print(Dataset.isna().sum())

Client_Num                 0
Customer_Age               0
Gender                     0
Dependent_Count            0
Education_Level            0
Marital_Status             0
state_cd                   0
Zipcode                    0
Car_Owner                  0
House_Owner                0
Personal_loan              0
contact                    0
Customer_Job               0
Income                     0
Cust_Satisfaction_Score    0
dtype: int64


Split the dataset into features (X) and target (y)

In [33]:
# Target: the Income column.
y = Dataset['Income']

# Features: everything except the columns listed here (the target itself
# is among them).
X = Dataset.drop(
    ['Client_Num', 'state_cd', 'Zipcode', 'Income', 'Cust_Satisfaction_Score'],
    axis=1,
)

In [34]:
# Names of the feature columns whose dtype is int64 or float64.
numerical_cols = list(
    X.select_dtypes(include=['int64', 'float64']).columns
)

numerical_cols

['Customer_Age', 'Dependent_Count']

In [35]:
# Names of the feature columns stored with object dtype; these are the ones
# that get one-hot encoded.
categorical_cols = list(
    X.select_dtypes(include=['object']).columns
)

categorical_cols

['Gender',
 'Education_Level',
 'Marital_Status',
 'Car_Owner',
 'House_Owner',
 'Personal_loan',
 'contact',
 'Customer_Job']

Encoding the dataset for XGBoost model

In [36]:
# One-hot encode the categorical columns; every other column of X is passed
# through unchanged and placed after the encoded columns.
# sparse_threshold=0 forces a dense array: OneHotEncoder emits a sparse matrix
# by default, and with the default threshold (0.3) the ColumnTransformer would
# return a sparse result whenever the overall density drops below it, which
# would break the DataFrame construction at the end of this cell.
onehot_encoder = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), categorical_cols)
    ],
    remainder='passthrough',
    sparse_threshold=0
)

# Fit and transform the data
encoded_dataset = onehot_encoder.fit_transform(X)

# Get feature names from OneHotEncoder
categorical_feature_names = onehot_encoder.named_transformers_['onehot'].get_feature_names_out(categorical_cols)

# Names of the passthrough columns: every column of X that was not encoded,
# in X's own order (the order in which the remainder is appended). Deriving
# them from X keeps names and data aligned even if X ever holds a
# non-categorical column that is not part of numerical_cols.
numerical_feature_names = [col for col in X.columns if col not in categorical_cols]

# Combine feature names: encoded columns first, passthrough columns last
all_feature_names = categorical_feature_names.tolist() + numerical_feature_names

# Create DataFrame with the new feature names, keeping X's row index so the
# encoded features stay label-aligned with y.
X_encoded = pd.DataFrame(encoded_dataset, columns=all_feature_names, index=X.index)

In [37]:
# Print column names, non-null counts and dtypes of the encoded feature frame.
X_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10108 entries, 0 to 10107
Data columns (total 25 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Gender_F                       10108 non-null  float64
 1   Gender_M                       10108 non-null  float64
 2   Education_Level_doctorate      10108 non-null  float64
 3   Education_Level_graduate       10108 non-null  float64
 4   Education_Level_high school    10108 non-null  float64
 5   Education_Level_post-graduate  10108 non-null  float64
 6   Education_Level_uneducated     10108 non-null  float64
 7   Marital_Status_married         10108 non-null  float64
 8   Marital_Status_single          10108 non-null  float64
 9   Car_Owner_no                   10108 non-null  float64
 10  Car_Owner_yes                  10108 non-null  float64
 11  House_Owner_no                 10108 non-null  float64
 12  House_Owner_yes                10108 non-null 

Model Training

In [38]:
# Hold out 30% of the rows for evaluation. The seed is fixed so that the split,
# and therefore every metric computed from it, is identical on each
# Restart & Run All.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.3, random_state=RANDOM_STATE
)

In [39]:
# Shapes of the raw feature frame and of the encoded train / test partitions.
print(*(frame.shape for frame in (X, X_train, X_test)))

(10108, 10) (7075, 25) (3033, 25)


In [40]:
# Initialize the XGBoost Regressor; no arguments are passed, so every
# hyperparameter keeps its library default.

model = XGBRegressor()

In [41]:
# Train the regressor on the training partition (encoded features -> Income).
model.fit(X_train, y_train)