Import necessary libraries

In [45]:
import pandas as pd
import numpy as np
from joblib import dump

from xgboost import XGBRegressor
from xgboost import plot_importance

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

Import the dataset

In [46]:
# Read the raw CSV into a DataFrame; the path is relative to the notebook's working directory.
DATASET_PATH = '../Dataset/dataset.csv'
Dataset = pd.read_csv(DATASET_PATH)

In [47]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
Dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10108 entries, 0 to 10107
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Client_Num               10108 non-null  int64 
 1   Customer_Age             10108 non-null  int64 
 2   Gender                   10108 non-null  object
 3   Dependent_Count          10108 non-null  int64 
 4   Education_Level          10108 non-null  object
 5   Marital_Status           10108 non-null  object
 6   state_cd                 10108 non-null  object
 7   Zipcode                  10108 non-null  int64 
 8   Car_Owner                10108 non-null  object
 9   House_Owner              10108 non-null  object
 10  Personal_loan            10108 non-null  object
 11  contact                  10108 non-null  object
 12  Customer_Job             10108 non-null  object
 13  Income                   10108 non-null  int64 
 14  Cust_Satisfaction_Score  10108 non-nul

In [48]:
# Gather the names of every column stored as int64 or float64.
numeric_part = Dataset.select_dtypes(include=['int64', 'float64'])
numerical_cols = list(numeric_part.columns)

numerical_cols

['Client_Num',
 'Customer_Age',
 'Dependent_Count',
 'Zipcode',
 'Income',
 'Cust_Satisfaction_Score']

In [49]:
# Gather the names of every column stored with the generic 'object' dtype.
object_part = Dataset.select_dtypes(include=['object'])
categorical_cols = list(object_part.columns)

categorical_cols

['Gender',
 'Education_Level',
 'Marital_Status',
 'state_cd',
 'Car_Owner',
 'House_Owner',
 'Personal_loan',
 'contact',
 'Customer_Job']

Checking each column for unknown values

In [50]:
# For each column, count the cells whose text equals 'unknown' in any letter case.
placeholder_counts = {
    name: Dataset[name].astype(str).str.lower().eq('unknown').sum()
    for name in Dataset.columns
}

# Keep only the columns in which the placeholder actually occurs (column order is preserved).
unknown_columns = {name: hits for name, hits in placeholder_counts.items() if hits > 0}

# Report the affected columns together with their counts.
for column, count in unknown_columns.items():
    print(f"Column '{column}' has {count} 'Unknown' values.")

Column 'Education_Level' has 1515 'Unknown' values.
Column 'Marital_Status' has 744 'Unknown' values.
Column 'contact' has 1947 'Unknown' values.


Replace the unknown values with proper values

In [51]:
# Replace 'Unknown' with NaN
#
# The placeholder is matched case-insensitively on a lowercased string view of the
# column, but only the matching cells are overwritten. Every other value keeps its
# original casing and type, and a pre-existing NaN stays NaN instead of being turned
# into the literal string 'nan' (which later null checks could not detect).

for column in unknown_columns:
    # Boolean mask: True where the cell's text is 'unknown' in any letter case.
    is_unknown = Dataset[column].astype(str).str.lower().eq('unknown')
    # mask() swaps the flagged cells for NaN and leaves the rest untouched.
    Dataset[column] = Dataset[column].mask(is_unknown, np.nan)

In [52]:
# Impute missing entries in the previously flagged columns with each column's most frequent value.

for column in unknown_columns:
    current = Dataset[column]
    if not current.isnull().any():
        # Nothing to fill in this column.
        continue
    # mode() returns the most common value(s); the first entry is used as the fill value.
    most_common = current.mode()[0]
    Dataset[column] = current.fillna(most_common)

In [53]:
# Show the number of missing values remaining in each column.
missing_per_column = Dataset.isnull().sum()
print(missing_per_column)

Client_Num                 0
Customer_Age               0
Gender                     0
Dependent_Count            0
Education_Level            0
Marital_Status             0
state_cd                   0
Zipcode                    0
Car_Owner                  0
House_Owner                0
Personal_loan              0
contact                    0
Customer_Job               0
Income                     0
Cust_Satisfaction_Score    0
dtype: int64
