# Model Training

## 1.1 Import Data and Required Packages
## Importing Pandas, Numpy, Matplotlib, Seaborn and Warnings Library.

In [1]:
# Basic Import 

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Modelling
from sklearn.metrics import precision_score,accuracy_score,recall_score,f1_score,roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

# Ignore warnings
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

## Import the CSV Data as Pandas DataFrame

In [2]:
df=pd.read_csv(r"C:\Users\HP\Desktop\projects\Adult_census_Income_Prediction\notebook\data\cleaned_Adult_dataset.csv")

## Show Top 5 Records

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
X=df.drop(['salary'],axis=1)
y=df['salary']

In [5]:
X

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32532,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32533,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32534,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
32535,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


In [6]:
df['salary'].value_counts()

salary
 <=50K    24698
 >50K      7839
Name: count, dtype: int64

In [7]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
le_transformer = LabelEncoder()

X[num_features] = numeric_transformer.fit_transform(X[num_features])
for col in cat_features:
    X[col] = le_transformer.fit_transform(X[col])


In [8]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country
0,0.03039,6,-1.063569,9,1.134777,4,0,1,4,1,0.148292,-0.216743,-0.035664,38
1,0.836973,5,-1.008668,9,1.134777,2,3,0,4,1,-0.145975,-0.216743,-2.222483,38
2,-0.042936,3,0.24504,11,-0.420679,0,5,1,4,1,-0.145975,-0.216743,-0.035664,38
3,1.05695,3,0.425752,1,-1.198407,2,5,0,2,1,-0.145975,-0.216743,-0.035664,38
4,-0.776193,3,1.408066,9,1.134777,2,9,5,2,0,-0.145975,-0.216743,-0.035664,4


In [9]:
y = y.map({' <=50K':0,' >50K':1})

In [10]:
X.shape,y.shape

((32537, 14), (32537,))

In [11]:
pd.concat([X,y],axis=1).corr()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
age,1.0,0.040461,-0.076447,-0.010551,0.036224,-0.266137,0.001755,-0.263744,0.029116,0.088708,0.077676,0.057745,0.068515,-0.000659,0.234037
workclass,0.040461,1.0,-0.024272,0.004824,0.003503,-0.02043,0.007159,-0.05792,0.048349,0.07157,0.031506,0.002643,0.0422,-0.001669,0.002702
fnlwgt,-0.076447,-0.024272,1.0,-0.028053,-0.043388,0.028123,-2.3e-05,0.008868,-0.021497,0.027089,0.000429,-0.01026,-0.018898,-0.06308,-0.009502
education,-0.010551,0.004824,-0.028053,1.0,0.359085,-0.038422,-0.04124,-0.011057,0.014303,-0.027433,0.030085,0.016793,0.055991,0.075495,0.079366
education-num,0.036224,0.003503,-0.043388,0.359085,1.0,-0.069161,0.070907,-0.094432,0.032011,0.012205,0.122664,0.079892,0.148422,0.088051,0.335272
marital-status,-0.266137,-0.02043,0.028123,-0.038422,-0.069161,1.0,0.034855,0.185532,-0.06793,-0.129402,-0.043368,-0.03414,-0.190432,-0.020885,-0.199199
occupation,0.001755,0.007159,-2.3e-05,-0.04124,0.070907,0.034855,1.0,-0.037429,-0.004807,0.047648,0.018006,0.009653,-0.012849,-0.002136,0.034516
relationship,-0.263744,-0.05792,0.008868,-0.011057,-0.094432,0.185532,-0.037429,1.0,-0.116051,-0.582594,-0.057947,-0.061098,-0.248875,-0.01084,-0.250948
race,0.029116,0.048349,-0.021497,0.014303,0.032011,-0.06793,-0.004807,-0.116051,1.0,0.087472,0.011154,0.018913,0.04192,0.11685,0.071847
sex,0.088708,0.07157,0.027089,-0.027433,0.012205,-0.129402,0.047648,-0.582594,0.087472,1.0,0.048489,0.045571,0.229187,0.001751,0.215969


Here is it quite visible that workclass,fnlwgt,education,occupation,race,country	 etc have minimum to no relation with the final output income of Income hence we will be dropping these

In [12]:
X = X.drop(["workclass","fnlwgt","education","occupation","race","country"],axis=1)
X

Unnamed: 0,age,education-num,marital-status,relationship,sex,capital-gain,capital-loss,hours-per-week
0,0.030390,1.134777,4,1,1,0.148292,-0.216743,-0.035664
1,0.836973,1.134777,2,0,1,-0.145975,-0.216743,-2.222483
2,-0.042936,-0.420679,0,1,1,-0.145975,-0.216743,-0.035664
3,1.056950,-1.198407,2,0,1,-0.145975,-0.216743,-0.035664
4,-0.776193,1.134777,2,5,0,-0.145975,-0.216743,-0.035664
...,...,...,...,...,...,...,...,...
32532,-0.849519,0.745913,2,5,0,-0.145975,-0.216743,-0.197650
32533,0.103716,-0.420679,2,0,1,-0.145975,-0.216743,-0.035664
32534,1.423579,-0.420679,6,4,0,-0.145975,-0.216743,-0.035664
32535,-1.216148,-0.420679,4,3,1,-0.145975,-0.216743,-1.655530


In [13]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((26029, 8), (6508, 8))

## Create an Evaluate Function to give all metrics after model Training

In [14]:
def evaluate_classification_model(true, predicted):
    """
    Calculate classification evaluation metrics.

    Parameters:
        true (array-like): True class labels.
        predicted (array-like): Predicted class labels.

    Returns:
        accuracy (float): Classification accuracy.
        precision (float): Precision score.
        recall (float): Recall score.
        f1 (float): F1-score.
    """
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    f1 = f1_score(true, predicted)
    return accuracy, precision, recall, f1


In [16]:
# Define class weights for balancing classes
class_priors = y_train.value_counts(normalize=True).to_dict()

# Define class weights for balancing classes
class_weights = {0: class_priors[0], 1: class_priors[1]}

classification_models = {
    "Logistic Regression": LogisticRegression(class_weight=class_weights),
    "Random Forest Classifier": RandomForestClassifier(class_weight=class_weights),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(class_weight=class_weights),
    "XGBoost Classifier": XGBClassifier(),
    "CatBoost Classifier": CatBoostClassifier(verbose=0),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Naive Bayes": GaussianNB(priors=list(class_weights.values()))
}

model_list = []
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

for model_name, model in classification_models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_accuracy, model_train_precision, model_train_recall, model_train_f1 = evaluate_classification_model(y_train, y_train_pred)
    model_test_accuracy, model_test_precision, model_test_recall, model_test_f1 = evaluate_classification_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Accuracy Score: {:.4f}".format(model_train_accuracy))
    print("- Precision Score: {:.4f}".format(model_train_precision))
    print("- Recall Score: {:.4f}".format(model_train_recall))
    print("- F1 Score: {:.4f}".format(model_train_f1))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Accuracy Score: {:.4f}".format(model_test_accuracy))
    print("- Precision Score: {:.4f}".format(model_test_precision))
    print("- Recall Score: {:.4f}".format(model_test_recall))
    print("- F1 Score: {:.4f}".format(model_test_f1))

    accuracy_list.append(model_test_accuracy)
    precision_list.append(model_test_precision)
    recall_list.append(model_test_recall)
    f1_list.append(model_test_f1)

    print('=' * 35)
    print('\n')


Logistic Regression
Model performance for Training set
- Accuracy Score: 0.8054
- Precision Score: 0.8922
- Recall Score: 0.2136
- F1 Score: 0.3447
----------------------------------
Model performance for Test set
- Accuracy Score: 0.8075
- Precision Score: 0.9167
- Recall Score: 0.2402
- F1 Score: 0.3806


Random Forest Classifier
Model performance for Training set
- Accuracy Score: 0.9181
- Precision Score: 0.9645
- Recall Score: 0.6835
- F1 Score: 0.8000
----------------------------------
Model performance for Test set
- Accuracy Score: 0.8414
- Precision Score: 0.7588
- Recall Score: 0.5221
- F1 Score: 0.6186


K-Neighbors Classifier
Model performance for Training set
- Accuracy Score: 0.8711
- Precision Score: 0.7754
- Recall Score: 0.6506
- F1 Score: 0.7075
----------------------------------
Model performance for Test set
- Accuracy Score: 0.8388
- Precision Score: 0.7105
- Recall Score: 0.5833
- F1 Score: 0.6406


Decision Tree Classifier
Model performance for Training set
- Acc

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


XGBoost Classifier
Model performance for Training set
- Accuracy Score: 0.8798
- Precision Score: 0.8057
- Recall Score: 0.6568
- F1 Score: 0.7237
----------------------------------
Model performance for Test set
- Accuracy Score: 0.8631
- Precision Score: 0.7764
- Recall Score: 0.6238
- F1 Score: 0.6918


CatBoost Classifier
Model performance for Training set
- Accuracy Score: 0.8763
- Precision Score: 0.8035
- Recall Score: 0.6405
- F1 Score: 0.7128
----------------------------------
Model performance for Test set
- Accuracy Score: 0.8672
- Precision Score: 0.7930
- Recall Score: 0.6238
- F1 Score: 0.6983


AdaBoost Classifier
Model performance for Training set
- Accuracy Score: 0.8565
- Precision Score: 0.7736
- Recall Score: 0.5670
- F1 Score: 0.6544
----------------------------------
Model performance for Test set
- Accuracy Score: 0.8609
- Precision Score: 0.8014
- Recall Score: 0.5789
- F1 Score: 0.6722


Naive Bayes
Model performance for Training set
- Accuracy Score: 0.7995
- 

In [17]:
pd.DataFrame(list(zip(model_list, accuracy_list, precision_list, recall_list, f1_list)), columns=["model_list", "accuracy_list", "precision_list", "recall_list", "f1_list"]).sort_values(by=["accuracy_list"],ascending=False)

Unnamed: 0,model_list,accuracy_list,precision_list,recall_list,f1_list
5,CatBoost Classifier,0.86724,0.793021,0.62383,0.698324
4,XGBoost Classifier,0.863092,0.776398,0.62383,0.691802
6,AdaBoost Classifier,0.86094,0.801382,0.578915,0.67222
1,Random Forest Classifier,0.841426,0.75884,0.522146,0.618625
2,K-Neighbors Classifier,0.838814,0.710486,0.583281,0.64063
3,Decision Tree Classifier,0.817455,0.680592,0.487835,0.568314
0,Logistic Regression,0.807468,0.916667,0.240175,0.380623
7,Naive Bayes,0.796712,0.679949,0.330006,0.444351
