In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler,LabelEncoder,OrdinalEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from sklearn.feature_selection import f_classif,chi2

In [2]:
df= pd.read_csv('adult.csv')

In [3]:
df.shape

(32561, 15)

In [4]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [5]:
df.income.unique()

array(['<=50K', '>50K'], dtype=object)

In [6]:
df['income']=np.where(df.income=='<=50K',0,1)

In [7]:
df.income.unique()

array([0, 1])

In [8]:
#### missing_values
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [9]:
#### duplicates
df.duplicated().sum()

np.int64(24)

In [10]:
df.drop_duplicates(inplace=True,ignore_index=True)

In [11]:
df.duplicated().sum()

np.int64(0)

In [12]:
target= df[['income']]
features= df.drop(columns='income')

In [13]:
features.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States


In [14]:
### outlier capping
def outlier_capping(df,columns):
    Q1= df[columns].quantile(0.25)
    Q3= df[columns].quantile(0.75)
    IQR= Q3-Q1
    lower_extreme= Q1-1.5*IQR
    upper_extreme= Q3+1.5*IQR
    df[columns]= df[columns].apply(lambda x: lower_extreme if x<lower_extreme else upper_extreme if x>upper_extreme else x)
for col in features.select_dtypes(include=['int','float']).columns:
    outlier_capping(features,col)

In [15]:
for col in features.select_dtypes(['object']).columns:
    print(col)
    print(features[col].unique())

workclass
['?' 'Private' 'State-gov' 'Federal-gov' 'Self-emp-not-inc' 'Self-emp-inc'
 'Local-gov' 'Without-pay' 'Never-worked']
education
['HS-grad' 'Some-college' '7th-8th' '10th' 'Doctorate' 'Prof-school'
 'Bachelors' 'Masters' '11th' 'Assoc-acdm' 'Assoc-voc' '1st-4th' '5th-6th'
 '12th' '9th' 'Preschool']
marital.status
['Widowed' 'Divorced' 'Separated' 'Never-married' 'Married-civ-spouse'
 'Married-spouse-absent' 'Married-AF-spouse']
occupation
['?' 'Exec-managerial' 'Machine-op-inspct' 'Prof-specialty'
 'Other-service' 'Adm-clerical' 'Craft-repair' 'Transport-moving'
 'Handlers-cleaners' 'Sales' 'Farming-fishing' 'Tech-support'
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv']
relationship
['Not-in-family' 'Unmarried' 'Own-child' 'Other-relative' 'Husband' 'Wife']
race
['White' 'Black' 'Asian-Pac-Islander' 'Other' 'Amer-Indian-Eskimo']
sex
['Female' 'Male']
native.country
['United-States' '?' 'Mexico' 'Greece' 'Vietnam' 'China' 'Taiwan' 'India'
 'Philippines' 'Trinadad&Tobago' '

In [16]:
features['workclass'].replace('?',np.nan,inplace=True)
features['occupation'].replace('?',np.nan,inplace=True)
features['native.country'].replace('?',np.nan,inplace=True)

In [17]:
features.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     582
dtype: int64

In [18]:
features.fillna({'workclass':df['workclass'].mode()[0],
                 'occupation':df['occupation'].mode()[0],
                 'native.country':df['native.country'].mode()[0]},inplace=True)

In [19]:
features.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
dtype: int64

In [20]:
features.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,78.0,Private,77053.0,HS-grad,9.0,Widowed,Prof-specialty,Not-in-family,White,Female,0.0,0.0,40.0,United-States
1,78.0,Private,132870.0,HS-grad,9.0,Widowed,Exec-managerial,Not-in-family,White,Female,0.0,0.0,32.5,United-States
2,66.0,Private,186061.0,Some-college,10.0,Widowed,Prof-specialty,Unmarried,Black,Female,0.0,0.0,40.0,United-States
3,54.0,Private,140359.0,7th-8th,4.5,Divorced,Machine-op-inspct,Unmarried,White,Female,0.0,0.0,40.0,United-States
4,41.0,Private,264663.0,Some-college,10.0,Separated,Prof-specialty,Own-child,White,Female,0.0,0.0,40.0,United-States


In [21]:
x_train,x_test,y_train,y_test= train_test_split(features,target,train_size=0.8,random_state=100,stratify=target.income)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(26029, 14)
(6508, 14)
(26029, 1)
(6508, 1)


In [22]:
cat_col= [i for i in features.columns if features[i].dtype=='O']
print(cat_col)

['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']


In [23]:
cat_col=features[cat_col]

In [24]:
for col in cat_col.columns:
    print(col)
    print(cat_col[col].unique())

workclass
['Private' 'State-gov' 'Federal-gov' 'Self-emp-not-inc' 'Self-emp-inc'
 'Local-gov' 'Without-pay' 'Never-worked']
education
['HS-grad' 'Some-college' '7th-8th' '10th' 'Doctorate' 'Prof-school'
 'Bachelors' 'Masters' '11th' 'Assoc-acdm' 'Assoc-voc' '1st-4th' '5th-6th'
 '12th' '9th' 'Preschool']
marital.status
['Widowed' 'Divorced' 'Separated' 'Never-married' 'Married-civ-spouse'
 'Married-spouse-absent' 'Married-AF-spouse']
occupation
['Prof-specialty' 'Exec-managerial' 'Machine-op-inspct' 'Other-service'
 'Adm-clerical' 'Craft-repair' 'Transport-moving' 'Handlers-cleaners'
 'Sales' 'Farming-fishing' 'Tech-support' 'Protective-serv' 'Armed-Forces'
 'Priv-house-serv']
relationship
['Not-in-family' 'Unmarried' 'Own-child' 'Other-relative' 'Husband' 'Wife']
race
['White' 'Black' 'Asian-Pac-Islander' 'Other' 'Amer-Indian-Eskimo']
sex
['Female' 'Male']
native.country
['United-States' 'Mexico' 'Greece' 'Vietnam' 'China' 'Taiwan' 'India'
 'Philippines' 'Trinadad&Tobago' 'Canada' 'Sou

In [25]:
ord_enc= OrdinalEncoder()

In [26]:
x_train[['workclass', 'education', 'marital.status',
         'occupation','relationship',
         'race', 'native.country']]= ord_enc.fit_transform(x_train[['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 
         'native.country']])

In [27]:
x_train=pd.get_dummies(x_train,dtype='int')

In [28]:
x_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,capital.gain,capital.loss,hours.per.week,native.country,sex_Female,sex_Male
23575,19.0,7.0,43887.0,11.0,9.0,4.0,4.0,3.0,4.0,0.0,0.0,32.5,38.0,0,1
29420,58.0,3.0,216851.0,6.0,5.0,4.0,5.0,3.0,4.0,0.0,0.0,40.0,7.0,0,1
29004,42.0,6.0,155657.0,11.0,9.0,0.0,0.0,1.0,2.0,0.0,0.0,32.5,38.0,1,0
27256,43.0,3.0,211860.0,11.0,9.0,4.0,7.0,4.0,2.0,0.0,0.0,32.5,38.0,1,0
13004,31.0,3.0,145377.0,9.0,13.0,2.0,11.0,0.0,4.0,0.0,0.0,52.5,38.0,0,1


In [29]:
x_test[['workclass', 'education', 'marital.status',
         'occupation','relationship',
         'race', 'native.country']]= ord_enc.transform(x_test[['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 
         'native.country']])

In [30]:
x_test=pd.get_dummies(x_test,dtype='int')

In [31]:
x_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,capital.gain,capital.loss,hours.per.week,native.country,sex_Female,sex_Male
26109,50.0,1.0,96062.0,11.0,9.0,2.0,10.0,0.0,4.0,0.0,0.0,40.0,38.0,0,1
19984,72.0,5.0,415742.0,11.0,9.0,2.0,11.0,0.0,4.0,0.0,0.0,32.5,38.0,0,1
24244,35.0,3.0,186934.0,12.0,14.0,5.0,9.0,1.0,4.0,0.0,0.0,40.0,38.0,0,1
8181,33.0,3.0,208180.0,9.0,13.0,4.0,11.0,1.0,4.0,0.0,0.0,40.0,38.0,1,0
18142,24.0,3.0,109869.0,15.0,10.0,4.0,7.0,3.0,4.0,0.0,0.0,40.0,38.0,0,1


In [32]:
x_train.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'capital.gain',
       'capital.loss', 'hours.per.week', 'native.country', 'sex_Female',
       'sex_Male'],
      dtype='object')

In [33]:
std_sca=StandardScaler()

In [34]:
x_train[['age','fnlwgt','hours.per.week']]= std_sca.fit_transform(x_train[['age','fnlwgt','hours.per.week']])
x_test[['age','fnlwgt','hours.per.week']]= std_sca.transform(x_test[['age','fnlwgt','hours.per.week']])

In [35]:
x_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,capital.gain,capital.loss,hours.per.week,native.country,sex_Female,sex_Male
23575,-1.441763,7.0,-1.505152,11.0,9.0,4.0,4.0,3.0,4.0,0.0,0.0,-1.40671,38.0,0,1
29420,1.435485,3.0,0.313641,6.0,5.0,4.0,5.0,3.0,4.0,0.0,0.0,-0.196371,7.0,0,1
29004,0.255076,6.0,-0.329841,11.0,9.0,0.0,0.0,1.0,2.0,0.0,0.0,-1.40671,38.0,1,0
27256,0.328851,3.0,0.261158,11.0,9.0,4.0,7.0,4.0,2.0,0.0,0.0,-1.40671,38.0,1,0
13004,-0.556456,3.0,-0.43794,9.0,13.0,2.0,11.0,0.0,4.0,0.0,0.0,1.82086,38.0,0,1


In [36]:
#### Feature Selection
f_class= f_classif(x_train,y_train)
pd.Series(f_class[0],index=x_train.columns).sort_values(ascending=False)

education.num     3448.219198
hours.per.week    2082.759317
relationship      1682.754749
age               1590.211776
sex_Female        1215.288377
sex_Male          1215.288377
marital.status    1086.006154
education          175.249033
race               137.507551
occupation          31.586315
native.country      18.338462
fnlwgt               4.605772
workclass            0.231972
capital.gain              NaN
capital.loss              NaN
dtype: float64

In [37]:
x_train.drop(columns=['workclass','capital.gain','capital.loss'],inplace=True)
x_test.drop(columns=['workclass','capital.gain','capital.loss'],inplace=True)

In [38]:
x_train.head()

Unnamed: 0,age,fnlwgt,education,education.num,marital.status,occupation,relationship,race,hours.per.week,native.country,sex_Female,sex_Male
23575,-1.441763,-1.505152,11.0,9.0,4.0,4.0,3.0,4.0,-1.40671,38.0,0,1
29420,1.435485,0.313641,6.0,5.0,4.0,5.0,3.0,4.0,-0.196371,7.0,0,1
29004,0.255076,-0.329841,11.0,9.0,0.0,0.0,1.0,2.0,-1.40671,38.0,1,0
27256,0.328851,0.261158,11.0,9.0,4.0,7.0,4.0,2.0,-1.40671,38.0,1,0
13004,-0.556456,-0.43794,9.0,13.0,2.0,11.0,0.0,4.0,1.82086,38.0,0,1


In [39]:
from sklearn.linear_model import LogisticRegression

#### AdaBoost - Adaptive Boosting

AdaBoost: Trains models sequentially, Assigns weights to training samples, Increases weight of misclassified points

AdaBoost Algorithm (Step-by-Step)

    Assign equal weights to all samples

    Train a weak learner

    Increase weights of misclassified points

    Train next learner on reweighted data

    Repeat

    Combine learners using weighted vote

In [40]:
ada_boost= AdaBoostClassifier(estimator=LogisticRegression(),n_estimators=100)
ada_boost.fit(x_train,y_train)
y_pred= ada_boost.predict(x_test)
accuracy_score(y_test,y_pred)

0.7808850645359557

In [41]:
params={'n_estimators':[50,100,150,200]}

In [42]:
grid_search=GridSearchCV(estimator=ada_boost,param_grid=params,cv=5)
grid_search.fit(x_train,y_train)
grid_search.best_params_, grid_search.best_score_

({'n_estimators': 50}, np.float64(0.7792077492791698))

#### Gradient_Boost

AdaBoost: 

    Reweights data points

Gradient Boosting:

    Fits models to residual errors

    Optimizes a loss function directly

    This is a major shift.

#### Algorithm (Simplified)

    Initialize predictions

    Compute residuals

    Train a weak learner on residuals

    Add it to the ensemble (scaled)

    Repeat

In [43]:
grad_boost= GradientBoostingClassifier(n_estimators=100,learning_rate=0.5,
                                       subsample=0.9,max_features=1,random_state=50)
grad_boost.fit(x_train,y_train)
y_pred= grad_boost.predict(x_test)
accuracy_score(y_test,y_pred)

0.8378918254456054

In [44]:
params={'n_estimators':[50,100],'learning_rate':[0.1,0.5,1]}

In [45]:
grid_search=GridSearchCV(estimator=grad_boost,param_grid=params,cv=5)
grid_search.fit(x_train,y_train)
grid_search.best_params_, grid_search.best_score_

({'learning_rate': 0.5, 'n_estimators': 100}, np.float64(0.8373737389393676))

#### XGBM - Extreme Gradient Boosting

In [74]:
## !pip install xgboost

In [46]:
import xgboost as xgb

In [47]:
xgbm= xgb.XGBClassifier(n_estimators=100,learning_rate=0.1,subsample=0.9,max_features=1,random_state=50,
                        reg_alpha=10,reg_lambda=10,min_child_weight=2)
xgbm.fit(x_train,y_train)
y_pred=xgbm.predict(x_test)
accuracy_score(y_test,y_pred)

0.8380454824830977

In [48]:
params={'n_estimators':[50,100],'learning_rate':[0.1,0.5,1]}

In [49]:
grid_search=GridSearchCV(estimator=xgbm,param_grid=params,cv=5)
grid_search.fit(x_train,y_train)
grid_search.best_params_, grid_search.best_score_

({'learning_rate': 0.1, 'n_estimators': 100}, np.float64(0.8394101684932369))

#### LGBM

In [82]:
### !pip install lightgbm

In [50]:
import lightgbm as lgb

In [51]:
lgbm= lgb.LGBMClassifier(n_estimators=100,learning_rate=0.1,subsample=0.9,max_features=1,random_state=50,
                        reg_alpha=10,reg_lambda=10,min_child_weight=2,boosting_type='goss',top_rate=0.2,other_rate=0.3,
                         bundled=True,max_bins=10)
lgbm.fit(x_train,y_train)
y_pred= lgbm.predict(x_test)
accuracy_score(y_test,y_pred)

[LightGBM] [Info] Number of positive: 6271, number of negative: 19758
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000887 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 90
[LightGBM] [Info] Number of data points in the train set: 26029, number of used features: 12
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.240924 -> initscore=-1.147623
[LightGBM] [Info] Start training from score -1.147623


0.8340503995082975

# Boosting combines weak learners sequentially, where each model focuses on correcting previous errors; XGBoost enhances this with regularization and system-level optimizations