<a href="https://colab.research.google.com/github/KodumuruRaja/Ensemble-Algorithms/blob/main/Light%20Gradient-Boosting-Machine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns',200)
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [None]:
# Path of the training CSV.
csv_path = "/content/train.csv"

# Column labels attached to the frame after loading, in file order.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital_Status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'Income',
]

# The first line of the file is skipped and no row is read as a header,
# so the labels above are assigned explicitly.
data = pd.read_csv(csv_path, skiprows=1, header=None)
data.columns = column_names
print(data.head())

   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_Status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  Income  
0          2174             0              40   United-States   <=50

In [None]:
# Cast the target column to str before it is label-encoded in the next cell.
# NOTE(review): depending on the pandas version, astype(str) may turn a missing
# value into the text 'nan', in which case the later fillna on the target would
# find nothing to fill -- confirm.
data["Income"] = data["Income"].astype(str)
data["Income"].dtype

dtype('O')

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Replace the string income labels with integer class ids.
l = LabelEncoder()
l.fit(data["Income"])
data["Income"] = l.transform(data["Income"])

In [None]:
# Categorical columns that are expanded into indicator columns.
categorical_cols = ['workclass', 'education', 'marital_Status', 'occupation',
                    'relationship', 'race', 'sex', 'native_country']

# v: indicator columns built from the categorical columns.
v = pd.get_dummies(data[categorical_cols])
# w: the original frame without the categorical columns.
w = data.drop(columns=categorical_cols)

In [None]:
# Put the indicator columns and the remaining columns side by side.
joined = v.join(w)

# np.unique returns the sorted distinct column labels together with the
# position of each label's first occurrence; selecting by those positions
# keeps one column per label, in sorted label order.
first_positions = np.unique(joined.columns, return_index=True)[1]
new_data = joined.iloc[:, first_positions]

In [None]:
# Separate the feature columns from the encoded target column.
target_column = 'Income'
x = new_data.drop(columns=target_column)
y = new_data[target_column]

In [None]:
# Fill any missing target value with the most frequent class.  The result is
# re-bound instead of being filled with inplace=True, so the frame `y` was
# selected from is never modified as a side effect and the cell is safe to
# re-run.
y = y.fillna(y.mode()[0])

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Sanity check: every row ended up in exactly one of the two splits.
assert len(x_train) + len(x_test) == len(x)

# **XGBoost**

In [None]:
# Wrap the splits in XGBoost DMatrix objects; only the training matrix is
# given labels, the test matrix is built from the features alone.
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test)

In [None]:
# Booster settings for the XGBoost run.
parameter = {
    'max_depth': 7,
    # `eta` is an alias of `learning_rate`; the dict used to carry both with
    # conflicting values (1 and 0.05).  Only the intended 0.05 is kept.
    'learning_rate': 0.05,
    # `verbosity` replaces the deprecated `silent` flag; 0 keeps the library quiet.
    'verbosity': 0,
    'objective': 'binary:logistic',
    # Previously misspelled as 'eval_matric', so the metric was never set.
    'eval_metric': 'auc',
}
# Number of boosting rounds passed to xgb.train.
num_round = 50

In [None]:
from datetime import datetime

# Timestamps are taken immediately around the training call so that only
# training is measured.
start = datetime.now()
xg = xgb.train(params=parameter, dtrain=dtrain, num_boost_round=num_round)
stop = datetime.now()

In [None]:
# Wall-clock duration of the XGBoost training call, printed between two rules.
execution_time_xgb = stop - start
rule = '--' * 20
print(rule, execution_time_xgb, rule)

---------------------------------------- 0:00:05.467221 ----------------------------------------


In [None]:
# Score the held-out rows with the trained XGBoost booster.
ypred = xg.predict(data=dtest)
print(ypred)

[0.04278268 0.42322132 0.68330294 ... 0.9580553  0.04597957 0.04278268]


In [None]:
# Number of scored test rows.
print(len(ypred))

6513


In [None]:
# Turn the scores into hard labels in place: strictly above 0.5 becomes 1,
# everything else becomes 0.  NOTE: this overwrites the scores held in `ypred`.
ypred[:] = np.where(ypred > 0.5, 1, 0)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Accuracy and confusion matrix of the XGBoost labels on the held-out rows.
acc_xgb = accuracy_score(y_true=y_test, y_pred=ypred)
xgb_confusion = confusion_matrix(y_true=y_test, y_pred=ypred)
print(xgb_confusion)

[[4712  230]
 [ 636  935]]


# **Light GBM**

In [None]:
# LightGBM dataset built from the training features and labels.
train_dataset = lgb.Dataset(data=x_train, label=y_train)

In [None]:
# Booster settings for the LightGBM run, including both evaluation metrics.
param = {
    'num_leaves': 150,
    'objective': 'binary',
    'max_depth': 7,
    'learning_rate': 0.05,
    'max_bin': 200,
    'metric': ['auc', 'binary_logloss'],
}
# Number of boosting rounds passed to the training call.
num_round = 50

In [None]:
from datetime import datetime

import lightgbm

# Train through the package's real name rather than the `lgb` alias: the
# assignment below rebinds `lgb` to the trained model, so `lgb.train` would
# raise AttributeError the second time this cell is executed.
# `lgb` is kept as the model's name because later cells call `lgb.predict`.
start = datetime.now()
lgb = lightgbm.train(param, train_dataset, num_round)
stop = datetime.now()

In [None]:
# Wall-clock duration of the LightGBM training call, printed between two rules.
execution_time_lgb = stop - start
rule = '--' * 20
print(rule, execution_time_lgb, rule)

---------------------------------------- 0:00:00.395326 ----------------------------------------


In [None]:
# Score the held-out rows with the trained LightGBM model (bound to `lgb`).
ypred2 = lgb.predict(data=x_test)
print(ypred2)

[0.02359206 0.42114255 0.66116852 ... 0.93998318 0.02611022 0.0222638 ]


In [None]:
# Turn the scores into hard labels in place: strictly above 0.5 becomes 1,
# everything else becomes 0.  NOTE: this overwrites the scores held in `ypred2`.
ypred2[:] = np.where(ypred2 > 0.5, 1, 0)

In [None]:
# Accuracy and confusion matrix of the LightGBM labels on the held-out rows.
# The accuracy keeps the name `lgb_xgb` because the comparison cell reads it.
lgb_xgb = accuracy_score(y_true=y_test, y_pred=ypred2)
lgb_confusion = confusion_matrix(y_true=y_test, y_pred=ypred2)
print(lgb_confusion)

[[4732  210]
 [ 660  911]]


In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC ranks examples by score, so it must be computed on the predicted
# probabilities.  `ypred` and `ypred2` were overwritten with hard 0/1 labels by
# the thresholding cells, so fresh scores are requested from both models here.
xgb_scores = xg.predict(dtest)
lgb_scores = lgb.predict(x_test)   # `lgb` is the trained LightGBM model at this point
xgb_auc = roc_auc_score(y_test, xgb_scores)
lgb_auc = roc_auc_score(y_test, lgb_scores)

# One row per model: accuracy, AUC and training wall-clock time.
comparision_dict = {
    'accuracy score': [acc_xgb, lgb_xgb],
    'auc score': [xgb_auc, lgb_auc],
    'execution time': [execution_time_xgb, execution_time_lgb],
}
# The second row label was misspelled 'LighGBM'.
comparision_df = pd.DataFrame(comparision_dict, index=['Xgboost', 'LightGBM'])
print(comparision_df)

         accuracy score  auc score         execution time
Xgboost        0.867035   0.774311 0 days 00:00:05.467221
LighGBM        0.866421   0.768696 0 days 00:00:00.395326
