## Preparing Dataset

In [None]:
import pandas as pd

In [None]:
# Census-income CSV read from the Drive path under /content/drive/MyDrive;
# this frame supplies the training features and labels further down.
df1 = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/KUxDEPA-Data-Science-and-Machine-Learning-Training-Course/dataset/01-census-income.csv',
)

In [None]:
# "Future census" CSV read from the same Drive folder; this frame supplies the
# held-out features and labels used for the test scores further down.
df2 = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/KUxDEPA-Data-Science-and-Machine-Learning-Training-Course/dataset/02-future-census.csv',
)

In [None]:
# Show the column labels of the training frame to pick features from.
df1.columns

Index(['age', 'workclass', 'weight', 'education', 'edu num', 'marital status',
       'occupation', 'relationship', 'race', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native country', 'label'],
      dtype='object')

In [None]:
# Feature columns selected from both frames, kept in the original order.
cols = [
    'age',
    'edu num',
    'marital status',
    'occupation',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

In [None]:
# Select the same feature columns from the training frame and the held-out frame.
X_train, X_test = (frame[cols] for frame in (df1, df2))

In [None]:
# Target vectors: the 'label' column of each frame.
y_train, y_test = (frame['label'] for frame in (df1, df2))

## Transform Data

In [None]:
from sklearn.preprocessing import OneHotEncoder ,StandardScaler
from sklearn.compose import make_column_transformer

In [None]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones.
# The encoder is fitted on the training frame only and later applied to the
# held-out frame, so handle_unknown='ignore' is set: a category that never
# appeared in training is encoded as all zeros instead of raising an error.
# Downstream cells call `.toarray()` on the transformed matrices, so they rely
# on this transformer producing sparse output.
transformer = make_column_transformer(
    (
        OneHotEncoder(handle_unknown='ignore'),
        ['marital status', 'occupation', 'sex'],
    ),
    (
        StandardScaler(),
        ['age', 'edu num', 'capital-gain', 'capital-loss', 'hours-per-week'],
    ),
)

In [None]:
# Learn the encoder categories and scaler statistics from the training features only.
transformer.fit(X=X_train)

ColumnTransformer(transformers=[('onehotencoder', OneHotEncoder(),
                                 ['marital status', 'occupation', 'sex']),
                                ('standardscaler', StandardScaler(),
                                 ['age', 'edu num', 'capital-gain',
                                  'capital-loss', 'hours-per-week'])])

In [None]:
# Apply the already-fitted transformer to both feature sets.
X_train_transformed, X_test_transformed = (
    transformer.transform(features) for features in (X_train, X_test)
)

In [None]:
# List the output column names produced by the fitted transformer.
transformer.get_feature_names_out()

array(['onehotencoder__marital status_ Divorced',
       'onehotencoder__marital status_ Married-AF-spouse',
       'onehotencoder__marital status_ Married-civ-spouse',
       'onehotencoder__marital status_ Married-spouse-absent',
       'onehotencoder__marital status_ Never-married',
       'onehotencoder__marital status_ Separated',
       'onehotencoder__marital status_ Widowed',
       'onehotencoder__occupation_ ?',
       'onehotencoder__occupation_ Adm-clerical',
       'onehotencoder__occupation_ Armed-Forces',
       'onehotencoder__occupation_ Craft-repair',
       'onehotencoder__occupation_ Exec-managerial',
       'onehotencoder__occupation_ Farming-fishing',
       'onehotencoder__occupation_ Handlers-cleaners',
       'onehotencoder__occupation_ Machine-op-inspct',
       'onehotencoder__occupation_ Other-service',
       'onehotencoder__occupation_ Priv-house-serv',
       'onehotencoder__occupation_ Prof-specialty',
       'onehotencoder__occupation_ Protective-serv',

## Import Model Algorithm

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Ensemble Method

## Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Voting ensemble over five different model families.
# The decision tree is seeded because scikit-learn permutes features at every
# split, so an unseeded tree can differ between runs when splits tie; the seed
# matches the one used for the AdaBoost model.
voting = VotingClassifier(
    estimators=[
        ('knn', KNeighborsClassifier(n_neighbors=35)),
        ('dtree', DecisionTreeClassifier(max_depth=10, random_state=42)),
        ('logit', LogisticRegression(max_iter=1000)),
        ('svm', SVC(kernel='poly')),
        ('gnb', GaussianNB()),
    ]
)

In [None]:
# Fit on a dense copy of the training matrix (the features are passed via .toarray()).
voting.fit(X=X_train_transformed.toarray(), y=y_train)

VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=35)),
                             ('dtree', DecisionTreeClassifier(max_depth=10)),
                             ('logit', LogisticRegression(max_iter=1000)),
                             ('svm', SVC(kernel='poly')),
                             ('gnb', GaussianNB())])

In [None]:
# Accuracy of the voting ensemble on the training data.
voting.score(X=X_train_transformed.toarray(), y=y_train)

0.8595973312865325

In [None]:
# Accuracy of the voting ensemble on the held-out data.
voting.score(X=X_test_transformed.toarray(), y=y_test)

0.8533988533988534

## Bagging and Pasting

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# Bagging ensemble: 500 k-NN classifiers, each trained on 1000 sampled rows.
# oob_score=True stores an out-of-bag accuracy estimate in `oob_score_`.
# random_state pins the sampling so the scores and the OOB estimate are
# repeatable, matching the seed used for the AdaBoost model.
# The base estimator stays positional because its keyword name differs between
# scikit-learn releases (`base_estimator` vs. `estimator`).
bag = BaggingClassifier(
    KNeighborsClassifier(n_neighbors=35),
    max_samples=1000,
    n_estimators=500,
    oob_score=True,
    random_state=42,
)

In [None]:
# Train the bagging ensemble directly on the transformed training matrix.
bag.fit(X=X_train_transformed, y=y_train)

BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=35),
                  max_samples=1000, n_estimators=500, oob_score=True)

In [None]:
# Accuracy of the bagging ensemble on the training data.
bag.score(X=X_train_transformed, y=y_train)

0.8350357206116785

In [None]:
# Accuracy of the bagging ensemble on the held-out data.
bag.score(X=X_test_transformed, y=y_test)

0.8338452088452089

In [None]:
# Out-of-bag accuracy estimate, available because oob_score=True was requested.
bag.oob_score_

0.8347405089449135

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 100 shallow trees (max_depth=5); oob_score=True stores an
# out-of-bag accuracy estimate in `oob_score_`.
# random_state pins the bootstrap draws and per-split feature sampling so the
# reported scores are repeatable, matching the seed used for the AdaBoost model.
# Optionally pass n_jobs=-1 to grow the trees on all CPU cores.
forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    oob_score=True,
    random_state=42,
)

In [None]:
# Train the random forest on the transformed training matrix.
forest.fit(X=X_train_transformed, y=y_train)

RandomForestClassifier(max_depth=5, oob_score=True)

In [None]:
# Out-of-bag accuracy estimate, available because oob_score=True was requested.
forest.oob_score_

0.8482021609494007

In [None]:
# Accuracy of the random forest on the training data.
forest.score(X=X_train_transformed, y=y_train)

0.8497372616165791

In [None]:
# Accuracy of the random forest on the held-out data.
forest.score(X=X_test_transformed, y=y_test)

0.8453112203112203

## AdaBoosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost over 30 depth-1 decision trees (stumps) with a 0.5 learning rate;
# the seed is fixed so repeated runs build the same ensemble.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    random_state=42,
    learning_rate=0.5,
    n_estimators=30,
)

# Fit in the same cell; the fitted estimator is the cell's displayed value.
ada_clf.fit(X=X_train_transformed, y=y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=30, random_state=42)

In [None]:
# Accuracy of the AdaBoost model on the training data.
ada_clf.score(X=X_train_transformed, y=y_train)

0.8560252701186751

In [None]:
# Accuracy of the AdaBoost model on the held-out data.
ada_clf.score(X=X_test_transformed, y=y_test)

0.8565724815724816

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient-boosted trees with library-default hyperparameters; only the seed is
# fixed (same value as the AdaBoost model) so repeated runs give the same model.
gbc = GradientBoostingClassifier(random_state=42)

In [None]:
# Train the gradient-boosting model on the transformed training matrix.
gbc.fit(X=X_train_transformed, y=y_train)

GradientBoostingClassifier()

In [None]:
# Accuracy of the gradient-boosting model on the training data.
gbc.score(X=X_train_transformed, y=y_train)

0.8679813426226605

In [None]:
# Accuracy of the gradient-boosting model on the held-out data.
gbc.score(X=X_test_transformed, y=y_test)

0.8648648648648649

## Stacking (stacked generalization)

In [None]:
from sklearn.ensemble import StackingClassifier

In [None]:
# Stacked generalisation: five base learners whose 5-fold cross-validated
# predictions are used to train a random-forest final estimator.
# The decision tree and the final random forest are seeded (same value as the
# AdaBoost model) so that refitting yields the same stacked model and scores.
stacking_clf = StackingClassifier(
    estimators=[
        ('knn', KNeighborsClassifier(n_neighbors=35)),
        ('dtree', DecisionTreeClassifier(max_depth=10, random_state=42)),
        ('logit', LogisticRegression(max_iter=1000)),
        ('svm', SVC(kernel='poly')),
        ('gnb', GaussianNB()),
    ],
    final_estimator=RandomForestClassifier(random_state=42),
    cv=5,
)
# Fit on a dense copy of the training matrix; the fitted estimator is the
# cell's displayed value.
stacking_clf.fit(X_train_transformed.toarray(), y_train)

StackingClassifier(cv=5,
                   estimators=[('knn', KNeighborsClassifier(n_neighbors=35)),
                               ('dtree', DecisionTreeClassifier(max_depth=10)),
                               ('logit', LogisticRegression(max_iter=1000)),
                               ('svm', SVC(kernel='poly')),
                               ('gnb', GaussianNB())],
                   final_estimator=RandomForestClassifier())

In [None]:
# Accuracy of the stacked model on the training data.
stacking_clf.score(X=X_train_transformed.toarray(), y=y_train)

0.8732656314577553

In [None]:
# Accuracy of the stacked model on the held-out data.
stacking_clf.score(X=X_test_transformed.toarray(), y=y_test)

0.8530917280917281