## BreastCancer

In [2]:
import pandas as pd

# Read the Wisconsin breast-cancer case file and preview its first rows.
breast_cancer_csv = '../Cases/Wisconsin/BreastCancer.csv'
df = pd.read_csv(breast_cancer_csv)
df.head()

Unnamed: 0,Code,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses,Class
0,61634,5,4,3,1,2,2,2,3,1,Benign
1,63375,9,1,2,6,4,10,7,7,2,Malignant
2,76389,10,4,7,2,2,8,6,1,1,Malignant
3,95719,6,10,10,10,8,10,7,10,7,Malignant
4,128059,1,1,1,1,2,5,5,1,1,Benign


In [4]:
# Target is the `Class` label; features are all remaining columns except
# `Code` (presumably a sample identifier — TODO confirm).
y = df['Class']
X = df.drop(columns=['Code', 'Class'])

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The split is stratified on y and
# seeded (random_state=24) so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)

In [15]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Base (level-0) learners: linear-kernel SVM, logistic regression, decision tree.
svm = SVC(kernel='linear')
lr = LogisticRegression()
dtc = DecisionTreeClassifier(random_state=24)
base_learners = [('svm', svm), ('lr', lr), ('dtc', dtc)]

# Final (level-1) learner: a 25-tree random forest stacked on the base learners.
rf = RandomForestClassifier(n_estimators=25, random_state=24)
stack = StackingClassifier(estimators=base_learners, final_estimator=rf)

In [16]:
# Fit the stacked ensemble on the training partition.
stack.fit(X=X_train, y=y_train)

In [17]:
from sklearn.metrics import f1_score

# Predict on the held-out rows and report F1 with 'Malignant' as the positive class.
y_pred = stack.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred, pos_label='Malignant')

0.9523809523809523

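`passthrough=True`: the final estimator is trained on the original features as well as the base estimators' predictions.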

In [19]:
# Same three base learners and random-forest final estimator as the previous
# stack, rebuilt from scratch, this time with passthrough=True.
svm = SVC(kernel='linear')
lr = LogisticRegression()
dtc = DecisionTreeClassifier(random_state=24)
rf = RandomForestClassifier(n_estimators=25, random_state=24)

level0 = [('svm', svm), ('lr', lr), ('dtc', dtc)]
stack = StackingClassifier(estimators=level0, final_estimator=rf, passthrough=True)

# Fit, predict on the held-out rows, and report F1 for the 'Malignant' class.
stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)
f1_score(y_test, y_pred, pos_label='Malignant')

0.9523809523809523

## Human Resources

In [20]:
import pandas as pd

# Read the human-resources analytics case file and preview its first rows.
hr_csv = '../Cases/human-resources-analytics/HR_comma_sep.csv'
hr = pd.read_csv(hr_csv)
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.1,0.77,6,247,4,0,1,0,sales,low
3,0.92,0.85,5,259,5,0,1,0,sales,low
4,0.89,1.0,5,224,5,0,1,0,sales,low


In [21]:
# `left` is the label to predict; every other column is kept as a feature.
y = hr['left']
X = hr.drop(columns=['left'])

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the HR rows for testing with a fixed seed.
# NOTE(review): no `stratify` argument is passed here, unlike the
# breast-cancer split earlier in the notebook — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=24,
)

In [23]:
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the object-dtype columns (dropping the first level of each)
# and pass every other column through unchanged.
object_cols = make_column_selector(dtype_include=object)
non_object_cols = make_column_selector(dtype_exclude=object)
encoder = OneHotEncoder(drop='first', sparse_output=False)

ct = make_column_transformer(
    (encoder, object_cols),
    ('passthrough', non_object_cols),
    verbose_feature_names_out=False,
)
# Ask the transformer to return pandas DataFrames instead of arrays.
ct.set_output(transform='pandas')

# Fit the encoder on the training rows only, then reuse it for the test rows.
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

In [27]:
import warnings
warnings.filterwarnings('ignore')

### Trying with Different estimators

#### With DecisionTreeClassifier as Final estimator

In [31]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

# Base learners: logistic regression, LDA and Gaussian naive Bayes.
lda = LinearDiscriminantAnalysis()
lr = LogisticRegression()
gnb = GaussianNB()
# Final estimator: a seeded decision tree.
dtc = DecisionTreeClassifier(random_state=24)

base_models = [('lr', lr), ('lda', lda), ('gnb', gnb)]
stack = StackingClassifier(
    estimators=base_models,
    final_estimator=dtc,
    passthrough=False,
)

# Fit, predict on the held-out rows, and report F1 for class 1.
stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)
f1_score(y_test, y_pred, pos_label=1)

0.6301115241635687

#### With XGBClassifier as Final estimator

In [32]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# Base learners: logistic regression, LDA and Gaussian naive Bayes.
lda = LinearDiscriminantAnalysis()
lr = LogisticRegression()
gnb = GaussianNB()
# Final estimator: a seeded XGBoost classifier, with passthrough disabled.
xgbc = XGBClassifier(random_state=24)

level0 = [('lr', lr), ('lda', lda), ('gnb', gnb)]
stack = StackingClassifier(estimators=level0, final_estimator=xgbc, passthrough=False)

# Fit, predict on the held-out rows, and report F1 for class 1.
stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)
f1_score(y_test, y_pred, pos_label=1)

0.6656050955414012

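`passthrough=True`: the final estimator is trained on the original features as well as the base estimators' predictions.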

#### With XGBClassifier as Final estimator

In [33]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# Rebuild the base learners and the XGBoost final estimator from scratch.
lda = LinearDiscriminantAnalysis()
lr = LogisticRegression()
gnb = GaussianNB()
xgbc = XGBClassifier(random_state=24)

# Same stack as the passthrough=False variant, but with passthrough enabled.
stack = StackingClassifier(
    estimators=[('lr', lr), ('lda', lda), ('gnb', gnb)],
    final_estimator=xgbc,
    passthrough=True,
)

# Fit, predict on the held-out rows, and report F1 for class 1.
stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)
f1_score(y_test, y_pred, pos_label=1)

0.9719806763285024

#### With XGBClassifier as Final estimator (`passthrough=True`, repeated run)

In [35]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# NOTE(review): this cell repeats the previous XGBClassifier / passthrough=True
# experiment with identical settings.
lda = LinearDiscriminantAnalysis()
lr = LogisticRegression()
gnb = GaussianNB()
xgbc = XGBClassifier(random_state=24)

named_base_models = [('lr', lr), ('lda', lda), ('gnb', gnb)]
stack = StackingClassifier(
    estimators=named_base_models,
    final_estimator=xgbc,
    passthrough=True,
)

# Fit, predict on the held-out rows, and report F1 for class 1.
stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)
f1_score(y_test, y_pred, pos_label=1)

0.9719806763285024

#### With Final Estimator = [dtc, xgbc] and passthrough=[True, False]

In [36]:
# Try every combination of final estimator and passthrough flag, reusing the
# base learners (lr, lda, gnb) and final estimators (dtc, xgbc) built in
# earlier cells.
pt = [False, True]
finals = [dtc, xgbc]
base_learners = [('lr', lr), ('lda', lda), ('gnb', gnb)]

scores = []
for f in finals:
    for p in pt:
        stack = StackingClassifier(estimators=base_learners, final_estimator=f, passthrough=p)
        stack.fit(X_train, y_train)
        y_pred = stack.predict(X_test)
        f1 = f1_score(y_test, y_pred, pos_label=1)
        scores.append([f, p, f1])

# NOTE(review): the 'Passthorugh' label is misspelled; it is left unchanged
# here in case other cells reference the column by this name.
df_scores = pd.DataFrame(scores, columns=['Final Estimator', 'Passthorugh', 'F1_score'])

# Show the best-scoring combination.
ranked = df_scores.sort_values(by='F1_score', ascending=False)
ranked.iloc[0]

Final Estimator    XGBClassifier(base_score=None, booster=None, c...
Passthorugh                                                     True
F1_score                                                    0.971981
Name: 3, dtype: object