# Modeling_Stacking Practice
- based on scikit-learn example code

In [None]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Load the iris data as a (features, labels) pair instead of a Bunch object.
X, y = load_iris(return_X_y=True)

In [None]:
# Base learners for the stack: a small random forest, AdaBoost, and a linear
# SVC that sees standardized features.
estimators = [
    ('rf', RandomForestClassifier(n_estimators=5, random_state=42)),
    ('ada', AdaBoostClassifier(random_state=42)),
    (
        'svr',
        make_pipeline(StandardScaler(), LinearSVC(random_state=30)),
    ),
]

In [None]:
# Stack the base learners; a logistic regression combines their predictions.
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
)

In [None]:
# Hold out a test split with a fixed seed so the score below is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Fit on the training split, then show the score on the held-out split.
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

# Modeling_Stacking practice
 - using credit card data

## Library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [80]:
# Read only the columns used in this notebook; 'credit' is the target and is
# requested from the training file only.
train = pd.read_csv(
    r'../data/train.csv',
    usecols=['gender', 'reality', 'car', 'credit'],
)
test = pd.read_csv(
    r'../data/test.csv',
    usecols=['gender', 'reality', 'car'],
)
submission = pd.read_csv(r'../data/sample_submission.csv')

In [81]:
# Separate the features from the target. The target is kept as a one-column
# DataFrame because later cells index it as y['credit'].
train_x = train.drop(columns=['credit'])
train_y = train[['credit']]
test_x = test

In [62]:
# One-hot encode the categorical columns, dropping the first level of each.
train_x = pd.get_dummies(train_x, drop_first=True)
# Encode the test set the same way, then align it to the training columns:
# encoding the two frames independently can yield different column sets when
# a category level is absent from one of them, which would break predict().
test_x = pd.get_dummies(test_x, drop_first=True).reindex(
    columns=train_x.columns, fill_value=0
)

In [63]:
# Display the encoded training features.
train_x

Unnamed: 0,gender_M,car_Y,reality_Y
0,0,0,0
1,0,0,1
2,1,1,1
3,0,0,1
4,0,1,1
...,...,...,...
26452,0,0,0
26453,0,0,1
26454,0,1,0
26455,1,0,1


## Preprocessing & EDA

In [64]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import f1_score
from tensorflow.keras.utils import to_categorical

# Stratified 75/25 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    train_x,
    train_y,
    test_size=0.25,
    stratify=train_y,
    random_state=10086,
)

# Report the shapes of both splits (features first, then target).
print("Train set: ")
print(X_train.shape, y_train.shape, sep="\n")
print("===========")
print("Validation set: ")
print(X_val.shape, y_val.shape, sep="\n")

Train set: 
(19842, 3)
(19842, 1)
Validation set: 
(6615, 3)
(6615, 1)


In [65]:
# Base learners for the credit-card stack. Both are seeded so that repeated
# runs of the notebook give the same model (AdaBoost was previously unseeded,
# unlike the other estimators in this notebook).
estimators = [
    ('rf', RandomForestClassifier(n_estimators=3, random_state=42)),
    ('ada', AdaBoostClassifier(random_state=42))
]

In [76]:
# Inspect the validation labels as a Series.
y_val['credit']

36       2.0
3464     2.0
2640     2.0
22500    1.0
17275    1.0
        ... 
6806     1.0
23096    2.0
7756     2.0
23331    2.0
25457    2.0
Name: credit, Length: 6615, dtype: float64

## Modeling

In [66]:
# Fit the stacked model and evaluate it on the validation split.
# The target is passed as a 1-D Series (y['credit']): giving scikit-learn the
# (n, 1) DataFrame makes it emit a DataConversionWarning on every fit/score.
clf_stk = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
clf_stk.fit(X_train, y_train['credit'])
y_pred_cbt = clf_stk.predict_proba(X_val)


print("훈련 세트 정확도: {:.3f}".format(clf_stk.score(X_train, y_train['credit'])))
print("테스트 세트 정확도: {:.3f}".format(clf_stk.score(X_val, y_val['credit'])))

# log_loss accepts raw labels; `labels=` ties the probability columns to the
# classifier's own class order, so no manual one-hot encoding (which assumed
# the labels are exactly 0..K-1) is needed.
print(f"log_loss: {log_loss(y_val['credit'], y_pred_cbt, labels=clf_stk.classes_)}")

  return f(*args, **kwargs)


훈련 세트 정확도: 0.641
테스트 세트 정확도: 0.641
log_loss: 0.8820688602489394


In [67]:
from sklearn.model_selection import KFold, StratifiedKFold

def run_kfold(clf_stk, n_splits=5, random_state=55):
    """Cross-validate a classifier and average its test-set probabilities.

    Uses the notebook-level ``train_x``, ``train_y`` (a DataFrame with a
    'credit' column) and ``test_x``. For each stratified fold the classifier
    is refit on the training part, its log loss on the validation part is
    printed, and its class probabilities for ``test_x`` are accumulated.

    Parameters
    ----------
    clf_stk : estimator
        Classifier exposing ``fit``, ``predict_proba`` and ``classes_``.
        It is refit in place on every fold.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 55
        Seed for the shuffled fold assignment.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(test_x), n_classes) holding the fold-averaged
        predicted probabilities for the test set.
    """
    # Work with a 1-D target: passing the (n, 1) DataFrame to fit() makes
    # scikit-learn emit a DataConversionWarning on every fold.
    target = train_y['credit']

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    outcomes = []
    # Size the accumulator from the data instead of hard-coding three classes.
    sub = np.zeros((test_x.shape[0], target.nunique()))

    for n_fold, (train_index, val_index) in enumerate(folds.split(train_x, target)):
        X_train, X_val = train_x.iloc[train_index], train_x.iloc[val_index]
        y_train, y_val = target.iloc[train_index], target.iloc[val_index]
        clf_stk.fit(X_train, y_train)

        predictions = clf_stk.predict_proba(X_val)

        # `labels=` ties the probability columns to the classifier's class
        # order, so the labels do not need to be one-hot encoded by hand.
        logloss = log_loss(y_val, predictions, labels=clf_stk.classes_)
        outcomes.append(logloss)
        print(f"FOLD {n_fold} : logloss:{logloss}")

        sub += clf_stk.predict_proba(test_x)

    mean_outcome = np.mean(outcomes)

    print("Mean:{}".format(mean_outcome))
    return sub / folds.n_splits

# Fold-averaged test-set probabilities returned by run_kfold.
my_submission = run_kfold(clf_stk)

  return f(*args, **kwargs)


FOLD 0 : logloss:0.8819867954246048


  return f(*args, **kwargs)


FOLD 1 : logloss:0.8826132357004908


  return f(*args, **kwargs)


FOLD 2 : logloss:0.8819091975765605


  return f(*args, **kwargs)


FOLD 3 : logloss:0.8818018553265564


  return f(*args, **kwargs)


FOLD 4 : logloss:0.882162103484039
Mean:0.8820946375024503


In [75]:
# Show the fold-averaged probabilities for the test set.
my_submission

array([[0.12482105, 0.2175806 , 0.65759835],
       [0.12207038, 0.23955459, 0.63837504],
       [0.12207038, 0.23955459, 0.63837504],
       ...,
       [0.12207038, 0.23955459, 0.63837504],
       [0.12209362, 0.23676862, 0.64113776],
       [0.12207038, 0.23955459, 0.63837504]])

In [83]:
# Fill every column after the first with the predicted probabilities.
# `1:` is a positional slice, so it must go through .iloc; using it with the
# label-based .loc is deprecated and raises a warning.
submission.iloc[:, 1:] = my_submission

  """Entry point for launching an IPython kernel.


In [84]:
# Display the submission frame after the probabilities have been filled in.
submission

Unnamed: 0,index,0,1,2
0,26457,0.124821,0.217581,0.657598
1,26458,0.122070,0.239555,0.638375
2,26459,0.122070,0.239555,0.638375
3,26460,0.124821,0.217581,0.657598
4,26461,0.121122,0.236673,0.642205
...,...,...,...,...
9995,36452,0.121122,0.236673,0.642205
9996,36453,0.122212,0.234017,0.643772
9997,36454,0.122070,0.239555,0.638375
9998,36455,0.122094,0.236769,0.641138
