### __Importing the dependencies__

In [22]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve,  roc_auc_score


### __Loading dataset__

In [23]:
# Columns read from the CSV: four numeric predictors, 'Sex', and the
# 'Survived' label.
use_cols = ['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Survived']

# Restricting the read to `use_cols` keeps every other column out of memory.
data = pd.read_csv(
    '../datasets/titanic.csv',
    usecols=use_cols,
)


In [24]:
# Quick look at the first three rows to confirm the columns loaded as expected.
data.head(n=3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
0,0,3,male,22.0,1,7.25
1,1,1,female,38.0,1,71.2833
2,1,3,female,26.0,0,7.925


In [25]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the partition
# identical between runs.
# NOTE: the whole frame is passed as the feature set, so X_train / X_test
# still contain the 'Survived' column -- select predictor columns explicitly
# before fitting any model on them.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data['Survived'],
    test_size=0.3,
    random_state=0,
)
X_train.shape, X_test.shape

((623, 6), (268, 6))

### __Polynomial Expansion__

<div align="justify">

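Generate a new feature set consisting of all polynomial combinations of the input features with degree less than or equal to the specified degree. For example, with two features $a$ and $b$ and degree 2, the expanded set is $a$, $b$, $a^2$, $ab$, $b^2$ (plus a constant bias column unless `include_bias=False`).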

</div>

In [26]:
# Features to expand into degree-2 polynomial terms.
poly_cols = ['Pclass', 'SibSp']

# include_bias=False omits the constant column of ones.
pf = PolynomialFeatures(degree=2, include_bias=False)
tmp = pf.fit_transform(X_train[poly_cols])

# Wrap the raw array in a DataFrame labelled with the generated term names.
# The frame gets a fresh 0..n-1 index rather than X_train's index.
X_train_copy = pd.DataFrame(
    tmp,
    columns=pf.get_feature_names_out(poly_cols),
)

In [27]:
# Show the first six expanded rows as plain text.
print(X_train_copy.head(n=6))

   Pclass  SibSp  Pclass^2  Pclass SibSp  SibSp^2
0     1.0    0.0       1.0           0.0      0.0
1     1.0    1.0       1.0           1.0      1.0
2     3.0    5.0       9.0          15.0     25.0
3     1.0    0.0       1.0           0.0      0.0
4     3.0    1.0       9.0           3.0      1.0
5     2.0    1.0       4.0           2.0      1.0


### __Feature Learning by Trees__

<div align="justify">

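GBDT-derived features + LR: fit a gradient-boosted decision tree (GBDT) model, record which leaf of each tree every sample falls into, one-hot encode those leaf indices, and train a logistic regression (LR) on the encoded features.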

</div>

In [28]:
# Predictor columns fed to the tree ensemble ('Sex' and the 'Survived' label
# are not used as inputs).
feature_cols = ['Pclass', 'Age', 'Fare', 'SibSp']

# Seed the booster so the fitted trees -- and therefore the leaf indices and
# every score derived from them -- are the same on each Restart & Run All.
# The seed matches the one used for the train/test split.
gbdt = GradientBoostingClassifier(n_estimators=20, random_state=0)
one_hot = OneHotEncoder()

# Keep only the model inputs and replace missing values with 0.
X_train = X_train[feature_cols].fillna(0)
X_test = X_test[feature_cols].fillna(0)

gbdt.fit(X_train, y_train)

# apply() reports, for every sample, the leaf it reaches in each of the 20
# trees. The result is 3-D; index 0 on the last axis yields a 2-D
# (samples x trees) matrix of leaf indices.
X_leaf_index = gbdt.apply(X_train)[:, :, 0]



In [29]:
# One row per training sample, one column per tree: the leaf each sample reached.
leaf_msg = "sample's belonging node of each base tree \n'"
print(leaf_msg, X_leaf_index)

sample's belonging node of each base tree 
' [[ 7.  7.  6. ...  4.  7.  4.]
 [ 7.  7.  6. ... 14.  7.  7.]
 [11. 11. 11. ...  4.  6. 11.]
 ...
 [10. 10. 10. ...  4.  6. 10.]
 [13. 14. 13. ...  4.  7. 13.]
 [ 7.  7.  6. ...  6.  7.  7.]]


In [30]:
# Learn the set of leaf indices per tree from the training rows and encode
# them in a single step.
X_one_hot = one_hot.fit_transform(X_leaf_index)

# Logistic regression on the one-hot leaf indicators.
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(X_one_hot, y_train)

# Send the held-out rows through the same trees and the same encoder, then
# keep the predicted probability of the positive class (column 1).
X_test_leaf_index = gbdt.apply(X_test)[:, :, 0]
X_test_one_hot = one_hot.transform(X_test_leaf_index)
y_pred = lr.predict_proba(X_test_one_hot)[:, 1]

# ROC curve coordinates for the GBDT + LR model (thresholds are discarded).
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)



In [31]:
# Area under the ROC curve on the held-out rows.
auc_gbdt_lr = roc_auc_score(y_test, y_pred)
print("AUC for GBDT derived feature + LR：", auc_gbdt_lr)

AUC for GBDT derived feature + LR： 0.7733035714285714
