## Loading Data

In [43]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
import mltools as ml

warnings.filterwarnings('ignore')
X = np.genfromtxt('X_train.txt', delimiter=None)
Y = np.genfromtxt('Y_train.txt', delimiter=None)
Xte = np.genfromtxt('X_test.txt', delimiter=None)

Xtr, Xva, Ytr, Yva = train_test_split(X, Y, test_size = 0.1, random_state=0)
XtrP = StandardScaler().fit_transform(Xtr)
XvaP = StandardScaler().fit_transform(Xva)

print(Xtr.shape, Xva.shape, Ytr.shape, Yva.shape)

(180000, 14) (20000, 14) (180000,) (20000,)


## KNN

In [2]:
from sklearn.neighbors import KNeighborsClassifier

In [3]:
correlations = dict([])
for i in range(14):
    for j in range(i+1, 14):
        correlations[np.corrcoef(XtrP[:,i], XtrP[:,j])[0][1]] = str(i) + ',' + str(j)
print('min correlation of', min(correlations), 'at', correlations[min(correlations)])
print('max correlation of', max(correlations), 'at', correlations[max(correlations)])

min correlation of -0.9692805658978815 at 3,13
max correlation of 0.9456425468737595 at 0,3


In [4]:
XtrKNN = XtrP[:, 0]
XvaKNN = XvaP[:, 0]
for i in range(1, 14):
    if i not in [3, 13]:
        XtrKNN = np.column_stack((XtrKNN, XtrP[:,i]))
        XvaKNN = np.column_stack((XvaKNN, XvaP[:,i]))
XtrKNN = np.column_stack((XtrKNN, np.multiply(-XtrP[:,3], XtrP[:,13])))
XvaKNN = np.column_stack((XvaKNN, np.multiply(-XvaP[:,3], XvaP[:,13])))

In [5]:
# for n_neighbors in [7,8,9,10,11,12,13,14,15]:
#     neigh = KNeighborsClassifier(n_neighbors)
#     neigh.fit(XtrKNN, Ytr)
#     print(n_neighbors, metrics.roc_auc_score(Yva,neigh.predict_proba(XvaKNN)[:,1]))

In [6]:
neigh = KNeighborsClassifier(10)
neigh.fit(XtrKNN, Ytr)
print('Training AUC', metrics.roc_auc_score(Ytr,neigh.predict_proba(XtrKNN)[:,1]))
print('Validation AUC', metrics.roc_auc_score(Yva,neigh.predict_proba(XvaKNN)[:,1]))

Training AUC 0.8269270001007643
Validation AUC 0.731799663583667


## Linear Model

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

In [8]:
XtrPoly = PolynomialFeatures(degree=2).fit_transform(XtrP)
XvaPoly = PolynomialFeatures(degree=2).fit_transform(XvaP)

print(XtrPoly.shape, XvaP.shape)

(180000, 120) (20000, 14)


In [9]:
lr = LogisticRegression(tol=1e-4, max_iter=5000)
lr.fit(XtrPoly, Ytr)

LogisticRegression(max_iter=5000)

In [10]:
print("Training AUC:", metrics.roc_auc_score(Ytr, lr.predict_proba(XtrPoly)[:,1]))
print("Validation AUC:", metrics.roc_auc_score(Yva, lr.predict_proba(XvaPoly)[:,1]))

Training AUC: 0.6811929489017979
Validation AUC: 0.6825083495913075


## Random Forests

In [11]:
from sklearn.ensemble import RandomForestClassifier

In [12]:
rf = RandomForestClassifier(n_estimators=500, max_depth=35, min_samples_leaf=4, min_samples_split = 4, max_features=4)
rf.fit(XtrP, Ytr)

RandomForestClassifier(max_depth=35, max_features=4, min_samples_leaf=4,
                       min_samples_split=4, n_estimators=500)

In [13]:
print("Training AUC:", metrics.roc_auc_score(Ytr, rf.predict_proba(XtrP)[:,1]))
print("Validation AUC:", metrics.roc_auc_score(Yva, rf.predict_proba(XvaP)[:,1]))

Training AUC: 0.9524737687985823
Validation AUC: 0.7644580645372071


    100 estimators, max_depth=30: 0.7556847954460596
    100 estimators, max_depth=30, min_samples_split=4: 0.7676214147735156
    100 estimators, max_depth=30, min_samples_leaf=4: 0.7796876858950488
    100 estimators, min_samples_leaf=3, min_samples_split = 3, max_features=2,0.7796945078531264
    100 estimators, min_samples_leaf=3, min_samples_split = 3, max_features=3,0.7837683270635156
    100 estimators, min_samples_leaf=4, min_samples_split = 4, max_features=4,0.7850483251095345
    100 estimators, min_samples_leaf=5, min_samples_split = 5, max_features=4,0.7843246789470857
    100 estimators, min_samples_leaf=5, min_samples_split = 5, max_features=5,0.7836942830214536


## Boosted Learners

In [17]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier

In [25]:
abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(min_samples_split=2**6, max_depth=8), 
                         n_estimators=120, learning_rate=.5)
abc.fit(XtrP, Ytr)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=8,
                                                         min_samples_split=64),
                   learning_rate=0.5, n_estimators=120)

In [26]:
print("Training AUC:", metrics.roc_auc_score(Ytr, abc.predict_proba(XtrP)[:,1]))
print("Validation AUC:", metrics.roc_auc_score(Yva, abc.predict_proba(XvaP)[:,1]))

Training AUC: 0.8733796578416171
Validation AUC: 0.592908080301066


In [18]:
num_leaves = [3300, 3650, 4000, 4300, 4650]
min_child_samples = [1, 2, 4]
for n in num_leaves:
    for m in min_child_samples:
        lgbm = LGBMClassifier(num_leaves=n, learning_rate=0.01, min_child_samples=m, n_estimators=200)
        lgbm.fit(XtrP, Ytr)
        print(n,m,"Validation AUC:", metrics.roc_auc_score(Yva, lgbm.predict_proba(XvaP)[:,1]))

# max_depths = [50, 75, 100, 125]
# min_child_samples = [1, 2, 4]
# for n in num_leaves:
#     for d in max_depths:
#         for m in min_child_samples:
#             lgbm = LGBMClassifier(num_leaves=n, max_depth=d, learning_rate=0.01, min_child_samples=m, n_estimators=200)
#             lgbm.fit(XtrP, Ytr)
#             print(n,d,m,"Validation AUC:", metrics.roc_auc_score(Yva, lgbm.predict_proba(XvaP)[:,1]))

3300 1 Validation AUC: 0.7453917092654359
3300 2 Validation AUC: 0.745573718810639
3300 4 Validation AUC: 0.7465564434226696
3650 1 Validation AUC: 0.7465815704529699
3650 2 Validation AUC: 0.7458284769860685
3650 4 Validation AUC: 0.7474170152368369
4000 1 Validation AUC: 0.7464836396045883
4000 2 Validation AUC: 0.7446896866393868
4000 4 Validation AUC: 0.7477460176333585
4300 1 Validation AUC: 0.7443780463419922
4300 2 Validation AUC: 0.7456217929509745
4300 4 Validation AUC: 0.7458821306185265
4650 1 Validation AUC: 0.7448582800170827
4650 2 Validation AUC: 0.7445049287390135
4650 4 Validation AUC: 0.7455503412353364


In [19]:
lgbm = LGBMClassifier(num_leaves=1000, learning_rate=0.01, min_child_samples=1, n_estimators=2000)
lgbm.fit(XtrP, Ytr)
print("Training AUC:", metrics.roc_auc_score(Ytr, lgbm.predict_proba(XtrP)[:,1]))
print("Validation AUC:", metrics.roc_auc_score(Yva, lgbm.predict_proba(XvaP)[:,1]))

Training AUC: 0.9771825304995541
Validation AUC: 0.7591105159309456


    num_leaves=4000, min_child_samples=1: 0.9822267654079306, 0.7880581959019618


## Neural Networks

In [20]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

n_features = XtrP.shape[1]
# define model
nn = Sequential()
nn.add(Dense(500, activation='relu', kernel_initializer='he_normal', input_shape=(n_features,)))
nn.add(Dense(100, activation='relu', kernel_initializer='he_normal'))
nn.add(Dense(10, activation='relu', kernel_initializer='he_normal'))
nn.add(Dense(1, activation='sigmoid'))
# compile the model
nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# fit the model
nn.fit(XtrP, Ytr, epochs=200, batch_size=128, verbose=2)

Epoch 1/200
1407/1407 - 1s - loss: 0.5997 - accuracy: 0.6907
Epoch 2/200
1407/1407 - 1s - loss: 0.5888 - accuracy: 0.6993
Epoch 3/200
1407/1407 - 1s - loss: 0.5841 - accuracy: 0.7030
Epoch 4/200
1407/1407 - 1s - loss: 0.5810 - accuracy: 0.7047
Epoch 5/200
1407/1407 - 1s - loss: 0.5772 - accuracy: 0.7073
Epoch 6/200
1407/1407 - 1s - loss: 0.5747 - accuracy: 0.7095
Epoch 7/200
1407/1407 - 1s - loss: 0.5723 - accuracy: 0.7097
Epoch 8/200
1407/1407 - 1s - loss: 0.5699 - accuracy: 0.7120
Epoch 9/200
1407/1407 - 1s - loss: 0.5676 - accuracy: 0.7142
Epoch 10/200
1407/1407 - 1s - loss: 0.5653 - accuracy: 0.7147
Epoch 11/200
1407/1407 - 1s - loss: 0.5637 - accuracy: 0.7160
Epoch 12/200
1407/1407 - 1s - loss: 0.5611 - accuracy: 0.7166
Epoch 13/200
1407/1407 - 1s - loss: 0.5592 - accuracy: 0.7184
Epoch 14/200
1407/1407 - 1s - loss: 0.5570 - accuracy: 0.7194
Epoch 15/200
1407/1407 - 1s - loss: 0.5554 - accuracy: 0.7211
Epoch 16/200
1407/1407 - 1s - loss: 0.5534 - accuracy: 0.7215
Epoch 17/200
1407

<tensorflow.python.keras.callbacks.History at 0x7f925a072430>

In [34]:
print("Training AUC:", metrics.roc_auc_score(Ytr, nn.predict_proba(XtrP)))
print("Validation AUC:", metrics.roc_auc_score(Yva, nn.predict_proba(XvaP)))

Training AUC: 0.8460315351343201
Validation AUC: 0.7470837026688537


## Ensemble

In [53]:
abc_pred = abc.predict_proba(XvaP)[:,1]
rf_pred = rf.predict_proba(XvaP)[:,1]
neigh_pred = neigh.predict_proba(XvaKNN)[:,1]
nn_pred = nn.predict_proba(XvaP)

XvaStack = np.column_stack((abc_pred, rf_pred, neigh_pred, nn_pred))

In [54]:
print("Validation AUC lgbm:", metrics.roc_auc_score(Yva, lgbm_pred))
print("Validation AUC rf:", metrics.roc_auc_score(Yva, rf_pred))
print("Validation AUC neigh:", metrics.roc_auc_score(Yva, neigh_pred))
print("Validation AUC nn:", metrics.roc_auc_score(Yva, nn_pred))
print("Validation AUC:", metrics.roc_auc_score(Yva, stacked.predict_proba(XvaStack)[:,1]))

Validation AUC lgbm: 0.7591105159309456
Validation AUC rf: 0.7644580645372071
Validation AUC neigh: 0.731799663583667
Validation AUC nn: 0.7470837026688537
Validation AUC: 0.7750211579197815


In [51]:
lgbm_pred = lgbm.predict_proba(XvaP)[:,1]
rf_pred = rf.predict_proba(XvaP)[:,1]
neigh_pred = neigh.predict_proba(XvaKNN)[:,1]
nn_pred = nn.predict_proba(XvaP)

In [52]:
XvaStack = np.column_stack((lgbm_pred, rf_pred, neigh_pred, nn_pred))
for c in [.1, 1, 10, 100, 1000, 10000, 1000000]:
    stacked = LogisticRegression(max_iter=1000, C=c)
    stacked.fit(XvaStack, Yva)
    print(c, "Validation AUC:", metrics.roc_auc_score(Yva, stacked.predict_proba(XvaStack)[:,1]))

0.1 Validation AUC: 0.7776763118406956
1 Validation AUC: 0.7776579453226108
10 Validation AUC: 0.7776378348683136
100 Validation AUC: 0.7776336405913471
1000 Validation AUC: 0.7776331218255118
10000 Validation AUC: 0.7776329783370893
1000000 Validation AUC: 0.7776330224873731


In [31]:
stacked = LogisticRegression(max_iter=1000, C=100000)
stacked.fit(XvaStack, Yva)

LogisticRegression(C=100000, max_iter=1000)

## Submission

In [44]:
Xte = np.genfromtxt('X_test.txt', delimiter=None)
XteP, params = ml.rescale(Xte)
XteKNN = XteP[:, 0]
for i in range(1, 14):
    if i not in [3, 13]:
        XteKNN = np.column_stack((XteKNN, XteP[:,i]))
XteKNN = np.column_stack((XteKNN, np.multiply(-XteP[:,3], XteP[:,13])))

In [45]:
lgbm_pred = lgbm.predict_proba(XteP)[:,1]
rf_pred = rf.predict_proba(XteP)[:,1]
neigh_pred = neigh.predict_proba(XteKNN)[:,1]
nn_pred = nn.predict_proba(XteP)

XteStack = np.column_stack((lgbm_pred, rf_pred, neigh_pred, nn_pred))

In [46]:
Yte = stacked.predict_proba(XteStack)[:,1]
Y_sub = np.vstack([np.arange(Xte.shape[0]), Yte]).T
np.savetxt('Y_submit.txt',Y_sub,'%d,%.10f',header='ID,Prob1',comments='',delimiter=',')

In [47]:
XtrP.shape

(180000, 14)