In [2]:
import numpy as np
import pandas as pd
# library for splitting training-testing
from sklearn.model_selection import train_test_split
# library for classification
from sklearn.metrics import accuracy_score

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.preprocessing import StandardScaler
from sklearn import pipeline
from lightgbm import LGBMClassifier

from hyperopt import tpe,hp,Trials
from hyperopt.fmin import fmin
import warnings

In [3]:
# Engineered feature matrices (feather files).
x_train = pd.read_feather(
    "Data/x_train__IterativeImputer_genFeats.ftr"
)
x_test = pd.read_feather(
    "Data/x_test__IterativeImputer_genFeats.ftr"
)

# Previously saved prediction files for the train / validation / test parts;
# they are appended to the feature matrices as extra columns further down.
x_tr = pd.read_csv("Preds/preds_train.csv")
x_val_1 = pd.read_csv("Preds/preds_val.csv")
x_test_1 = pd.read_csv("Preds/preds_test.csv")

# Training labels.
y_train = pd.read_csv("Data/train_y.csv")

# The sample-output file is read without a header row, so the two column
# names are supplied explicitly.
y_test = pd.read_csv(
    "Data/Sample_Output.csv",
    header=None,
    names=["ID", "Target"],
)

In [4]:
# Stratified 85/15 train/validation split with a fixed seed.
# NOTE(review): the row order produced here must match the row order of the
# prediction CSVs loaded earlier, because they are later joined by position
# -- confirm they were generated with the same split parameters.
_split_parts = train_test_split(
    x_train,
    y_train,
    test_size=0.15,
    random_state=59,
    stratify=y_train,
)
x_train_1, x_val, y_train_1, y_val = _split_parts

In [5]:
def _append_stacked(features, stacked):
    """Return `features` with the columns of `stacked` appended, row for row.

    `features` gets a fresh 0..n-1 index so that it lines up positionally with
    `stacked` (the stacked frames in this notebook come from `pd.read_csv`
    without an index column, i.e. they already carry a default index).

    Raises:
        ValueError: if the two frames have a different number of rows.
            `pd.concat(axis=1)` aligns on the index, so without this check a
            mismatch would silently produce NaN-padded rows instead of failing.
    """
    base = features.reset_index(drop=True)
    if len(base) != len(stacked):
        raise ValueError(
            f"Row count mismatch: features has {len(base)} rows, "
            f"stacked predictions have {len(stacked)} rows"
        )
    return pd.concat([base, stacked], axis=1)


# NOTE: this cell overwrites its own inputs, so running it a second time
# appends the prediction columns again; re-run the split cell first.
x_train_1 = _append_stacked(x_train_1, x_tr)
x_val = _append_stacked(x_val, x_val_1)
x_test = _append_stacked(x_test, x_test_1)

In [8]:
# Two-step pipeline: standardise the inputs, then fit a quadratic
# discriminant analysis classifier. verbose=True makes the pipeline log the
# time spent in each step while fitting.
_qda_steps = [
    ("scl", StandardScaler()),
    ("qda", QuadraticDiscriminantAnalysis()),
]
pipe = pipeline.Pipeline(steps=_qda_steps, verbose=True)

In [9]:
# Fit the scaler + QDA pipeline on the stacked training predictions only
# (x_tr), using the training labels flattened to a 1-D array.
_qda_labels = y_train_1.values.reshape(-1)
pipe.fit(x_tr, _qda_labels)

[Pipeline] ............... (step 1 of 2) Processing scl, total=   0.3s




[Pipeline] ............... (step 2 of 2) Processing qda, total=   1.1s


Pipeline(steps=[('scl', StandardScaler()),
                ('qda', QuadraticDiscriminantAnalysis())],
         verbose=True)

In [11]:
# Class predictions of the QDA pipeline on the stacked train and validation
# inputs, evaluated in that order.
preds_tr, preds_val = (pipe.predict(_part) for _part in (x_tr, x_val_1))

In [12]:
# Accuracy of the QDA pipeline on the training part.
accuracy_score(y_true=y_train_1, y_pred=preds_tr)

0.2074723899957957

In [13]:
# Accuracy of the QDA pipeline on the held-out validation part.
accuracy_score(y_true=y_val, y_pred=preds_val)

0.21170300855644494

In [None]:
# Class predictions of the scaler + QDA pipeline for the stacked test inputs.
# `predict` needs the feature matrix; calling it with no argument raises a
# TypeError, which is why this cell could never run.
# NOTE(review): x_test_1 is assumed to be the intended input because it is the
# only stacked frame the pipeline has not scored yet -- confirm.
pipe.predict(x_test_1)

In [14]:
# Train a LightGBM classifier on the feature matrix (with the appended
# prediction columns) and report train / validation accuracy.
#
# NOTE(review): `bestParams` is not defined in any cell visible here --
# presumably it comes from a hyper-parameter search; confirm it exists when
# the notebook is run on a fresh kernel.
_y_fit = y_train_1.values.reshape(-1)
_y_holdout = y_val.values.reshape(-1)

model = LGBMClassifier(**bestParams)

# The training set is listed first and the validation set second in eval_set,
# so they show up as valid_0 and valid_1 in the training log.
# NOTE(review): passing `verbose` / `early_stopping_rounds` to fit() is only
# accepted by older LightGBM releases; newer ones expect callbacks instead --
# confirm against the installed version.
model.fit(
    x_train_1,
    _y_fit,
    eval_set=[(x_train_1, _y_fit), (x_val, _y_holdout)],
    verbose=1,
    early_stopping_rounds=10,
)

# These reuse (and overwrite) the names that held the QDA predictions earlier.
preds_tr = model.predict(x_train_1)
preds_val = model.predict(x_val)

tr_score = accuracy_score(y_train_1, preds_tr)
val_score = accuracy_score(y_val, preds_val)
print(tr_score, val_score)



[1]	valid_0's multi_logloss: 0.809338	valid_1's multi_logloss: 0.809506
[2]	valid_0's multi_logloss: 0.773751	valid_1's multi_logloss: 0.774106
[3]	valid_0's multi_logloss: 0.744055	valid_1's multi_logloss: 0.744591
[4]	valid_0's multi_logloss: 0.718559	valid_1's multi_logloss: 0.71929
[5]	valid_0's multi_logloss: 0.696536	valid_1's multi_logloss: 0.697507
[6]	valid_0's multi_logloss: 0.677077	valid_1's multi_logloss: 0.678234
[7]	valid_0's multi_logloss: 0.659963	valid_1's multi_logloss: 0.661316
[8]	valid_0's multi_logloss: 0.644045	valid_1's multi_logloss: 0.645566
[9]	valid_0's multi_logloss: 0.629871	valid_1's multi_logloss: 0.631601
[10]	valid_0's multi_logloss: 0.617197	valid_1's multi_logloss: 0.619105
[11]	valid_0's multi_logloss: 0.60545	valid_1's multi_logloss: 0.60754
[12]	valid_0's multi_logloss: 0.594796	valid_1's multi_logloss: 0.597066
[13]	valid_0's multi_logloss: 0.584992	valid_1's multi_logloss: 0.587412
[14]	valid_0's multi_logloss: 0.575775	valid_1's multi_logloss:

[114]	valid_0's multi_logloss: 0.425519	valid_1's multi_logloss: 0.45266
[115]	valid_0's multi_logloss: 0.425133	valid_1's multi_logloss: 0.452572
[116]	valid_0's multi_logloss: 0.424785	valid_1's multi_logloss: 0.452483
[117]	valid_0's multi_logloss: 0.424411	valid_1's multi_logloss: 0.452387
[118]	valid_0's multi_logloss: 0.424037	valid_1's multi_logloss: 0.452296
[119]	valid_0's multi_logloss: 0.42368	valid_1's multi_logloss: 0.452195
[120]	valid_0's multi_logloss: 0.423339	valid_1's multi_logloss: 0.452136
[121]	valid_0's multi_logloss: 0.422981	valid_1's multi_logloss: 0.452047
[122]	valid_0's multi_logloss: 0.422626	valid_1's multi_logloss: 0.451973
[123]	valid_0's multi_logloss: 0.422282	valid_1's multi_logloss: 0.45189
[124]	valid_0's multi_logloss: 0.421934	valid_1's multi_logloss: 0.451832
[125]	valid_0's multi_logloss: 0.421609	valid_1's multi_logloss: 0.451761
[126]	valid_0's multi_logloss: 0.421243	valid_1's multi_logloss: 0.451692
[127]	valid_0's multi_logloss: 0.420899	v

In [10]:
# Class-probability outputs of the fitted LightGBM model for the train and
# validation parts, evaluated in that order.
probs_tr, probs_val = map(model.predict_proba, (x_train_1, x_val))

In [11]:
# Persist the validation and training probabilities with np.save
# (which appends the ".npy" extension to the given path).
for _target, _probs in (
    ("Preds/LGB_val", probs_val),
    ("Preds/LGB_tr", probs_tr),
):
    np.save(_target, _probs)

# Score the test feature matrix and persist those probabilities as well.
probs_test = model.predict_proba(x_test)
np.save("Preds/LGB_test", probs_test)