In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.metrics import f1_score, make_scorer
import joblib
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import KernelPCA
from imblearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif

In [2]:
def lgbm_f1score(X_train, y_train, X_test, y_test):
    lgbm = LGBMClassifier(random_state=2022)
    lgbm.fit(X_train, y_train)
    y_pred = lgbm.predict(X_test)
    print(f"f1_score: {f1_score(y_test, y_pred)}")

In [3]:
X = pd.read_csv('../data/raw/train_data.csv')
y = pd.read_csv('../data/raw/train_labels.csv')
y = y.replace(-1, 0)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, np.ravel(y), stratify=y, test_size=0.2, shuffle=True)

In [None]:
lgbm_f1score(X_train, y_train, X_test)

In [5]:
scaler = MinMaxScaler(clip=True, feature_range=(-1.0, 1.0))
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
lgbm_f1score(X_train, y_train, X_test)

In [6]:
sm = SMOTE()
X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
lgbm_f1score(X_train, y_train, X_test)

In [None]:
pipeline = Pipeline([("kpca", KernelPCA()),
                     ("rf", LGBMClassifier())])

param_grid = [{
        "kpca__n_components": [20, 50, 70, 90 , 120, 200],
        "kpca__gamma": np.linspace(0.03, 0.05, 2),
        "kpca__kernel": ["rbf", "sigmoid", "linear", "poly"]
    }]

grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=3, scoring=make_scorer(f1_score, average='micro'), verbose=10)
grid_search.fit(X_train, y_train)

In [12]:
grid_search.best_params_

{'kpca__gamma': 0.03, 'kpca__n_components': 200}

In [15]:
grid_search.best_score_

0.9962953698556811

In [36]:
kpca = KernelPCA(n_components=90, gamma=0.03, kernel='linear')
kpca.fit(X_train, y_test)
#X_train, X_test, y_train, y_test = train_test_split(X, np.ravel(y), stratify=y, test_size=0.2, shuffle=True)
X_train = kpca.transform(X_train)
X_test = kpca.transform(X_test)

lgbm = LGBMClassifier(random_state=2022)
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)
print(f"f1_score: {f1_score(y_test, y_pred)}")


f1_score: 0.9473684210526316


In [10]:
lgbm_f1score(X_train, y_train, X_test, y_test)

array([1, 1, 1, ..., 0, 0, 0])

In [38]:
from hpsklearn import HyperoptEstimator, any_classifier, any_preprocessing, lightgbm_classification

estim = HyperoptEstimator(classifier=any_classifier('cla'), preprocessing=any_preprocessing('pre'),max_evals=150,trial_timeout=60)
estim.fit(X_train, y_train)
estim.best_model()

100%|██████████| 1/1 [00:00<00:00,  2.30trial/s, best loss: 0.23888888888888893]
100%|██████████| 2/2 [00:01<00:00,  1.69s/trial, best loss: 0.23888888888888893]
100%|██████████| 3/3 [00:00<00:00,  3.24trial/s, best loss: 0.11111111111111116]
100%|██████████| 4/4 [00:01<00:00,  1.63s/trial, best loss: 0.11111111111111116]
100%|██████████| 5/5 [00:00<00:00,  2.35trial/s, best loss: 0.04629629629629628]
100%|██████████| 6/6 [00:00<00:00, 14.62trial/s, best loss: 0.04629629629629628]
100%|██████████| 7/7 [00:01<00:00,  1.54s/trial, best loss: 0.04629629629629628]
100%|██████████| 8/8 [00:00<00:00,  6.35trial/s, best loss: 0.04629629629629628]
100%|██████████| 9/9 [00:23<00:00, 23.83s/trial, best loss: 0.04629629629629628]
100%|██████████| 10/10 [00:01<00:00,  1.43s/trial, best loss: 0.04629629629629628]
100%|██████████| 11/11 [00:07<00:00,  7.41s/trial, best loss: 0.04629629629629628]
100%|██████████| 12/12 [00:01<00:00,  1.95s/trial, best loss: 0.04629629629629628]
100%|██████████| 13/13

{'learner': KNeighborsClassifier(algorithm='brute', leaf_size=39, metric='manhattan',
                      n_jobs=1, n_neighbors=7, p=3.198659538429792,
                      weights='distance'),
 'preprocs': (Normalizer(norm='l1'),),
 'ex_preprocs': ()}

In [41]:
from sklearn.neighbors import KNeighborsClassifier

from Normalizer

lgbm = KNeighborsClassifier(algorithm='brute', leaf_size=39, metric='manhattan',
                      n_jobs=1, n_neighbors=7, p=3.198659538429792,
                      weights='distance')

lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)
print(f"f1_score: {f1_score(y_test, y_pred)}")

f1_score: 0.9473684210526316


In [7]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import numpy as np

pca_99 = PCA(n_components=100)
fit = pca_99.fit(X_train)
X_train = fit.transform(X_train)
X_test = fit.transform(X_test)

#tsne = TSNE(init='pca', n_components=2, learning_rate=100)
#X_train = tsne.fit_transform(X_train)
#X_test = tsne.fit_transform(X_test)

lgbm = KNeighborsClassifier(algorithm='brute', leaf_size=39, metric='manhattan',
                      n_jobs=1, n_neighbors=7, p=3.198659538429792,
                      weights='distance')

lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)
print(f"f1_score: {f1_score(y_test, y_pred)}")

f1_score: 0.9473684210526316
