In [49]:
import os
from collections import Counter
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Binarizer
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, cross_val_score, RepeatedStratifiedKFold, train_test_split
from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFE
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from hpsklearn import HyperoptEstimator, any_classifier, any_preprocessing
from hyperopt import tpe

import neptune.new as neptune

ModuleNotFoundError: No module named 'imblearn'

In [17]:
import neptune.new as neptune

# The Neptune API token is a credential and must not be stored in the notebook.
# Export NEPTUNE_API_TOKEN in the environment before starting the kernel.
# NOTE(review): the token that used to be hard-coded here has been shared with
# the notebook and should be revoked.
run = neptune.init(
    project="julia.grzegorowska/ml-project",
    api_token=os.environ.get("NEPTUNE_API_TOKEN"),
)

# Run-level metadata stored under the "parameters" field of the run.
params = {
    "optimizer": "Julia"
}
run["parameters"] = params


def send_data_neptune(data, plot_name):
    """Log every element of ``data`` to the ``plot_name`` series of ``run``.

    Elements are read by position, from ``data[0]`` up to
    ``data[len(data) - 1]``, and logged one at a time in that order on the
    module-level ``run`` object.
    """
    for position in range(len(data)):
        run[plot_name].log(data[position])

def single_record(record, record_name):
    """Assign ``record`` to the field ``record_name`` of the module-level ``run``."""
    run[record_name] = record

def stop_run():
    """Call ``stop()`` on the module-level ``run``."""
    run.stop()

https://app.neptune.ai/julia.grzegorowska/ml-project/e/MLPROJ-15
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


In [18]:
# Set OMP_NUM_THREADS to "1" (presumably to keep OpenMP-backed libraries single-threaded).
# NOTE(review): numpy/scikit-learn are imported in the first cell; such libraries
# usually read this variable when they are loaded - confirm the setting takes effect here.
os.environ["OMP_NUM_THREADS"] = "1"

In [19]:
# np.random.seed() returns None, so the seed value has to be kept separately:
# `seed` is passed as `random_state` to estimators defined further down.
seed = 147
np.random.seed(seed)

In [24]:
def data_load() -> list:
    """Read the raw CSV files and log the training labels to Neptune.

    Reads ``data/train_data.csv``, ``data/test_data.csv`` and
    ``data/train_labels.csv`` (all without a header row), then sends every
    training label, converted to ``int``, to the ``"train_labels"`` series.

    Returns:
        ``[train_data, test_data, train_labels]`` as pandas DataFrames.
    """
    train_data = pd.read_csv("data/train_data.csv", header=None)
    test_data = pd.read_csv("data/test_data.csv", header=None)
    train_labels = pd.read_csv("data/train_labels.csv", header=None)

    # Flatten the labels and convert them in one vectorised step. Calling
    # int() on each one-element row array is deprecated in recent NumPy.
    labels_as_int = train_labels.values.ravel().astype(int).tolist()
    send_data_neptune(labels_as_int, "train_labels")

    return [train_data, test_data, train_labels]

In [25]:
# Load the train/test feature tables and the training labels; data_load() also
# logs the labels to Neptune.
train_data, test_data, train_labels = data_load()

In [26]:
# Flatten the labels DataFrame into a 1-D array; this is the `y` passed to the
# feature-selection and resampling steps below.
train_labels_ravel = train_labels.values.ravel()

In [27]:
def pipeline_standard_minmax(X_1, X_2: pd.DataFrame) -> list:
    """Standardise and then min-max scale the train and test features.

    The scaling pipeline is fitted on ``X_1`` only; the fitted pipeline is
    then applied to ``X_2`` so that both sets share the same scaling
    parameters and no statistics of ``X_2`` enter the fit.

    Args:
        X_1: Training features, used to fit the scalers.
        X_2: Test features, only transformed.

    Returns:
        ``[train_scaled, test_scaled]`` as returned by the pipeline.
    """
    pipeline = Pipeline([
        ("std", StandardScaler()),
        ("minmax", MinMaxScaler()),
    ])

    train_std_minmax = pipeline.fit_transform(X_1)
    # transform(), not fit_transform(): the test set must reuse the training fit.
    test_std_minmax = pipeline.transform(X_2)

    return [train_std_minmax, test_std_minmax]

In [28]:
# Run the scaling pipeline on the train and test features.
train_std_minmax, test_std_minmax = pipeline_standard_minmax(train_data, test_data)

In [29]:
# Initial number of features requested from kbest_select: a third of the columns.
k = len(train_data.columns) // 3

In [39]:
def kbest_select(x_1: np.array, x_2: np.array, y_1: np.array, n_of_kbest: int) -> list:
    """Select features of ``x_1``/``x_2`` by their ``f_classif`` score on ``x_1``.

    ``SelectKBest`` is fitted on ``x_1``/``y_1`` with ``k=n_of_kbest`` and
    applied to both feature sets. The number of features whose score is not
    below 1 is then counted; if it differs from ``n_of_kbest`` the selection
    is repeated on the same inputs with that count as the new ``k``.

    The min, max and mean of the retained scores are stored in Neptune under
    ``kbest_select_min_score``, ``kbest_select_max_score`` and
    ``kbest_select_mean_score``.

    Args:
        x_1: Training features; used to fit the selector.
        x_2: Test features; only transformed.
        y_1: Training labels.
        n_of_kbest: Number of features to request from ``SelectKBest``.

    Returns:
        ``[features_first, features_second]``: the selected columns of
        ``x_1`` and ``x_2``.

    Raises:
        ValueError: If no feature has a score of at least 1.
    """
    print(f"Shape before: {x_1.shape}\n")

    selector = SelectKBest(score_func=f_classif, k=n_of_kbest)
    fit = selector.fit(x_1, y_1)
    features_first = fit.transform(x_1)
    features_second = fit.transform(x_2)

    score_df = pd.DataFrame(fit.scores_, columns=["Scores"])
    print(f"Min score: {min(score_df.Scores)}, max score: {max(score_df.Scores)}, mean score: {np.mean(score_df.Scores)}\n")
    print(f"Shape after: {features_first.shape}\n")

    # Discard scores below 1 without mutating score_df in place.
    kept_scores = score_df.loc[~(score_df.Scores < 1), "Scores"]
    n_kept = len(kept_scores)
    if n_kept == 0:
        raise ValueError("No feature has a score of at least 1; nothing to select.")

    ## Save to Neptune
    single_record(min(kept_scores), 'kbest_select_min_score')
    single_record(max(kept_scores), 'kbest_select_max_score')
    single_record(np.mean(kept_scores), 'kbest_select_mean_score')

    if n_kept != n_of_kbest:
        # Repeat on this call's own inputs (not on notebook globals) with the
        # corrected k. The scores do not depend on k, so the repeated call
        # counts the same n_kept and returns without recursing further.
        return kbest_select(x_1, x_2, y_1, n_kept)
    return [features_first, features_second]

In [40]:
# Univariate feature selection on the scaled data, starting from k features.
kbest_train, kbest_test = kbest_select(
    train_std_minmax,
    test_std_minmax,
    train_labels_ravel,
    n_of_kbest=k,
)

Shape before: (3750, 10000)

Min score: 1.7567042700994732e-09, max score: 17.321255892491074, mean score: 1.000380011727585

Shape after: (3750, 3333)

Shape before: (3750, 10000)

Min score: 1.7567042700994732e-09, max score: 17.321255892491074, mean score: 1.000380011727585

Shape after: (3750, 3177)



In [43]:
def pca_select(x_1, x_2: np.array, n_components: int = 100) -> list:
    """Project both feature sets onto principal components fitted on ``x_1``.

    The PCA is fitted on ``x_1`` only and then applied to ``x_1`` and ``x_2``.
    Its ``random_state`` is taken from the module-level ``seed``. The
    explained-variance ratios are printed and logged to Neptune.

    Args:
        x_1: Training features; used to fit the PCA.
        x_2: Test features; only transformed.
        n_components: Number of principal components to keep (default 100,
            the value that was previously hard-coded).

    Returns:
        ``[features_first, features_second]``: the transformed ``x_1`` and
        ``x_2``.
    """
    print(f"Shape before transforamtion: {x_1.shape}\n")

    pca = PCA(n_components=n_components, random_state=seed)
    fit = pca.fit(x_1)
    features_first = fit.transform(x_1)
    features_second = fit.transform(x_2)

    print(f"Explained Variance: \n{fit.explained_variance_ratio_}\n")
    print(f"Shape after transormation: {features_first.shape}")

    send_data_neptune(fit.explained_variance_ratio_, "explained_variance_ration")

    return [features_first, features_second]

In [44]:
# Reduce the k-best features with PCA fitted on the training set.
pca_train, pca_test = pca_select(kbest_train, kbest_test)

Shape before transforamtion: (3750, 3177)

Explained Variance: 
[0.00119912 0.00115652 0.00114973 0.00114476 0.00114238 0.00113942
 0.00113693 0.00112986 0.00112779 0.00112082 0.0011182  0.00111573
 0.00111269 0.00111089 0.00110794 0.00110688 0.00109969 0.00109535
 0.00109378 0.00109121 0.00108836 0.00108419 0.00108334 0.00108155
 0.00107696 0.0010749  0.00107404 0.0010718  0.00106935 0.001067
 0.00106293 0.00106    0.00105852 0.00105635 0.0010534  0.00105165
 0.00105014 0.00104801 0.00104623 0.00104508 0.00104468 0.00103807
 0.00103631 0.00103511 0.00103434 0.00102916 0.0010273  0.001026
 0.00102531 0.00102077 0.00101904 0.00101632 0.00101535 0.00101456
 0.0010118  0.00100976 0.00100892 0.0010047  0.00100352 0.00099885
 0.00099852 0.00099734 0.00099596 0.00099475 0.00099243 0.00099103
 0.00098759 0.00098466 0.00098348 0.00097789 0.00097721 0.00097503
 0.00097383 0.00097047 0.00096941 0.00096651 0.000966   0.00096494
 0.00096335 0.00096063 0.00095943 0.00095832 0.00095427 0.0009528
 0.

In [47]:
def rfe_select(X_1, X_2, y_1: np.array, n_features_to_select: int = 5) -> list:
    """Select features by recursive feature elimination with a linear SVC.

    ``RFE`` is fitted on ``X_1``/``y_1`` and the fitted selector is applied to
    both ``X_1`` and ``X_2``. The SVC's ``random_state`` is taken from the
    module-level ``seed``. The feature ranking is printed and logged to
    Neptune.

    Args:
        X_1: Training features; used to fit the selector.
        X_2: Test features; only transformed.
        y_1: Training labels.
        n_features_to_select: Number of features RFE keeps (default 5, the
            value that was previously hard-coded).

    Returns:
        ``[first_features, second_features]``: the selected columns of
        ``X_1`` and ``X_2``.
    """
    print(f"Shape before transformation: {X_1.shape}\n")

    svc = SVC(kernel="linear", C=1, random_state=seed)
    rfe = RFE(estimator=svc, n_features_to_select=n_features_to_select)
    fit = rfe.fit(X_1, y_1)
    first_features = fit.transform(X_1)
    second_features = fit.transform(X_2)

    print(f"Feature Ranking: \n{fit.ranking_}\n")
    print(f"Shape after: {first_features.shape}\n")

    send_data_neptune(fit.ranking_, "fit-ranking")

    return [first_features, second_features]

In [48]:
# Recursive feature elimination on the PCA components.
rfe_train, rfe_test = rfe_select(pca_train, pca_test, train_labels_ravel)

Shape before transformation: (3750, 100)

Feature Ranking: 
[ 1  1  1 48 24 50  1 72 81  7 60 21 19 91  4  6 73 32 22  2 63 18 89  9
 65 39 53 64 11 42 12 66 16 40 61 33 77 45 15 87 82 62  1 68 20 23 95 34
 74 44 67 47 94 35 56 25 80 86 41 75 49 52 10 83  5 84  8 71 26 31 96 76
 69 79 59 55 28 85 29 27 88 90 57 30  3 58 14 13 38 43 37 46 78 36 17 92
 70 93 51 54]

Shape after: (3750, 5)



In [56]:
def random_sampling(x_1: np.array, y_1: np.array) -> list:
    """Resample ``x_1``/``y_1`` by random over- and then under-sampling.

    A two-step pipeline applies ``RandomOverSampler(sampling_strategy=0.2)``
    followed by ``RandomUnderSampler(sampling_strategy=0.5)``. Every value of
    the resampled feature matrix (row by row) and every resampled label is
    then logged to Neptune under ``"X-resampled"`` and ``"y-resampled"``.

    Returns:
        ``[x_resampled, y_resampled]``.
    """
    resampler = Pipeline(steps=[
        ('o', RandomOverSampler(sampling_strategy=0.2)),
        ('u', RandomUnderSampler(sampling_strategy=0.5)),
    ])
    x_resampled, y_resampled = resampler.fit_resample(x_1, y_1)

    # Flatten row by row so each feature value becomes one logged entry.
    flattened_values = [value for row in x_resampled for value in row]

    send_data_neptune(flattened_values, "X-resampled")
    send_data_neptune(y_resampled, "y-resampled")

    return [x_resampled, y_resampled]

In [57]:
# Resample the RFE-selected training features together with their labels.
x_resampled, y_resampled = random_sampling(rfe_train, train_labels_ravel)


NameError: name 'RandomOverSampler' is not defined

In [58]:
def save_data(train_x: np.array, test_x: np.array, train_y: np.array,
              output_dir: str = 'project_data') -> None:
    """Save the processed arrays as ``.npy`` files inside ``output_dir``.

    Writes ``processed_train_X.npy``, ``processed_test_X.npy`` and
    ``processed_train_y.npy``. The directory is created when it does not
    exist yet, so a fresh checkout does not fail on the first save.

    Args:
        train_x: Processed training features.
        test_x: Processed test features.
        train_y: Training labels matching ``train_x``.
        output_dir: Target directory (default ``'project_data'``).
    """
    os.makedirs(output_dir, exist_ok=True)

    np.save(os.path.join(output_dir, 'processed_train_X.npy'), train_x)
    np.save(os.path.join(output_dir, 'processed_test_X.npy'), test_x)
    np.save(os.path.join(output_dir, 'processed_train_y.npy'), train_y)

    print("Saving has been completed.")

In [59]:
# Persist the resampled training set and the RFE-selected test features.
save_data(x_resampled, rfe_test, y_resampled)

NameError: name 'x_resampled' is not defined