In [1]:
from google.colab import files
src = list(files.upload().values())[0]
open('handling_outliers.py','wb').write(src)

Saving handling_outliers.py to handling_outliers.py


1844

In [2]:
#imports
import pandas as pd
import pprint
import numpy as np
from handling_outliers import removing_iqr, removing_percentiles, zscore_outlier, modified_z_score_outlier, count_outliers, mask_outliers, replace_missing_values
from sklearn.preprocessing import RobustScaler, binarize
from sklearn.dummy import DummyClassifier
from sklearn.decomposition import PCA, KernelPCA
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [3]:
#loading
X = pd.read_csv('/content/drive/MyDrive/train_data.csv', header = None)
y = pd.read_csv('/content/drive/MyDrive/train_labels.csv', header = None)

In [None]:
#outliers
X_rem = removing_iqr(pd.DataFrame(X))
counted = count_outliers(X_rem)
contamination = counted[0].sum() / (X_rem.shape[0] * X_rem.shape[1]) * 100
masked = mask_outliers(pd.DataFrame(X), X_rem)
replaced = replace_missing_values(masked, 5)
print(counted)
print(f'Dataset contamination: {round(contamination, 2)} %')
print('_'*10)

In [5]:
#baseline
methods = ["most_frequent", "prior", "stratified", "uniform"]
scores = {}

for method in methods:
  dummy_clf = DummyClassifier(strategy=method)
  dummy_clf.fit(X, y)
  y_pred = dummy_clf.predict(X)
  precision = precision_score(y, y_pred)
  recall = recall_score(y, y_pred)
  acc = balanced_accuracy_score(y, y_pred)
  f1 = f1_score(y, y_pred, average='weighted')
  roc_score = roc_auc_score(y, y_pred, average='weighted')
  scores[method] = {'acc':dummy_clf.score(X, y), 
                    'precision':precision, 
                    'balanced accuracy':acc,
                    'recall':recall, 
                    'accuracy_balanced':acc, 
                    'f1_weighted':f1, 
                    'roc_score':roc_score}

pprint.pprint(scores)

{'most_frequent': {'acc': 0.9,
                   'accuracy_balanced': 0.5,
                   'balanced accuracy': 0.5,
                   'f1_weighted': 0.8526315789473684,
                   'precision': 0.9,
                   'recall': 1.0,
                   'roc_score': 0.5},
 'prior': {'acc': 0.9,
           'accuracy_balanced': 0.5,
           'balanced accuracy': 0.5,
           'f1_weighted': 0.8526315789473684,
           'precision': 0.9,
           'recall': 1.0,
           'roc_score': 0.5},
 'stratified': {'acc': 0.8173333333333334,
                'accuracy_balanced': 0.493037037037037,
                'balanced accuracy': 0.493037037037037,
                'f1_weighted': 0.8204082133086594,
                'precision': 0.8986188657067293,
                'recall': 0.906074074074074,
                'roc_score': 0.49303703703703705},
 'uniform': {'acc': 0.48106666666666664,
             'accuracy_balanced': 0.5103703703703704,
             'balanced accuracy': 0.510370

In [None]:
scaler = RobustScaler()
X = scaler.fit_transform(X)

In [4]:
#encoding
y = pd.DataFrame(binarize(y))

In [None]:
#dimensionality reduction using PCA
pca = PCA(n_components=0.95, whiten=True)
X = pca.fit_transform(X)
X.shape

In [5]:
#dimensionality reduction using kPCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

pipe = Pipeline(
    steps=[
           ('kpca', KernelPCA(n_components = 1012)),
           ("classifier", LogisticRegression())
           ]
    )

In [8]:
model_params = [{"kpca__gamma" : np.linspace(0.03, 0.05, 2),
                 "kpca__kernel" : [
                                   #'linear',
                                  'poly'
                                  , 'rbf'
                                  , 'sigmoid'
                                  , 'cosine'
                                  # , 'precomputed'
                 ]},
                # {"classifier" : [LogisticRegression()],
                #  "classifier__solver" : ['sag', 'saga', 'liblinear'],
                # "classifier__penalty" : ['l1', 'l2', 'elasticnet', 'none'],
                # "classifier__C" : [np.logspace(1, 4, 10)],
                # "classifier__class_weight" : ['balanced', {"0" : 0.1, "1" : 0.9}]},
                # {"classifier" : [BernoulliNB()],
                # "classifier__alpha" : [np.linspace(1, 10, 100)]},
                {"classifier" : [RandomForestClassifier()],
                "classifier__n_estimators" : [10, 100, 1000],
                "classifier__criterion" : ['gini', 'entropy', 'log_loss'],
                "classifier__max_features" : ['auto', 'sqrt', 'log2', 'none'],
                "classifier__class_weight" : ['balanced', [{0 : 1}, {1 : 9}]]},
                {"classifier" : [KNeighborsClassifier()],
                "classifier__weights" : ['uniform', 'distance'],
                "classifier__algorithm" : ['auto', 'kd-tree', 'brute'],
                "classifier__n_neighbors" : [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]},
                # {"classifier" : [SVC()],
                # "classifier__C" : [np.logspace(1, 10000, 1000)],
                # "classifier__class_weight" : ['balanced', [{0 : 1}, {1 : 9}]]},
                # {"classifier" : [MLPClassifier()]},
                {"classifier" : [DecisionTreeClassifier()],
                "classifier__criterion" : ['gini', 'entropy', 'log_loss'],
                "classifier__splitter" : ['best', 'random'],
                "classifier__max_features" : ['auto', 'sqrt', 'log2', 'none'],
                "classifier__class_weight" : ['balanced', [{0 : 1}, {1 : 9}]]},
                # {"classifier" : [ExtraTreesClassifier()],
                # "classifier__n_estimators" : [10, 100, 1000],
                # "classifier__criterion" : ['gini', 'entropy', 'log_loss'],
                # "classifier__max_features" : ['sqrt', 'log2', 'none'],
                # "classifier__class_weight" : ['balanced', [{0 : 1}, {1 : 9}]]}
                ]

In [13]:
#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, shuffle=True)

#SMOTE
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_sm, y_sm = sm.fit_resample(X_train, y_train)

metrics = []

search = GridSearchCV(pipe, model_params, scoring="balanced_accuracy", refit=True, verbose=3)

In [14]:
result = search.fit(X_sm, y_sm.values.flatten())

Fitting 5 folds for each of 188 candidates, totalling 940 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 1/5] END kpca__gamma=0.03, kpca__kernel=poly;, score=0.856 total time=  33.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 2/5] END kpca__gamma=0.03, kpca__kernel=poly;, score=0.869 total time=  29.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 3/5] END kpca__gamma=0.03, kpca__kernel=poly;, score=0.864 total time=  28.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 4/5] END kpca__gamma=0.03, kpca__kernel=poly;, score=0.876 total time=  28.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 5/5] END kpca__gamma=0.03, kpca__kernel=poly;, score=0.877 total time=  28.1s
[CV 1/5] END kpca__gamma=0.03, kpca__kernel=rbf;, score=0.500 total time=  19.0s
[CV 2/5] END kpca__gamma=0.03, kpca__kernel=rbf;, score=0.500 total time=  18.5s
[CV 3/5] END kpca__gamma=0.03, kpca__kernel=rbf;, score=0.500 total time=  19.1s
[CV 4/5] END kpca__gamma=0.03, kpca__kernel=rbf;, score=0.500 total time=  18.1s
[CV 5/5] END kpca__gamma=0.03, kpca__kernel=rbf;, score=0.500 total time=  18.6s
[CV 1/5] END kpca__gamma=0.03, kpca__kernel=sigmoid;, score=0.769 total time=  26.3s
[CV 2/5] END kpca__gamma=0.03, kpca__kernel=sigmoid;, score=0.758 total time=  27.3s
[CV 3/5] END kpca__gamma=0.03, kpca__kernel=sigmoid;, score=0.785 total time=  26.4s
[CV 4/5] END kpca__gamma=0.03, kpca__kernel=sigmoid;, score=0.772 total time=  29.0s
[CV 5/5] END kpca__gamma=0.03, kpca__kernel=sigmoid;, score=0.775 total time=  27.6s
[CV 1/5] END kpca__gamma=0.03, kpca__kernel=cosine;, score=0.743 total time=  28.1s
[CV 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 1/5] END kpca__gamma=0.05, kpca__kernel=poly;, score=0.863 total time=  29.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 2/5] END kpca__gamma=0.05, kpca__kernel=poly;, score=0.864 total time=  29.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 3/5] END kpca__gamma=0.05, kpca__kernel=poly;, score=0.863 total time=  28.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 4/5] END kpca__gamma=0.05, kpca__kernel=poly;, score=0.877 total time=  33.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 5/5] END kpca__gamma=0.05, kpca__kernel=poly;, score=0.884 total time=  30.7s
[CV 1/5] END kpca__gamma=0.05, kpca__kernel=rbf;, score=0.500 total time=  20.5s
[CV 2/5] END kpca__gamma=0.05, kpca__kernel=rbf;, score=0.500 total time=  19.0s
[CV 3/5] END kpca__gamma=0.05, kpca__kernel=rbf;, score=0.500 total time=  19.7s
[CV 4/5] END kpca__gamma=0.05, kpca__kernel=rbf;, score=0.500 total time=  19.7s
[CV 5/5] END kpca__gamma=0.05, kpca__kernel=rbf;, score=0.500 total time=  18.8s
[CV 1/5] END kpca__gamma=0.05, kpca__kernel=sigmoid;, score=0.769 total time=  28.3s
[CV 2/5] END kpca__gamma=0.05, kpca__kernel=sigmoid;, score=0.758 total time=  27.2s
[CV 3/5] END kpca__gamma=0.05, kpca__kernel=sigmoid;, score=0.785 total time=  27.0s
[CV 4/5] END kpca__gamma=0.05, kpca__kernel=sigmoid;, score=0.772 total time=  26.2s
[CV 5/5] END kpca__gamma=0.05, kpca__kernel=sigmoid;, score=0.775 total time=  28.1s
[CV 1/5] END kpca__gamma=0.05, kpca__kernel=cosine;, score=0.743 total time=  27.5s
[CV 

550 fits failed out of a total of 940.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/us

In [15]:
import joblib
file = 'grid_search_model_smote.sav'
joblib.dump(result, filename = file)

['grid_search_model_smote.sav']

In [22]:
search.best_score_, search.best_params_

(0.9920983698575672,
 {'classifier': RandomForestClassifier(class_weight='balanced', max_features='log2',
                         n_estimators=10),
  'classifier__class_weight': 'balanced',
  'classifier__criterion': 'gini',
  'classifier__max_features': 'log2',
  'classifier__n_estimators': 10})

In [24]:
pd.DataFrame(search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kpca__gamma,param_kpca__kernel,param_classifier,param_classifier__class_weight,param_classifier__criterion,param_classifier__max_features,...,param_classifier__splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,26.358068,2.039010,3.254061,0.067972,0.03,poly,,,,,...,,"{'kpca__gamma': 0.03, 'kpca__kernel': 'poly'}",0.855973,0.868600,0.863636,0.876482,0.877470,0.868432,0.008059,72
1,15.621855,0.363864,3.022769,0.019582,0.03,rbf,,,,,...,,"{'kpca__gamma': 0.03, 'kpca__kernel': 'rbf'}",0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.000000,77
2,24.062105,0.492367,3.271100,0.763097,0.03,sigmoid,,,,,...,,"{'kpca__gamma': 0.03, 'kpca__kernel': 'sigmoid'}",0.769046,0.758092,0.784585,0.771739,0.774704,0.771633,0.008570,73
3,24.399832,0.473915,3.137638,0.040823,0.03,cosine,,,,,...,,"{'kpca__gamma': 0.03, 'kpca__kernel': 'cosine'}",0.742516,0.736260,0.766798,0.761858,0.771739,0.755834,0.013928,75
4,27.122331,1.803361,3.232087,0.021268,0.05,poly,,,,,...,,"{'kpca__gamma': 0.05, 'kpca__kernel': 'poly'}",0.862894,0.863664,0.862648,0.877470,0.884387,0.870213,0.009025,71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,21.270841,0.330888,0.000000,0.000000,,,DecisionTreeClassifier(),"[{0: 1}, {1: 9}]",log_loss,sqrt,...,random,"{'classifier': DecisionTreeClassifier(), 'clas...",,,,,,,,105
184,21.208539,0.646932,0.000000,0.000000,,,DecisionTreeClassifier(),"[{0: 1}, {1: 9}]",log_loss,log2,...,best,"{'classifier': DecisionTreeClassifier(), 'clas...",,,,,,,,104
185,21.744296,1.034403,0.000000,0.000000,,,DecisionTreeClassifier(),"[{0: 1}, {1: 9}]",log_loss,log2,...,random,"{'classifier': DecisionTreeClassifier(), 'clas...",,,,,,,,103
186,22.010172,0.504091,0.000000,0.000000,,,DecisionTreeClassifier(),"[{0: 1}, {1: 9}]",log_loss,none,...,best,"{'classifier': DecisionTreeClassifier(), 'clas...",,,,,,,,136


