In [13]:
from google.colab import files
src = list(files.upload().values())[0]
open('handling_outliers.py','wb').write(src)

Saving handling_outliers.py to handling_outliers (1).py


1844

In [14]:
#imports
import pandas as pd
import pprint
import numpy as np
from handling_outliers import removing_iqr, removing_percentiles, zscore_outlier, modified_z_score_outlier, count_outliers, mask_outliers, replace_missing_values
from sklearn.preprocessing import RobustScaler, binarize
from sklearn.dummy import DummyClassifier
from sklearn.decomposition import PCA, KernelPCA
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score, roc_auc_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [15]:
#loading
X = pd.read_csv('/content/drive/MyDrive/train_data.csv', header = None)
y = pd.read_csv('/content/drive/MyDrive/train_labels.csv', header = None)

In [None]:
#outliers
X_rem = removing_iqr(pd.DataFrame(X))
counted = count_outliers(X_rem)
contamination = counted[0].sum() / (X_rem.shape[0] * X_rem.shape[1]) * 100
masked = mask_outliers(pd.DataFrame(X), X_rem)
replaced = replace_missing_values(masked, 5)
print(counted)
print(f'Dataset contamination: {round(contamination, 2)} %')
print('_'*10)

(0       82
1       77
2       51
3       62
4       80
        ..
3745    68
3746    65
3747    79
3748    78
3749    68
Length: 3750, dtype: int64,            0
3667  0.0105
2844  0.0099
2524  0.0098
2820  0.0097
3458  0.0096
...      ...
3204  0.0048
7     0.0048
909   0.0047
3594  0.0046
345   0.0046

[3750 rows x 1 columns])
Dataset contamination: 0.71 %
__________


In [16]:
#baseline
methods = ["most_frequent", "prior", "stratified", "uniform"]
scores = {}

for method in methods:
  dummy_clf = DummyClassifier(strategy=method)
  dummy_clf.fit(X, y)
  y_pred = dummy_clf.predict(X)
  precision = precision_score(y, y_pred)
  recall = recall_score(y, y_pred)
  acc = balanced_accuracy_score(y, y_pred)
  f1 = f1_score(y, y_pred, average='weighted')
  roc_score = roc_auc_score(y, y_pred, average='weighted')
  scores[method] = {'acc':dummy_clf.score(X, y), 
                    'precision':precision, 
                    'balanced accuracy':acc,
                    'recall':recall, 
                    'accuracy_balanced':acc, 
                    'f1_weighted':f1, 
                    'roc_score':roc_score}

pprint.pprint(scores)

{'most_frequent': {'acc': 0.9,
                   'accuracy_balanced': 0.5,
                   'balanced accuracy': 0.5,
                   'f1_weighted': 0.8526315789473684,
                   'precision': 0.9,
                   'recall': 1.0,
                   'roc_score': 0.5},
 'prior': {'acc': 0.9,
           'accuracy_balanced': 0.5,
           'balanced accuracy': 0.5,
           'f1_weighted': 0.8526315789473684,
           'precision': 0.9,
           'recall': 1.0,
           'roc_score': 0.5},
 'stratified': {'acc': 0.8194666666666667,
                'accuracy_balanced': 0.5127407407407407,
                'balanced accuracy': 0.5127407407407407,
                'f1_weighted': 0.824696019914174,
                'precision': 0.902547393364929,
                'recall': 0.9028148148148148,
                'roc_score': 0.5127407407407407},
 'uniform': {'acc': 0.4976,
             'accuracy_balanced': 0.48785185185185187,
             'balanced accuracy': 0.48785185185185187,

In [None]:
scaler = RobustScaler()
X = scaler.fit_transform(X)

In [17]:
#encoding
y = pd.DataFrame(binarize(y))

In [None]:
#dimensionality reduction using PCA
pca = PCA(n_components=0.95, whiten=True)
X = pca.fit_transform(X)
X.shape

(3750, 1012)

In [18]:
#dimensionality reduction using kPCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

pipe = Pipeline(
    steps=[
           ('kpca', KernelPCA(n_components = 1012)),
           ("classifier", LogisticRegression())
           ]
    )

In [27]:
# model_params = [
                # {"classifier" : [LogisticRegression()],
                #  "classifier__solver" : ['sag', 'saga', 'liblinear'],
                # "classifier__penalty" : ['l1', 'l2', 'elasticnet', 'none'],
                # "classifier__C" : C_value,
                # "classifier__class_weight" : ['balanced', {"0" : 0.1, "1" : 0.9}]}
                # ,
                # {"classifier" : [RandomForestClassifier()],
                # "classifier__n_estimators" : [10, 100, 1000],
                # "classifier__criterion" : ['gini', 'entropy', 'log_loss'],
                # "classifier__max_features" : ['auto', 'sqrt', 'log2', 'none'],
                # "classifier__class_weight" : ['balanced', [{0 : 1}, {1 : 9}]]},
                # {"classifier":[KNeighborsClassifier()],
                # "classifier__n_neighbors":[3,5,7,9,11,13,15,17,19,21],
                #  "classifier__weights": ['uniform', 'distance']}
                # ]

model_params = [{"kpca__gamma" : np.linspace(0.03, 0.05, 2),
                 "kpca__kernel" : [
                                   #'linear',
                                  'poly'
                                  , 'rbf'
                                  , 'sigmoid'
                                  , 'cosine'
                                  # , 'precomputed'
                 ]},
                {"classifier" : [LogisticRegression()],
                 "classifier__solver" : ['sag', 'saga', 'liblinear'],
                "classifier__penalty" : ['l1', 'l2', 'elasticnet', 'none'],
                "classifier__C" : [np.logspace(1, 4, 10)],
                "classifier__class_weight" : ['balanced', {"0" : 0.1, "1" : 0.9}]},
                {"classifier" : [BernoulliNB()],
                "classifier__alpha" : [np.linspace(1, 10, 100)]},
                {"classifier" : [RandomForestClassifier()],
                "classifier__n_estimators" : [10, 100, 1000],
                "classifier__criterion" : ['gini', 'entropy', 'log_loss'],
                "classifier__max_features" : ['auto', 'sqrt', 'log2', 'none'],
                "classifier__class_weight" : ['balanced', [{0 : 1}, {1 : 9}]]},
                {"classifier" : [KNeighborsClassifier()],
                "classifier__weights" : ['uniform', 'distance'],
                "classifier__algorithm" : ['auto', 'kd-tree', 'brute'],
                "classifier__n_neighbors" : [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]},
                {"classifier" : [SVC()],
                "classifier__C" : [np.logspace(1, 10000, 1000)],
                "classifier__class_weight" : ['balanced', [{0 : 1}, {1 : 9}]]},
                {"classifier" : [MLPClassifier()]},
                {"classifier" : [DecisionTreeClassifier()],
                "classifier__criterion" : ['gini', 'entropy', 'log_loss'],
                "classifier__splitter" : ['best', 'random'],
                "classifier__max_features" : ['auto', 'sqrt', 'log2', 'none'],
                "classifier__class_weight" : ['balanced', [{0 : 1}, {1 : 9}]]},
                {"classifier" : [ExtraTreesClassifier()],
                "classifier__n_estimators" : [10, 100, 1000],
                "classifier__criterion" : ['gini', 'entropy', 'log_loss'],
                "classifier__max_features" : ['sqrt', 'log2', 'none'],
                "classifier__class_weight" : ['balanced', [{0 : 1}, {1 : 9}]]}
                ]

  return _nx.power(base, y)


In [28]:
scores = []

#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, shuffle=True)

search = GridSearchCV(pipe, model_params, scoring="balanced_accuracy", refit=True, verbose=3)

In [29]:
result = search.fit(X_train, y_train.values.flatten())

Fitting 5 folds for each of 270 candidates, totalling 1350 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 1/5] END kpca__gamma=0.03, kpca__kernel=poly;, score=0.692 total time=  12.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 2/5] END kpca__gamma=0.03, kpca__kernel=poly;, score=0.690 total time=  12.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 3/5] END kpca__gamma=0.03, kpca__kernel=poly;, score=0.708 total time=  12.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 4/5] END kpca__gamma=0.03, kpca__kernel=poly;, score=0.668 total time=  12.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 5/5] END kpca__gamma=0.03, kpca__kernel=poly;, score=0.689 total time=  12.8s
[CV 1/5] END kpca__gamma=0.03, kpca__kernel=rbf;, score=0.500 total time=   6.3s
[CV 2/5] END kpca__gamma=0.03, kpca__kernel=rbf;, score=0.500 total time=   6.2s
[CV 3/5] END kpca__gamma=0.03, kpca__kernel=rbf;, score=0.500 total time=   6.0s
[CV 4/5] END kpca__gamma=0.03, kpca__kernel=rbf;, score=0.500 total time=   6.1s
[CV 5/5] END kpca__gamma=0.03, kpca__kernel=rbf;, score=0.500 total time=   6.2s
[CV 1/5] END kpca__gamma=0.03, kpca__kernel=sigmoid;, score=0.517 total time=  11.9s
[CV 2/5] END kpca__gamma=0.03, kpca__kernel=sigmoid;, score=0.565 total time=  11.2s
[CV 3/5] END kpca__gamma=0.03, kpca__kernel=sigmoid;, score=0.576 total time=  11.2s
[CV 4/5] END kpca__gamma=0.03, kpca__kernel=sigmoid;, score=0.539 total time=  11.1s
[CV 5/5] END kpca__gamma=0.03, kpca__kernel=sigmoid;, score=0.575 total time=  10.9s
[CV 1/5] END kpca__gamma=0.03, kpca__kernel=cosine;, score=0.500 total time=  11.6s
[CV 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 1/5] END kpca__gamma=0.05, kpca__kernel=poly;, score=0.686 total time=  12.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 2/5] END kpca__gamma=0.05, kpca__kernel=poly;, score=0.692 total time=  12.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 3/5] END kpca__gamma=0.05, kpca__kernel=poly;, score=0.708 total time=  12.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 4/5] END kpca__gamma=0.05, kpca__kernel=poly;, score=0.668 total time=  12.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV 5/5] END kpca__gamma=0.05, kpca__kernel=poly;, score=0.699 total time=  12.4s
[CV 1/5] END kpca__gamma=0.05, kpca__kernel=rbf;, score=0.500 total time=   6.3s
[CV 2/5] END kpca__gamma=0.05, kpca__kernel=rbf;, score=0.500 total time=   6.2s
[CV 3/5] END kpca__gamma=0.05, kpca__kernel=rbf;, score=0.500 total time=   6.1s
[CV 4/5] END kpca__gamma=0.05, kpca__kernel=rbf;, score=0.500 total time=   6.2s
[CV 5/5] END kpca__gamma=0.05, kpca__kernel=rbf;, score=0.500 total time=   6.3s
[CV 1/5] END kpca__gamma=0.05, kpca__kernel=sigmoid;, score=0.517 total time=  11.1s
[CV 2/5] END kpca__gamma=0.05, kpca__kernel=sigmoid;, score=0.565 total time=  10.8s
[CV 3/5] END kpca__gamma=0.05, kpca__kernel=sigmoid;, score=0.576 total time=  11.2s
[CV 4/5] END kpca__gamma=0.05, kpca__kernel=sigmoid;, score=0.539 total time=  10.9s
[CV 5/5] END kpca__gamma=0.05, kpca__kernel=sigmoid;, score=0.575 total time=  11.2s
[CV 1/5] END kpca__gamma=0.05, kpca__kernel=cosine;, score=0.500 total time=  11.3s
[CV 

895 fits failed out of a total of 1350.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 449, in _check_s

In [31]:
import joblib
file = 'grid_search_model.sav'
joblib.dump(result, filename = file)

['grid_search_model.sav']