In [47]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import roc_auc_score, mean_squared_error, \
    accuracy_score, precision_score, recall_score, f1_score

In [5]:
df = pd.read_csv("aps_failure_training_set.csv",skiprows=20,na_values="na")
df['class']=df['class'].map({'pos':1, 'neg':0})
df = df.apply(lambda col: pd.to_numeric(col, downcast = 'float'))

In [6]:
df.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,0.0,76698.0,,2130706000.0,280.0,0.0,0.0,0.0,0.0,0.0,...,1240520.0,493384.0,721044.0,469792.0,339156.0,157956.0,73224.0,0.0,0.0,0.0
1,0.0,33058.0,,0.0,,0.0,0.0,0.0,0.0,0.0,...,421400.0,178064.0,293306.0,245416.0,133654.0,81140.0,97576.0,1500.0,0.0,0.0
2,0.0,41040.0,,228.0,100.0,0.0,0.0,0.0,0.0,0.0,...,277378.0,159812.0,423992.0,409564.0,320746.0,158022.0,95128.0,514.0,0.0,0.0
3,0.0,12.0,0.0,70.0,66.0,0.0,10.0,0.0,0.0,0.0,...,240.0,46.0,58.0,44.0,10.0,0.0,0.0,0.0,4.0,32.0
4,0.0,60874.0,,1368.0,458.0,0.0,0.0,0.0,0.0,0.0,...,622012.0,229790.0,405298.0,347188.0,286954.0,311560.0,433954.0,1218.0,0.0,0.0


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Columns: 171 entries, class to eg_000
dtypes: float32(171)
memory usage: 39.1 MB


In [20]:
preprocessing = Pipeline([
('impute', SimpleImputer(strategy='most_frequent')),
('scale', StandardScaler())
])

In [40]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='class'), df['class'],test_size=0.3,stratify=df['class'])

In [57]:
def quality_report(prediction, actual, verbose=True):
    report_str = "Accuracy: \t {:.3f}\n" +\
                 "Precision: \t {:.3f}\n" +\
                 "Recall: \t {:.3f}\n" +\
                 "f1_score: \t {:.3f}\n" +\
                 "ROC-AUC: \t {:.3f}"
    
    acc = accuracy_score(prediction, actual)
    precision = precision_score(prediction, actual)
    recall = recall_score(prediction, actual)
    f1 = f1_score(prediction, actual)
    rocauc = roc_auc_score(prediction, actual)
    
    quality_list = [acc, precision, recall, f1, rocauc]
    
    if verbose:
        print("\n=== Quality Report ===")
        print(report_str.format(
            accuracy_score(prediction, actual),
            precision_score(prediction, actual),
            recall_score(prediction, actual),
            f1_score(prediction, actual),
            roc_auc_score(prediction, actual)
        ))
        print("======================\n")
    
    return quality_list

# modeling

## logistic regresstion

In [58]:
from sklearn.linear_model import LogisticRegressionCV
model_logistic = make_pipeline(preprocessing, LogisticRegressionCV(cv=StratifiedKFold(3), max_iter=10000))

In [96]:
%%time
model_logistic.fit(X_train, y_train);
quality_report(model_logistic.predict(X_test), y_test)


=== Quality Report ===
Accuracy: 	 0.990
Precision: 	 0.510
Recall: 	 0.814
f1_score: 	 0.627
ROC-AUC: 	 0.903

Wall time: 4min 28s


[0.9898888888888889,
 0.51,
 0.8138297872340425,
 0.6270491803278688,
 0.9027884619978881]

# Nearest neighbors

на моей машине обучить KNN на всем трейне не получилось за разумное время.
принял решение выкинуть лишние записи(выборка несбалансированная)

In [69]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [75]:
%%time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
model_knn = make_pipeline(preprocessing, KNeighborsClassifier())
params={'kneighborsclassifier__n_neighbors': [i+1 for i in range(100)]}
grid = RandomizedSearchCV(model_knn,params, cv=StratifiedKFold(3), n_iter=50, verbose= True, n_jobs=-1)
grid.fit(X_resampled, y_resampled)
quality_report(y_test, grid.predict(X_test))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   17.0s finished



=== Quality Report ===
Accuracy: 	 0.954
Precision: 	 0.256
Recall: 	 0.920
f1_score: 	 0.400
ROC-AUC: 	 0.937

Wall time: 26 s


[0.9540555555555555,
 0.2557924003707136,
 0.92,
 0.4002900652646845,
 0.937316384180791]

# Tree

In [72]:
from sklearn.tree import DecisionTreeClassifier

In [84]:
model_tree = make_pipeline(preprocessing, DecisionTreeClassifier())
params={'decisiontreeclassifier__max_depth': [i+1 for i in range(100)],
       'decisiontreeclassifier__splitter':["best", "random"],
       'decisiontreeclassifier__min_samples_leaf':[i*5+1 for i in range(10)]}
grid = RandomizedSearchCV(model_tree,params, cv=StratifiedKFold(5), n_iter=40, verbose= True, n_jobs=-1)
grid.fit(X_resampled, y_resampled)
quality_report(y_test, grid.predict(X_test))

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    9.4s finished



=== Quality Report ===
Accuracy: 	 0.951
Precision: 	 0.244
Recall: 	 0.927
f1_score: 	 0.386
ROC-AUC: 	 0.939



[0.9508888888888889,
 0.24385964912280703,
 0.9266666666666666,
 0.3861111111111111,
 0.9389830508474576]

In [85]:
grid.best_params_

{'decisiontreeclassifier__splitter': 'best',
 'decisiontreeclassifier__min_samples_leaf': 21,
 'decisiontreeclassifier__max_depth': 64}

# SVM

In [88]:
from sklearn.svm import SVC
model_svm = make_pipeline(preprocessing, SVC())
params={'svc__C': np.linspace(0.1,10,50),
       'svc__kernel':["linear", "poly", "rbf"]
       }
grid = RandomizedSearchCV(model_svm,params, cv=StratifiedKFold(3), n_iter=40, verbose= True, n_jobs=-1)
grid.fit(X_resampled, y_resampled)
quality_report(y_test, grid.predict(X_test))

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   10.8s finished



=== Quality Report ===
Accuracy: 	 0.955
Precision: 	 0.262
Recall: 	 0.947
f1_score: 	 0.410
ROC-AUC: 	 0.951



[0.9545555555555556,
 0.26151012891344383,
 0.9466666666666667,
 0.4098124098124098,
 0.950677966101695]

In [91]:
grid.best_params_

{'svc__kernel': 'rbf', 'svc__C': 6.5653061224489795}

In [95]:
pd.DataFrame(grid.cv_results_).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svc__kernel,param_svc__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
30,0.210844,0.012614,0.030523,0.004908,rbf,6.56531,"{'svc__kernel': 'rbf', 'svc__C': 6.56530612244...",0.939286,0.953571,0.957143,0.953571,0.953571,0.951429,0.006227,1
12,0.190297,0.005935,0.027513,0.000809,rbf,5.95918,"{'svc__kernel': 'rbf', 'svc__C': 5.95918367346...",0.939286,0.953571,0.957143,0.953571,0.95,0.950714,0.006145,2
2,0.204054,0.012403,0.027726,0.002554,rbf,8.58571,"{'svc__kernel': 'rbf', 'svc__C': 8.58571428571...",0.939286,0.953571,0.957143,0.95,0.953571,0.950714,0.006145,3
3,0.22831,0.02511,0.035749,0.003675,rbf,7.17143,"{'svc__kernel': 'rbf', 'svc__C': 7.17142857142...",0.939286,0.953571,0.957143,0.95,0.953571,0.950714,0.006145,3
29,0.211349,0.014782,0.035917,0.009299,rbf,7.97959,"{'svc__kernel': 'rbf', 'svc__C': 7.97959183673...",0.939286,0.953571,0.957143,0.95,0.953571,0.950714,0.006145,3
10,0.201061,0.021134,0.031766,0.003797,rbf,7.37347,"{'svc__kernel': 'rbf', 'svc__C': 7.37346938775...",0.939286,0.953571,0.957143,0.95,0.953571,0.950714,0.006145,3
21,0.218024,0.017314,0.033902,0.006331,rbf,7.57551,"{'svc__kernel': 'rbf', 'svc__C': 7.57551020408...",0.939286,0.953571,0.957143,0.95,0.953571,0.950714,0.006145,3
33,0.193567,0.007022,0.026637,0.00119,rbf,9.59592,"{'svc__kernel': 'rbf', 'svc__C': 9.59591836734...",0.935714,0.953571,0.957143,0.95,0.953571,0.95,0.007491,8
7,0.191597,0.00869,0.027815,0.000676,rbf,4.74694,"{'svc__kernel': 'rbf', 'svc__C': 4.74693877551...",0.939286,0.953571,0.957143,0.95,0.95,0.95,0.005976,8
25,0.200786,0.018607,0.0284,0.001322,rbf,4.5449,"{'svc__kernel': 'rbf', 'svc__C': 4.54489795918...",0.939286,0.953571,0.957143,0.95,0.95,0.95,0.005976,8


# Выводы

SVM позволяет прилично прибавить в качестве, ROC 0.95 против 0,88 у логистической регрессии

## парочка доп. экспериментов

попробуем логистическую регрессию отсчитать также на сбалансированной выборке

In [97]:
%%time
model_logistic2 = make_pipeline(preprocessing, LogisticRegressionCV(cv=StratifiedKFold(3), max_iter=10000))
model_logistic2.fit(X_resampled, y_resampled);
quality_report(model_logistic2.predict(X_test), y_test)


=== Quality Report ===
Accuracy: 	 0.959
Precision: 	 0.937
Recall: 	 0.280
f1_score: 	 0.431
ROC-AUC: 	 0.640

Wall time: 5.97 s


[0.9588333333333333,
 0.9366666666666666,
 0.28015952143569295,
 0.4313123561013047,
 0.6395208385551118]

пропробуем продвинутые методы андерсэмплинга

In [98]:
def score_dataset(X_resampled, y_resampled):    
    model_svm = make_pipeline(preprocessing, SVC())
    params={'svc__C': np.linspace(0.1,10,50),
            'svc__kernel':["linear", "poly","rbf"]
           }
    grid = RandomizedSearchCV(model_svm,params, cv=StratifiedKFold(3), n_iter=40, verbose= True, n_jobs=-1)
    grid.fit(X_resampled, y_resampled)
    quality_report(y_test, grid.predict(X_test))

In [102]:
%%time
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=0)
X_resampled, y_resampled = cc.fit_resample(SimpleImputer(strategy='most_frequent').fit_transform(X_train), y_train)
score_dataset(X_resampled, y_resampled)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   24.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   40.0s finished



=== Quality Report ===
Accuracy: 	 0.593
Precision: 	 0.034
Recall: 	 0.847
f1_score: 	 0.065
ROC-AUC: 	 0.718

Wall time: 11min 56s


In [103]:
%%time
from imblearn.under_sampling import NearMiss
nm1 = NearMiss(version=1)
X_resampled, y_resampled = nm1.fit_resample(SimpleImputer(strategy='most_frequent').fit_transform(X_train), y_train)
score_dataset(X_resampled, y_resampled)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    4.3s finished



=== Quality Report ===
Accuracy: 	 0.340
Precision: 	 0.025
Recall: 	 0.997
f1_score: 	 0.048
ROC-AUC: 	 0.663

Wall time: 10.7 s


In [104]:
%%time
from imblearn.under_sampling import NearMiss
nm2 = NearMiss(version=2)
X_resampled, y_resampled = nm2.fit_resample(SimpleImputer(strategy='most_frequent').fit_transform(X_train), y_train)
score_dataset(X_resampled, y_resampled)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   12.9s finished



=== Quality Report ===
Accuracy: 	 0.216
Precision: 	 0.019
Recall: 	 0.903
f1_score: 	 0.037
ROC-AUC: 	 0.554

Wall time: 30.6 s


In [105]:
%%time
from imblearn.under_sampling import NearMiss
nm3 = NearMiss(version=3)
X_resampled, y_resampled = nm3.fit_resample(SimpleImputer(strategy='most_frequent').fit_transform(X_train), y_train)
score_dataset(X_resampled, y_resampled)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   48.0s finished



=== Quality Report ===
Accuracy: 	 0.163
Precision: 	 0.015
Recall: 	 0.753
f1_score: 	 0.029
ROC-AUC: 	 0.453

Wall time: 59 s


In [106]:
%%time
from imblearn.under_sampling import EditedNearestNeighbours
enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_resample(SimpleImputer(strategy='most_frequent').fit_transform(X_train), y_train)
score_dataset(X_resampled, y_resampled)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 25.8min finished



=== Quality Report ===
Accuracy: 	 0.990
Precision: 	 0.664
Recall: 	 0.763
f1_score: 	 0.710
ROC-AUC: 	 0.878

Wall time: 27min 25s


попробуем SVM на всем датасете

In [108]:
%%time
from sklearn.svm import SVC
model_svm = make_pipeline(preprocessing, SVC())
params={'svc__C': np.linspace(0.1,10,50),
       'svc__kernel':["linear", "poly", "rbf"]
       }
grid = RandomizedSearchCV(model_svm,params, cv=StratifiedKFold(3), n_iter=40, verbose= True, n_jobs=-1)
grid.fit(X_train, y_train)
quality_report(y_test, grid.predict(X_test))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 13.4min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 58.5min finished



=== Quality Report ===
Accuracy: 	 0.991
Precision: 	 0.763
Recall: 	 0.677
f1_score: 	 0.717
ROC-AUC: 	 0.837

Wall time: 58min 49s


[0.9911111111111112,
 0.7631578947368421,
 0.6766666666666666,
 0.7173144876325089,
 0.8365536723163842]