In [23]:
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
from sklearn.metrics import classification_report, plot_confusion_matrix, confusion_matrix
from sklearn.model_selection import KFold, train_test_split, TimeSeriesSplit, RepeatedStratifiedKFold, cross_val_score
import pickle
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from numpy import mean

In [26]:
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# One seed shared by the split and by both resamplers, so re-running the cell
# reproduces the same training set and the same cross-validation score.
SEED = 42

# --- Load the data and move the label to the first column -------------------
df = pd.read_csv("../data/processed/predictive_maintenance.csv")
df = df.set_index("UDI")
first_column = df.pop('Target')
df.insert(0, 'Target', first_column)

# Keep 5% of the rows aside as a hold-out set.
train, test = train_test_split(df, test_size=0.05, random_state=SEED)

# Every column after the label (column 0) is a model input.
to_model_columns = train.columns[1:]
X, y = train[to_model_columns], train.Target

# --- Final model: SVC fitted on a SMOTE-balanced training set ---------------
# SMOTE synthesises minority-class rows until minority/majority reaches 0.75.
sm = SMOTE(sampling_strategy=0.75, random_state=SEED)
clf = SVC(gamma='auto')
X_train, Y_train = sm.fit_resample(X, y)
clf.fit(X_train, Y_train)

# --- Cross-validated estimate of the resample-then-SVC recipe ---------------
# Over-sample the minority class up to a 0.1 ratio ...
over = SMOTE(sampling_strategy=0.1, random_state=SEED)
# ... then under-sample the majority class down to a 0.5 ratio.
under = RandomUnderSampler(sampling_strategy=0.5, random_state=SEED)
# A fresh, unfitted SVC is used here so the fitted `clf` above is left alone.
pipeline = Pipeline(steps=[('o', over), ('u', under), ('m', SVC(gamma='auto'))])

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Evaluate on the ORIGINAL (un-resampled) training data: the pipeline resamples
# inside each training fold only, so no synthetic point ends up in a validation
# fold. Passing the already-balanced X_train/Y_train would leak synthetic
# samples into validation and would also conflict with the 0.1 over-sampling
# ratio, which is lower than the 0.75 ratio those data already have.
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))



In [27]:
# Inspect the training labels returned by sm.fit_resample (SMOTE-resampled).
Y_train

0        0
1        0
2        0
3        0
4        0
        ..
16056    1
16057    1
16058    1
16059    1
16060    1
Name: Target, Length: 16061, dtype: int64

In [28]:
# Save the fitted model to disk. The `with` block guarantees the file handle is
# closed (and the pickle bytes flushed) even if dumping fails part-way.
filename = '../models/svm_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [29]:
# Inspect the training features returned by sm.fit_resample (SMOTE-resampled).
X_train

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Type_H,Type_L,Type_M
0,300.700000,309.900000,1815,27.000000,190,0,1,0
1,300.900000,310.600000,1552,34.900000,95,0,1,0
2,300.700000,310.900000,1723,31.000000,73,0,1,0
3,300.100000,311.300000,1547,40.000000,201,0,1,0
4,296.700000,307.500000,1515,42.100000,55,0,0,1
...,...,...,...,...,...,...,...,...
16056,302.332266,310.414187,1325,60.632957,89,0,0,0
16057,301.933856,310.474756,1332,56.268484,159,0,0,0
16058,301.837865,311.490292,1284,68.195146,5,0,1,0
16059,303.225524,311.648787,1340,48.011962,209,0,0,0


In [23]:
# Lift pandas' column-display limit so every column of a frame is rendered.
pd.set_option('display.max_columns', None)

# Show the hold-out rows produced by train_test_split.
test

Unnamed: 0_level_0,Target,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Type_H,Type_L,Type_M
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
6253,0,300.8,310.3,1538,36.1,198,0,1,0
4685,1,303.6,311.8,1421,44.8,101,0,0,1
1732,0,298.3,307.9,1485,42.0,117,0,0,1
4743,0,303.3,311.3,1592,33.7,14,0,1,0
4522,0,302.4,310.4,1865,23.9,129,0,1,0
...,...,...,...,...,...,...,...,...,...
1745,0,298.5,308.0,1461,51.9,157,0,0,1
9755,0,298.5,309.7,1552,34.0,205,0,0,1
6095,0,300.9,310.8,1402,48.1,22,0,0,1
8782,0,297.6,308.7,1552,42.3,27,0,1,0


Feature importance

In [14]:
from scipy.stats import pointbiserialr
from math import sqrt
import numpy as np
def getMerit(subset, label, data=None):
    """Compute the correlation-based merit of a feature subset.

    The merit is ``k * rcf / sqrt(k + k * (k - 1) * rff)`` where ``k`` is the
    number of features, ``rcf`` the mean absolute point-biserial correlation
    between each feature and the label, and ``rff`` the mean absolute pairwise
    correlation between the features themselves.

    As a side effect, prints the feature with the strongest feature-label
    correlation (the message calls that value "merit", but it is that single
    feature's absolute correlation, not the subset merit that is returned).

    Parameters
    ----------
    subset : iterable of column names
        Features to score; must not be empty.
    label : column name
        Binary target column.
    data : pandas.DataFrame, optional
        Frame holding ``subset`` and ``label``. Defaults to the notebook-level
        ``df`` so existing two-argument calls keep working.

    Returns
    -------
    float
        Merit of the subset. For a single feature this is simply its absolute
        correlation with the label.

    Raises
    ------
    ValueError
        If ``subset`` is empty.
    """
    frame = df if data is None else data
    subset = list(subset)
    k = len(subset)
    if k == 0:
        raise ValueError("subset must contain at least one feature")

    best_value = -1
    best_feature = ''
    # average feature-class correlation
    rcf_all = []
    for feature in subset:
        # Unpack positionally: works for every scipy result type, whatever the
        # coefficient attribute happens to be called.
        coeff, _ = pointbiserialr(frame[label], frame[feature])
        abs_coeff = abs(coeff)
        rcf_all.append(abs_coeff)

        if abs_coeff > best_value:
            best_value = abs_coeff
            best_feature = feature
    rcf = np.mean(rcf_all)

    # average feature-feature correlation
    if k == 1:
        # No feature pairs exist; the mean over an empty set would be NaN and
        # would turn the whole merit into NaN.
        rff = 0.0
    else:
        corr = frame[subset].corr().abs()
        # Keep only the strict upper triangle so each pair counts once and the
        # diagonal of ones is ignored. `where` builds a new frame instead of
        # writing into `corr.values`, which is not guaranteed to be writable.
        upper = np.triu(np.ones(corr.shape, dtype=bool), k=1)
        rff = corr.where(upper).unstack().mean()
    print("Feature %s with merit %.4f"%(best_feature, best_value))

    return (k * rcf) / sqrt(k + k * (k-1) * rff)

In [24]:
# Candidate features: every column of the hold-out frame except the label.
features = test.columns.tolist()
del features[features.index('Target')]

# Score the full feature set against the label.
getMerit(features, 'Target')

Feature Torque [Nm] with merit 0.1913


0.1409000826900275