## Data description
Faulty steel plates

https://www.kaggle.com/uciml/faulty-steel-plates/data

Content

There are 34 fields. The first 27 fields describe a steel plate fault as seen in an image. Unfortunately, I know of no further information that describes these columns.

    X_Minimum
    X_Maximum
    Y_Minimum
    Y_Maximum
    Pixels_Areas
    X_Perimeter
    Y_Perimeter
    Sum_of_Luminosity
    Minimum_of_Luminosity
    Maximum_of_Luminosity
    Length_of_Conveyer
    TypeOfSteel_A300
    TypeOfSteel_A400
    Steel_Plate_Thickness
    Edges_Index
    Empty_Index
    Square_Index
    Outside_X_Index
    Edges_X_Index
    Edges_Y_Index
    Outside_Global_Index
    LogOfAreas
    Log_X_Index
    Log_Y_Index
    Orientation_Index
    Luminosity_Index
    SigmoidOfAreas

The last seven columns are the one-hot encoded fault classes, i.e. if the plate fault is classified as "Stains" there will be a 1 in that column and 0s in the other six columns. (Note that `K_Scatch` is spelled this way in the dataset's column header.)

    Pastry
    Z_Scratch
    K_Scatch
    Stains
    Dirtiness
    Bumps
    Other_Faults


In [181]:
import math

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as sk
from scipy.stats import sem
from sklearn import metrics
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

print('numpy version:', np.__version__)
print('pandas version:', pd.__version__)
print('scikit-learn version:', sk.__version__)
print('matplotlib version:', matplotlib.__version__)

%matplotlib inline

numpy version: 1.12.1
pandas version: 0.20.1
scikit-learn version: 0.18.1
matplotlib version: 2.0.2


In [182]:
# Location of the faulty-steel-plates CSV, relative to the notebook.
datafolder = 'data/'
filename = 'faults.csv'
csv_path = '{}{}'.format(datafolder, filename)

df = pd.read_csv(
    csv_path,
    sep=',',
    low_memory=False,
    encoding='ISO-8859-1',
)

# Print the (rows, columns) shape, then preview the first rows.
print(df.shape)
df.head()

(1941, 34)


Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,42,50,270900,270944,267,17,44,24220,76,108,...,0.8182,-0.2913,0.5822,1,0,0,0,0,0,0
1,645,651,2538079,2538108,108,10,30,11397,84,123,...,0.7931,-0.1756,0.2984,1,0,0,0,0,0,0
2,829,835,1553913,1553931,71,8,19,7972,99,125,...,0.6667,-0.1228,0.215,1,0,0,0,0,0,0
3,853,860,369370,369415,176,13,45,18996,99,126,...,0.8444,-0.1568,0.5212,1,0,0,0,0,0,0
4,1289,1306,498078,498335,2409,60,260,246930,37,126,...,0.9338,-0.1992,1.0,1,0,0,0,0,0,0


In [183]:
# The seven one-hot encoded fault columns. The position of a name in this list
# is the integer class id stored in df['Label'].
fault_columns = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
y_pd = df[fault_columns]

# For every row, take the name of the column holding the row maximum (the 1 of
# the one-hot encoding). idxmax(axis=1) returns the column *label* on every
# pandas version, whereas Series.argmax() returns an integer position on
# current pandas, which would make the name -> id mapping below yield only NaN.
# It also replaces the row-by-row iterrows() loop with one vectorised call.
y = y_pd.idxmax(axis=1)

# Map the fault name to its integer class id (0 = Pastry ... 6 = Other_Faults).
df['Label'] = y.map({name: class_id for class_id, name in enumerate(fault_columns)})

df.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults,Label
0,42,50,270900,270944,267,17,44,24220,76,108,...,-0.2913,0.5822,1,0,0,0,0,0,0,0
1,645,651,2538079,2538108,108,10,30,11397,84,123,...,-0.1756,0.2984,1,0,0,0,0,0,0,0
2,829,835,1553913,1553931,71,8,19,7972,99,125,...,-0.1228,0.215,1,0,0,0,0,0,0,0
3,853,860,369370,369415,176,13,45,18996,99,126,...,-0.1568,0.5212,1,0,0,0,0,0,0,0
4,1289,1306,498078,498335,2409,60,260,246930,37,126,...,-0.1992,1.0,1,0,0,0,0,0,0,0


In [184]:
# Features: every column from 'X_Minimum' through 'SigmoidOfAreas'
# (.loc label slices include both end points).
X = df.loc[:, 'X_Minimum':'SigmoidOfAreas']

# Target: the integer class id column.
y = df['Label']

# Preview only the first rows instead of rendering the whole feature frame.
X.head(10)


Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas
0,42,50,270900,270944,267,17,44,24220,76,108,...,0.0047,0.4706,1.0000,1.0,2.4265,0.9031,1.6435,0.8182,-0.2913,0.5822
1,645,651,2538079,2538108,108,10,30,11397,84,123,...,0.0036,0.6000,0.9667,1.0,2.0334,0.7782,1.4624,0.7931,-0.1756,0.2984
2,829,835,1553913,1553931,71,8,19,7972,99,125,...,0.0037,0.7500,0.9474,1.0,1.8513,0.7782,1.2553,0.6667,-0.1228,0.2150
3,853,860,369370,369415,176,13,45,18996,99,126,...,0.0052,0.5385,1.0000,1.0,2.2455,0.8451,1.6532,0.8444,-0.1568,0.5212
4,1289,1306,498078,498335,2409,60,260,246930,37,126,...,0.0126,0.2833,0.9885,1.0,3.3818,1.2305,2.4099,0.9338,-0.1992,1.0000
5,430,441,100250,100337,630,20,87,62357,64,127,...,0.0079,0.5500,1.0000,1.0,2.7993,1.0414,1.9395,0.8736,-0.2267,0.9874
6,413,446,138468,138883,9052,230,432,1481991,23,199,...,0.0196,0.1435,0.9607,1.0,3.9567,1.5185,2.6181,0.9205,0.2791,1.0000
7,190,200,210936,210956,132,11,20,20007,124,172,...,0.0059,0.9091,1.0000,1.0,2.1206,1.0000,1.3010,0.5000,0.1841,0.3359
8,330,343,429227,429253,264,15,26,29748,53,148,...,0.0077,0.8667,1.0000,1.0,2.4216,1.1139,1.4150,0.5000,-0.1197,0.5593
9,74,90,779144,779308,1506,46,167,180215,53,143,...,0.0095,0.3478,0.9820,1.0,3.1778,1.2041,2.2148,0.9024,-0.0651,1.0000


In [185]:
# Standarize the features
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [186]:
# Split the dataset into a training and a testing set; test set will be the 25% taken randomly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=51)
print(X_train.shape, y_train.shape)


(1455, 27) (1455,)


In [192]:
# Candidate models as (name, estimator) pairs. SVC and MLPClassifier come from
# the notebook's import cell.
clf = [
    ("SVC linear", SVC(kernel='linear')),
    ("SVC RBF", SVC(kernel='rbf')),
]

# The MLP variants differ only in their hidden-layer layout, so they are
# described as data: (name, hidden_layer_sizes). All share random_state=1.
mlp_layouts = [
    ("MLP 1 thick", (27,)),
    ("MLP 1 regular", (13,)),
    ("MLP 1 slim", (5,)),
    ("MLP 2 regular", (18, 9)),
    ("MLP 2 thick", (27, 25)),
    ("MLP 2 slim", (10, 5)),
    ("MLP 3 regular", (27, 15, 7)),
    ("MLP 3 thick", (27, 26, 25)),
    ("MLP 3 slim", (10, 7, 3)),
]
clf.extend(
    (mlp_name, MLPClassifier(hidden_layer_sizes=layout, random_state=1))
    for mlp_name, layout in mlp_layouts
)

for _, model in clf:
    # Fit (= train) the classifier
    model.fit(X_train, y_train)



In [193]:
# Accuracy of every model in `clf` on the held-out test split.
for name, model in clf:
    predictions = model.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, predictions)
    print(name)
    print(accuracy)
    print()

SVC linear
0.732510288066

SVC RBF
0.748971193416

MLP 1 thick
0.742798353909

MLP 1 regular
0.732510288066

MLP 1 slim
0.621399176955

MLP 2 regular
0.742798353909

MLP 2 thick
0.748971193416

MLP 2 slim
0.670781893004

MLP 3 regular
0.740740740741

MLP 3 thick
0.736625514403

MLP 3 slim
0.699588477366



In [194]:
def evaluate_cross_validation(clf, X, y, K, random_state=0):
    """Print the K-fold cross-validation scores of ``clf`` on ``(X, y)``.

    Parameters
    ----------
    clf : estimator
        Passed unchanged to ``cross_val_score``.
    X, y : array-like
        Features and targets, passed unchanged to ``cross_val_score``.
    K : int
        Number of folds for the shuffled ``KFold`` splitter.
    random_state : int, optional
        Seed of the fold shuffling. Defaults to 0, the value that used to be
        hard-coded, so existing calls produce the same folds.

    Returns
    -------
    None
        The per-fold scores, their mean and the standard error of the mean
        are printed, not returned.
    """
    # Create a k-fold cross validation iterator (shuffled, with a fixed seed).
    kf = KFold(n_splits=K, shuffle=True, random_state=random_state)

    # No `scoring` is passed, so each fold is scored with the estimator's own
    # score method (accuracy for classifiers).
    # NOTE: passing cv=K directly is NOT equivalent to this splitter: with an
    # integer cv and a classifier, scikit-learn uses an unshuffled
    # StratifiedKFold instead.
    scores = cross_val_score(clf, X, y, cv=kf)

    print(scores)
    print("Mean score: {0:.3f} (+/-{1:.3f})".format(
        np.mean(scores), sem(scores)))
    
# 5-fold cross-validation of every model in `clf` on the training split.
for name, model in clf:
    print(name)
    evaluate_cross_validation(model, X_train, y_train, 5)
    print()

SVC linear
[ 0.6975945   0.71134021  0.68728522  0.72164948  0.72164948]
Mean score: 0.708 (+/-0.007)

SVC RBF
[ 0.74226804  0.74914089  0.72164948  0.72508591  0.7628866 ]
Mean score: 0.740 (+/-0.008)

MLP 1 thick
[ 0.74914089  0.74914089  0.73539519  0.71821306  0.73539519]
Mean score: 0.737 (+/-0.006)

MLP 1 regular
[ 0.70790378  0.74226804  0.70790378  0.73539519  0.72852234]
Mean score: 0.724 (+/-0.007)

MLP 1 slim
[ 0.63230241  0.604811    0.62199313  0.56013746  0.62199313]
Mean score: 0.608 (+/-0.013)

MLP 2 regular
[ 0.73883162  0.73883162  0.73195876  0.70790378  0.74226804]
Mean score: 0.732 (+/-0.006)

MLP 2 thick
[ 0.78694158  0.7628866   0.73539519  0.72852234  0.73195876]
Mean score: 0.749 (+/-0.011)

MLP 2 slim
[ 0.63573883  0.64261168  0.69415808  0.68728522  0.65292096]
Mean score: 0.663 (+/-0.012)

MLP 3 regular
[ 0.72164948  0.74226804  0.72852234  0.70790378  0.73539519]
Mean score: 0.727 (+/-0.006)

MLP 3 thick
[ 0.73195876  0.72852234  0.75257732  0.74914089  0.7

In [None]:
# Voting weight per model name; every model not listed here votes with weight 1.
# Keying the weights by name keeps them attached to the right model even if the
# order of `clf` changes, unlike a positional list of numbers.
vote_weights = {
    "SVC RBF": 1.7,
    "MLP 1 thick": 1.5,
    "MLP 1 regular": 1.4,
    "MLP 2 thick": 2,
}

# Weights are expanded in the order of `clf`, matching the estimators passed in.
ensemble = VotingClassifier(clf, weights=[vote_weights.get(name, 1) for name, _ in clf])

evaluate_cross_validation(ensemble, X_train, y_train, 5)