In [1]:
import random
import math
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, zero_one_loss
from sklearn.preprocessing import scale
import itertools
from joblib import Parallel, delayed
import multiprocessing

%matplotlib inline

In [2]:
# Loading the data
# Read the three raw CSV files first ...
df1 = pd.read_csv('data/cancer.csv')
df2 = pd.read_csv('data/cover.csv')
df3 = pd.read_csv('data/aps_failure.csv')

# ... then remove, in place, every row that contains a missing value.
for frame in (df1, df2, df3):
    frame.dropna(inplace=True)

In [3]:
# Remove the column labelled '1000025'.
# NOTE(review): that label looks like a data value rather than a column name, which
# suggests the CSV has no header row and its first record was read as the header -- confirm.
df1.drop(columns=['1000025'], inplace=True)

# Relabel the remaining columns x_1 .. x_n; the column that ends up as x_10 is the target.
df1.columns = ['x_{}'.format(position) for position in range(1, df1.shape[1] + 1)]
df1.rename(columns={'x_10': 'Target'}, inplace=True)

# Recode the target: the value 2 becomes +1, anything else becomes -1
# (Outlier_Detection counts -1 as an outlier).
df1['Target'] = df1['Target'].map(lambda label: 1 if label == 2 else -1)

# Feature columns used for this dataset.
col1 = ['x_{}'.format(position) for position in range(1, 10)]
df1

Unnamed: 0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,Target
0,5,4,4,5,7,10.0,3,2,1,1
1,3,1,1,1,2,2.0,3,1,1,1
2,6,8,8,1,3,4.0,3,7,1,1
3,4,1,1,3,2,1.0,3,1,1,1
4,8,10,10,8,7,10.0,9,7,1,-1
...,...,...,...,...,...,...,...,...,...,...
693,3,1,1,1,3,2.0,1,1,1,1
694,2,1,1,1,2,1.0,1,1,1,1
695,5,10,10,3,7,3.0,8,10,2,-1
696,4,8,6,4,3,4.0,10,6,1,-1


In [4]:
# The label column 'y' becomes 'Target'.
df2.rename(columns={'y': 'Target'}, inplace=True)

# Recode the target: the value 0 becomes +1, anything else becomes -1
# (Outlier_Detection counts -1 as an outlier).
df2['Target'] = df2['Target'].map(lambda label: 1 if label == 0 else -1)

# Feature columns used for this dataset.
col2 = ['x_{}'.format(position) for position in range(1, 11)]
df2

Unnamed: 0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,Target
0,2804,139,9,268,65,3180,234,238,135,6121,1
1,2785,155,18,242,118,3090,238,238,122,6211,1
2,2579,132,6,300,-15,67,230,237,140,6031,1
3,2886,151,11,371,26,5253,234,240,136,4051,1
4,2742,134,22,150,69,3215,248,224,92,6091,1
...,...,...,...,...,...,...,...,...,...,...,...
286043,2617,29,13,390,128,2081,215,211,130,592,1
286044,2614,21,13,379,125,2051,211,212,135,618,1
286045,2612,17,13,371,123,2021,208,211,138,644,1
286046,2610,16,14,365,110,1991,208,211,138,671,1


In [5]:
# Relabel every column x_0 .. x_(n-1); the first one (x_0) is the target.
df3.columns = ['x_{}'.format(position) for position in range(df3.shape[1])]
df3.rename(columns={'x_0': 'Target'}, inplace=True)

# Recode the target: 'neg' becomes +1, anything else becomes -1
# (Outlier_Detection counts -1 as an outlier).
df3['Target'] = df3['Target'].map(lambda label: 1 if label == 'neg' else -1)

# Feature columns used for this dataset.
col3 = ['x_{}'.format(position) for position in range(1, 171)]
df3

Unnamed: 0,Target,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_161,x_162,x_163,x_164,x_165,x_166,x_167,x_168,x_169,x_170
16,1,31300,0.0,784.0,740.0,0.0,0.0,0.0,0.0,0.0,...,798872.0,112724.0,51736.0,7054.0,6628.0,27600.0,2.0,2.0,0.0,0.0
179,1,97000,0.0,378.0,160.0,0.0,0.0,0.0,0.0,0.0,...,1078982.0,313334.0,511330.0,552328.0,871528.0,871104.0,1980.0,42.0,0.0,0.0
225,1,124656,2.0,278.0,170.0,0.0,0.0,0.0,0.0,0.0,...,1205696.0,866148.0,697610.0,700400.0,1900386.0,437532.0,3680.0,0.0,0.0,0.0
394,-1,281324,2.0,3762.0,2346.0,0.0,0.0,4808.0,215720.0,967572.0,...,624606.0,269976.0,638838.0,1358354.0,819918.0,262804.0,2824.0,0.0,0.0,0.0
413,-1,43482,0.0,1534.0,1388.0,0.0,0.0,0.0,0.0,40024.0,...,497196.0,121166.0,202272.0,232636.0,645690.0,50.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59432,1,118028,0.0,740.0,714.0,618.0,690.0,0.0,0.0,0.0,...,838952.0,631338.0,541036.0,1285274.0,1832658.0,165838.0,3022.0,0.0,0.0,0.0
59562,1,229916,0.0,616.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59843,1,224084,0.0,912.0,766.0,0.0,0.0,0.0,0.0,0.0,...,413576.0,209524.0,469894.0,2233992.0,5933084.0,364450.0,12422.0,0.0,0.0,0.0
59870,1,197332,0.0,658.0,616.0,216.0,346.0,0.0,0.0,0.0,...,73940.0,49896.0,90454.0,575264.0,104600.0,10352.0,36.0,0.0,0.0,0.0


In [6]:
def Outlier_Detection(df, clf, features, test_size=0.33):

    '''
    Fit ``clf`` on a training split of ``df`` and print how it labels the held-out split.

    Parameters:
        df        : DataFrame holding the ``features`` columns and a 'Target' column
                    (+1 for normal points, -1 for outliers).
        clf       : estimator exposing ``fit(X)`` and ``predict(X)``; the labels are never passed to ``fit``.
        features  : list of column names used as model input.
        test_size : forwarded to ``train_test_split`` (the split always uses ``random_state=42``).

    Prints the percentage of normal and outlier points in the testing set, the percentage of
    refused normal points and accepted outliers (both relative to the whole testing set),
    and the resulting misclassification error and accuracy. Returns nothing.
    '''

    # The training labels are not needed: the model is fitted on the features alone.
    train_X, test_X, _, test_y = train_test_split(
        df[features], df['Target'], test_size=test_size, random_state=42
    )

    clf.fit(train_X)
    predicted = clf.predict(test_X)

    n_test = len(test_y)
    outlier_count = int((test_y == -1).sum())

    # true - predicted is +2 for a normal point predicted as outlier,
    # -2 for an outlier predicted as normal, and 0 when both labels agree.
    gap = test_y - predicted
    refused_normal = 100 * int((gap == 2).sum()) / n_test
    accepted_outliers = 100 * int((gap == -2).sum()) / n_test
    error = refused_normal + accepted_outliers

    normal_share = 100 * (1 - outlier_count / n_test)
    outlier_share = 100 * outlier_count / n_test

    print('Normal   = %.2f %% - Refused Normal    = %.2f %%' % (normal_share, refused_normal))
    print('Outliers = %.2f %% - Accepted Outliers = %.2f %%' % (outlier_share, accepted_outliers))
    print()
    print('Miss Classification error = %.2f %% - Accuracy = %.2f %%' % (error, 100 - error))

In [7]:
# Breast Cancer

# One (header, estimator) pair per kernel, evaluated in the order listed.
cancer_models = [
    ('* Linear : \n', OneClassSVM(nu=0.25, kernel='linear')),
    ('* Polynomial : \n', OneClassSVM(nu=0.25, degree=3, kernel='poly')),
    ('* RBF : \n', OneClassSVM(nu=0.25, gamma=0.1, kernel='rbf')),
]

for position, (header, clf) in enumerate(cancer_models):
    if position > 0:
        print()  # blank separator between two consecutive reports
    print(header)
    Outlier_Detection(df1, clf, col1, 0.2)

* Linear : 

Normal   = 63.50 % - Refused Normal    = 24.82 %
Outliers = 36.50 % - Accepted Outliers = 36.50 %

Miss Classification error = 61.31 % - Accuracy = 38.69 %

* Polynomial : 

Normal   = 63.50 % - Refused Normal    = 28.47 %
Outliers = 36.50 % - Accepted Outliers = 36.50 %

Miss Classification error = 64.96 % - Accuracy = 35.04 %

* RBF : 

Normal   = 63.50 % - Refused Normal    = 8.03 %
Outliers = 36.50 % - Accepted Outliers = 3.65 %

Miss Classification error = 11.68 % - Accuracy = 88.32 %


In [8]:
# Tuning hyper-parameters using N_folds Cross Validation
X_train, X_test, y_train, y_test = train_test_split(df1[col1], df1['Target'], test_size=0.2, random_state=42)

# Candidate values for the RBF kernel width and for nu.
gamma = [0.5, 0.1, 0.01, 0.001, 0.0001]
nu    = [0.1, 0.25, 0.5, 0.75, 0.95]

# Every (gamma, nu) combination, in the order the results will be returned.
inputs = list(itertools.product(gamma, nu))

def processInput(gamma,nu):
    '''
    Return the mean 10-fold cross-validated zero-one loss of an RBF OneClassSVM
    built with the given ``gamma`` and ``nu``, evaluated on the module-level
    ``X_train`` / ``y_train`` split.
    '''
    clf = OneClassSVM(kernel='rbf', gamma=gamma, nu=nu)
    return np.mean(cross_val_score(clf, X_train, y_train, cv=10, scoring=make_scorer(zero_one_loss)))

# Leave two cores free, but never ask for fewer than one worker:
# cpu_count()-2 is 0 on a two-core machine, and joblib rejects n_jobs=0.
num_cores = max(1, multiprocessing.cpu_count() - 2)

results = Parallel(n_jobs=num_cores)(delayed(processInput)(gamma,nu) for gamma,nu in inputs)

In [9]:
# Saving the results
# Build the table in one go (one row per (gamma, nu) pair with its mean loss) instead of
# growing an empty frame row by row with .loc: this avoids repeated re-allocation and
# gives the three columns a numeric dtype directly from the data.
p_tab = pd.DataFrame(
    [(gamma_value, nu_value, miss) for (gamma_value, nu_value), miss in zip(inputs, results)],
    columns=['gamma', 'nu', 'miss'],
)

p_tab.sort_values(by='miss')

Unnamed: 0,gamma,nu,miss
5,0.1,0.1,0.10266
6,0.1,0.25,0.108148
7,0.1,0.5,0.17064
12,0.01,0.5,0.194276
11,0.01,0.25,0.212896
2,0.5,0.5,0.247576
1,0.5,0.25,0.251212
0,0.5,0.1,0.265993
10,0.01,0.1,0.269832
16,0.001,0.25,0.291313


In [10]:
# Best hyper-parameters
# Pick the row of p_tab whose mean misclassification ('miss') is smallest.
best_label = p_tab['miss'].idxmin()
best_cv = p_tab.loc[best_label]
print(best_cv)

gamma    0.10000
nu       0.10000
miss     0.10266
Name: 5, dtype: float64


In [11]:
# Fitting the best model
print('* RBF : \n')
# best_cv is a Series indexed by the labels 'gamma', 'nu', 'miss'; read the values by label.
# Integer keys such as best_cv[0] rely on positional fallback, which pandas has deprecated
# for label-indexed Series.
clf = OneClassSVM(gamma=best_cv['gamma'], nu=best_cv['nu'], kernel='rbf')
Outlier_Detection(df1, clf, col1, 0.2)

* RBF : 

Normal   = 63.50 % - Refused Normal    = 7.30 %
Outliers = 36.50 % - Accepted Outliers = 2.92 %

Miss Classification error = 10.22 % - Accuracy = 89.78 %


In [8]:
# Forest Cover -> too slow

# Standardize the feature columns (suggested by sklearn for this method)
def _standardize_cover_column(column):
    '''Return the 'Target' column unchanged and every other column passed through ``scale``.'''
    if column.name == 'Target':
        return column
    return scale(column)

df2 = df2.apply(_standardize_cover_column)

df2

Unnamed: 0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,Target
0,-0.556817,-0.120173,-0.644160,-0.048728,0.333207,0.472794,0.802068,0.687445,-0.210405,2.784753,1
1,-0.652783,0.028680,0.616617,-0.172269,1.254874,0.417172,0.962351,0.687445,-0.566556,2.847957,1
2,-1.693256,-0.185297,-1.064420,0.103321,-1.057989,-1.451083,0.641785,0.633542,-0.073424,2.721550,1
3,-0.142649,-0.008533,-0.363988,0.440681,-0.345001,1.753936,0.802068,0.795251,-0.183009,1.331075,1
4,-0.869969,-0.166690,1.176963,-0.609411,0.402767,0.494424,1.363058,-0.067196,-1.388443,2.763685,1
...,...,...,...,...,...,...,...,...,...,...,...
286043,-1.501324,-1.143540,-0.083815,0.530961,1.428773,-0.206403,0.040724,-0.767933,-0.347387,-1.098042,1
286044,-1.516477,-1.217966,-0.083815,0.478694,1.376604,-0.224944,-0.119558,-0.714030,-0.210405,-1.079784,1
286045,-1.526578,-1.255180,-0.083815,0.440681,1.341824,-0.243484,-0.239770,-0.767933,-0.128217,-1.061525,1
286046,-1.536680,-1.264483,0.056271,0.412172,1.115754,-0.262025,-0.239770,-0.767933,-0.128217,-1.042564,1


In [9]:
# One (header, estimator) pair per kernel; every solver is capped at 100 iterations.
cover_models = [
    ('* Linear : \n', OneClassSVM(nu=0.25, kernel='linear', max_iter=100)),
    ('* Polynomial : \n', OneClassSVM(nu=0.25, degree=3, kernel='poly', max_iter=100)),
    ('* RBF : \n', OneClassSVM(nu=0.25, gamma=0.1, kernel='rbf', max_iter=100)),
]

for position, (header, clf) in enumerate(cover_models):
    if position > 0:
        print()  # blank separator between two consecutive reports
    print(header)
    Outlier_Detection(df2, clf, col2, 0.2)

* Linear : 





Normal   = 98.95 % - Refused Normal    = 88.41 %
Outliers = 1.05 % - Accepted Outliers = 0.00 %

Miss Classification error = 88.41 % - Accuracy = 11.59 %

* Polynomial : 





Normal   = 98.95 % - Refused Normal    = 42.02 %
Outliers = 1.05 % - Accepted Outliers = 1.01 %

Miss Classification error = 43.03 % - Accuracy = 56.97 %

* RBF : 





Normal   = 98.95 % - Refused Normal    = 46.90 %
Outliers = 1.05 % - Accepted Outliers = 0.00 %

Miss Classification error = 46.90 % - Accuracy = 53.10 %


In [None]:
# APS Failure -> too slow

# Standardize the feature columns (suggested by sklearn for this method)
# -> but errors: values too large, rescaling impossible
def _standardize_aps_column(column):
    '''Return the 'Target' column unchanged and every other column passed through ``scale``.'''
    if column.name == 'Target':
        return column
    return scale(column)

df3 = df3.apply(_standardize_aps_column)

df3

In [None]:
# max_iter is a solver setting, so it is given to the OneClassSVM constructor.
# Outlier_Detection only accepts (df, clf, features, test_size); passing max_iter to it
# raises TypeError before any model is fitted.
print('* Linear : \n')
clf = OneClassSVM(nu=0.25, kernel='linear', max_iter=100)
Outlier_Detection(df3, clf, col3, 0.2)

print()

print('* Polynomial : \n')
clf = OneClassSVM(nu=0.25, degree=3, kernel='poly', max_iter=100)
Outlier_Detection(df3, clf, col3, 0.2)

print()

print('* RBF : \n')
clf = OneClassSVM(nu=0.25, gamma=0.1, kernel='rbf', max_iter=100)
Outlier_Detection(df3, clf, col3, 0.2)