In [1]:
import random
import math
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, zero_one_loss
from sklearn.preprocessing import scale

%matplotlib inline

In [2]:
# Load the three raw datasets (breast cancer, forest cover, APS failure),
# one DataFrame per CSV file, in that order.
df1, df2, df3 = [
    pd.read_csv(csv_path)
    for csv_path in ('data/cancer.csv', 'data/cover.csv', 'data/aps_failure.csv')
]

In [3]:
# Breast-cancer frame: discard the '1000025' column, give the remaining
# columns positional names x_1..x_n, and turn the last one (x_10) into the label.
# NOTE(review): a column literally named '1000025' looks like the first data row
# was consumed as the CSV header -- confirm whether the file should be read with
# header=None.
df1 = df1.drop(columns='1000025')
df1.columns = ['x_' + str(position) for position in range(1, df1.shape[1] + 1)]
df1 = df1.rename(columns={'x_10': 'Target'})

# Label encoding: raw value 2 -> +1, anything else -> -1.
df1['Target'] = df1['Target'].eq(2).map({True: 1, False: -1})

# Feature columns fed to the models.
col1 = ['x_' + str(position) for position in range(1, 10)]
df1

Unnamed: 0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,Target
0,5,4,4,5,7,10.0,3,2,1,1
1,3,1,1,1,2,2.0,3,1,1,1
2,6,8,8,1,3,4.0,3,7,1,1
3,4,1,1,3,2,1.0,3,1,1,1
4,8,10,10,8,7,10.0,9,7,1,-1
...,...,...,...,...,...,...,...,...,...,...
693,3,1,1,1,3,2.0,1,1,1,1
694,2,1,1,1,2,1.0,1,1,1,1
695,5,10,10,3,7,3.0,8,10,2,-1
696,4,8,6,4,3,4.0,10,6,1,-1


In [4]:
# Forest-cover frame: the raw label column 'y' becomes 'Target'.
df2 = df2.rename(columns={'y': 'Target'})

# Label encoding: raw value 0 -> +1, anything else -> -1.
df2['Target'] = df2['Target'].eq(0).map({True: 1, False: -1})

# Feature columns fed to the models.
col2 = ['x_' + str(position) for position in range(1, 11)]
df2

Unnamed: 0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,Target
0,2804,139,9,268,65,3180,234,238,135,6121,1
1,2785,155,18,242,118,3090,238,238,122,6211,1
2,2579,132,6,300,-15,67,230,237,140,6031,1
3,2886,151,11,371,26,5253,234,240,136,4051,1
4,2742,134,22,150,69,3215,248,224,92,6091,1
...,...,...,...,...,...,...,...,...,...,...,...
286043,2617,29,13,390,128,2081,215,211,130,592,1
286044,2614,21,13,379,125,2051,211,212,135,618,1
286045,2612,17,13,371,123,2021,208,211,138,644,1
286046,2610,16,14,365,110,1991,208,211,138,671,1


In [5]:
# APS-failure frame: replace the column names with positional ones (x_0..x_n);
# the first column (x_0) holds the label and is renamed to 'Target'.
df3.columns = ['x_' + str(position) for position in range(df3.shape[1])]
df3 = df3.rename(columns={'x_0': 'Target'})

# Label encoding: raw value 'neg' -> +1, anything else -> -1.
df3['Target'] = df3['Target'].eq('neg').map({True: 1, False: -1})

# Feature columns fed to the models.
col3 = ['x_' + str(position) for position in range(1, 171)]
df3

Unnamed: 0,Target,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_161,x_162,x_163,x_164,x_165,x_166,x_167,x_168,x_169,x_170
0,1,76698,,2.130706e+09,280.0,0.0,0.0,0.0,0.0,0.0,...,1240520.0,493384.0,721044.0,469792.0,339156.0,157956.0,73224.0,0.0,0.0,0.0
1,1,33058,,0.000000e+00,,0.0,0.0,0.0,0.0,0.0,...,421400.0,178064.0,293306.0,245416.0,133654.0,81140.0,97576.0,1500.0,0.0,0.0
2,1,41040,,2.280000e+02,100.0,0.0,0.0,0.0,0.0,0.0,...,277378.0,159812.0,423992.0,409564.0,320746.0,158022.0,95128.0,514.0,0.0,0.0
3,1,12,0.0,7.000000e+01,66.0,0.0,10.0,0.0,0.0,0.0,...,240.0,46.0,58.0,44.0,10.0,0.0,0.0,0.0,4.0,32.0
4,1,60874,,1.368000e+03,458.0,0.0,0.0,0.0,0.0,0.0,...,622012.0,229790.0,405298.0,347188.0,286954.0,311560.0,433954.0,1218.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,1,153002,,6.640000e+02,186.0,0.0,0.0,0.0,0.0,0.0,...,998500.0,566884.0,1290398.0,1218244.0,1019768.0,717762.0,898642.0,28588.0,0.0,0.0
59996,1,2286,,2.130707e+09,224.0,0.0,0.0,0.0,0.0,0.0,...,10578.0,6760.0,21126.0,68424.0,136.0,0.0,0.0,0.0,0.0,0.0
59997,1,112,0.0,2.130706e+09,18.0,0.0,0.0,0.0,0.0,0.0,...,792.0,386.0,452.0,144.0,146.0,2622.0,0.0,0.0,0.0,0.0
59998,1,80292,,2.130706e+09,494.0,0.0,0.0,0.0,0.0,0.0,...,699352.0,222654.0,347378.0,225724.0,194440.0,165070.0,802280.0,388422.0,0.0,0.0


In [8]:
def Outlier_Detection(df, clf, features, test_size=0.33, random_state=42):
    
    '''
    Fit ``clf`` on a random training split of ``df`` and print how it labels the held-out rows.

    The rows of ``df`` are split with ``train_test_split``. ``clf`` is fitted on the
    training features only (no labels are passed to ``fit``) and then predicts the
    test features. Both ``df['Target']`` and the predictions are read with the
    convention +1 = normal, -1 = outlier.

    Parameters
    ----------
    df : DataFrame containing the columns in ``features`` and a 'Target' column.
    clf : estimator exposing ``fit(X)`` and ``predict(X)``.
    features : list of column names used as model inputs.
    test_size : forwarded to ``train_test_split`` (default 0.33, the previous fixed value).
    random_state : forwarded to ``train_test_split`` (default 42, the previous fixed value).

    Printed report
    --------------
    - share of normal / outlier rows in the test split;
    - "Refused Normal": test rows that are normal but predicted -1;
    - "Accepted Outliers": test rows that are outliers but predicted +1;
    - the misclassification error (sum of the two) and the accuracy.
    Every percentage is relative to the size of the whole test split, not to the
    size of the individual class.

    Returns None.
    '''
    
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df['Target'], test_size=test_size, random_state=random_state)

    clf.fit(X_train)

    # Work on plain arrays so the comparison is positional and does not depend
    # on the (shuffled) index carried by y_test.
    y_true = np.asarray(y_test)
    y_pred = np.asarray(clf.predict(X_test))

    n_test = len(y_true)
    nb_outliers = int((y_true == -1).sum())

    # Explicit masks instead of inspecting y_test - y_pred for the values +2 / -2.
    refused_normal = 100 * int(((y_true == 1) & (y_pred == -1)).sum()) / n_test
    accepted_outliers = 100 * int(((y_true == -1) & (y_pred == 1)).sum()) / n_test
    error = refused_normal+accepted_outliers

    print('Normal   = %.2f %% - Refused Normal    = %.2f %%' % (100*(1-nb_outliers/n_test), refused_normal))
    print('Outliers = %.2f %% - Accepted Outliers = %.2f %%' % (100*nb_outliers/n_test, accepted_outliers))
    print()
    print('Miss Classification error = %.2f %% - Accuracy = %.2f %%' % (error,100-error))

In [9]:
# Breast Cancer
# Rows with any missing value are removed before modelling.
df1 = df1.dropna()

# One (heading, estimator) pair per kernel; each is evaluated with the same helper.
kernel_runs = [
    ('* Linear : \n', OneClassSVM(nu=0.25, kernel='linear')),
    ('* Polynomial : \n', OneClassSVM(nu=0.25, degree=3, kernel='poly')),
    ('* RBF : \n', OneClassSVM(nu=0.25, gamma=0.1, kernel='rbf')),
]

for run_index, (heading, clf) in enumerate(kernel_runs):
    if run_index > 0:
        # Blank separator line between consecutive reports.
        print()
    print(heading)
    Outlier_Detection(df1, clf, col1)

* Linear : 

Normal   = 66.81 % - Refused Normal    = 33.19 %
Outliers = 33.19 % - Accepted Outliers = 33.19 %

Miss Classification error = 66.37 % - Accuracy = 33.63 %

* Polynomial : 

Normal   = 66.81 % - Refused Normal    = 31.42 %
Outliers = 33.19 % - Accepted Outliers = 33.19 %

Miss Classification error = 64.60 % - Accuracy = 35.40 %

* RBF : 

Normal   = 66.81 % - Refused Normal    = 8.41 %
Outliers = 33.19 % - Accepted Outliers = 2.21 %

Miss Classification error = 10.62 % - Accuracy = 89.38 %


In [None]:
# Forest Cover -> too slow

# Standardise every column except the label.
df2 = df2.apply(lambda column: column if column.name == 'Target' else scale(column))

# One (heading, estimator) pair per kernel.
# NOTE(review): the linear and RBF runs cap the solver with max_iter (100 and 1),
# so those fits may stop before converging, while the polynomial run has no cap --
# confirm these limits are intentional.
kernel_runs = [
    ('* Linear : \n', OneClassSVM(nu=0.25, kernel='linear', max_iter=100)),
    ('* Polynomial : \n', OneClassSVM(nu=0.25, degree=3, kernel='poly')),
    ('* RBF : \n', OneClassSVM(nu=0.25, gamma=0.1, kernel='rbf', max_iter=1)),
]

for run_index, (heading, clf) in enumerate(kernel_runs):
    if run_index > 0:
        # Blank separator line between consecutive reports.
        print()
    print(heading)
    Outlier_Detection(df2, clf, col2)

In [None]:
# APS Failure -> too slow

# The feature columns contain missing values (see the NaN cells in the frame
# displayed above) and OneClassSVM does not accept NaN input, so fit() would fail.
# Fill each feature with its column median before scaling; 'Target' is untouched
# because it is not part of the median Series.
# NOTE(review): a feature that is entirely NaN has no median and stays NaN --
# drop such columns from col3 if any exist.
df3 = df3.fillna(df3[col3].median())

# Standardise every column except the label.
df3 = df3.apply(lambda c: scale(c) if c.name!='Target' else c)

print('* Linear : \n')
clf = OneClassSVM(nu=0.25, kernel='linear')
Outlier_Detection(df3, clf, col3)

print()

print('* Polynomial : \n')
clf = OneClassSVM(nu=0.25, degree=3, kernel='poly')
Outlier_Detection(df3, clf, col3)

print()

print('* RBF : \n')
clf = OneClassSVM(nu=0.25, gamma=0.1, kernel='rbf')
Outlier_Detection(df3, clf, col3)