In [1]:
import random
import math
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.animation import FuncAnimation, PillowWriter
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, KFold, cross_val_score

%matplotlib inline

In [2]:
# Loading the data
df = pd.read_csv('data/mulcross.csv')

df['Target'] = df.Target.apply(lambda r: 1 if r=="'Normal'" else -1)
df

Unnamed: 0,V1,V2,V3,V4,Target
0,-0.203950,0.363011,1.013766,0.187131,1
1,-0.761118,2.436424,0.681846,0.654366,1
2,-0.209979,1.131098,-0.282180,-0.202210,1
3,0.836812,0.650342,-0.426900,-0.305281,1
4,0.454204,1.560128,-0.204841,0.219233,1
...,...,...,...,...,...
262139,-0.699533,-1.110000,-0.084369,-0.575921,1
262140,-1.972119,0.965250,-0.240917,0.463352,1
262141,-0.327949,-1.702591,-0.031113,-0.759623,1
262142,-2.080999,-0.185966,-0.518695,-0.829814,1


In [3]:
def split_set(data_to_split, ratio=0.8):
    mask = np.random.rand(len(data_to_split)) < ratio
    return [data_to_split[mask].reset_index(drop=True), data_to_split[~mask].reset_index(drop=True)]

[train, test] = split_set(df.sample(frac=0.1, random_state=1))

In [7]:
from sklearn.svm import OneClassSVM

features = ['V1','V2','V3','V4']

#clf = OneClassSVM(kernel='poly', degree=3, gamma='scale')
clf = OneClassSVM(kernel='rbf', nu=0.25, gamma=0.001)

clf.fit(train[features])

train['Pred'] = clf.predict(train[features])
test['Pred']  = clf.predict(test[features])

In [8]:
# Errors
#n_error_train = len(train.loc[(train['Target']==1) & (train['Target']!=train['Pred'])])
error_train = len(train.loc[train['Target']!=train['Pred']])/len(train)
error_regular_test  = len(test.loc[(test['Target']==1) & (test['Target']!=test['Pred'])])/len(test.loc[(test['Target']==1)])
error_outliers_test = len(test.loc[(test['Target']==-1) & (test['Target']!=test['Pred'])])/len(test.loc[(test['Target']==-1)])

In [9]:
print('ETr %.2f %% ERTe %.2f %% EOTe %.2f %%' % 
      (100*error_train, 100*error_regular_test, 100*error_outliers_test))

ETr 15.04 % ERTe 17.61 % EOTe 0.00 %


In [13]:
# N-Folds Cross Validation
def N_folds_CV(X, y, n, model):
    
    """
    Performs N-folds cross-validation on the data (predictors X, observed output y) with a given model.
    
    :param X: Predictors
    :param y: Observed output
    :param n: number of folds 
    :param model: model to fit
    
    :return: array of N R-squared scores
    """
    
    cv = KFold(n_splits=n)
    cv_clf = model
    cv_score = []
            
    for train, test in cv.split(X, y):
        cv_reg = cv_clf.fit(X.iloc[train])
        y_diff = cv_reg.predict(X.iloc[train]) - y.iloc[train]
        cv_score.append(y_diff[y_diff != 0].size)
                
    return np.asarray(cv_score)

In [53]:
import itertools
from joblib import Parallel, delayed
import multiprocessing
    
df_s = df.sample(frac=0.01, random_state=1) # too long calculation

# Tuning hyper-parameters using N_folds Cross Validation
X_train, X_test, y_train, y_test = train_test_split(df_s[features], df_s['Target'], test_size=0.33, random_state=42)    
    
gamma = [0.1, 0.01, 0.001, 0.0001]
nu    = [0.25, 0.5, 0.75, 0.95]

inputs = list(itertools.product(gamma, nu))

def processInput(gamma,nu):
    return N_folds_CV(X_train, y_train, 10, OneClassSVM(kernel='rbf', gamma=gamma, nu=nu)).mean()
 
num_cores = multiprocessing.cpu_count()-2
     
results = Parallel(n_jobs=num_cores)(delayed(processInput)(gamma,nu) for gamma,nu in inputs)

In [54]:
# Saving the results
p_tab = pd.DataFrame(columns=['gamma','nu','miss'])

for i in range(0,len(results)):
    p_tab.loc[len(p_tab)] = [inputs[i][0], inputs[i][1], results[i]]
    
p_tab

Unnamed: 0,gamma,nu,miss
0,0.1,0.25,246.5
1,0.1,0.5,641.5
2,0.1,0.75,1036.7
3,0.1,0.95,1353.0
4,0.01,0.25,246.3
5,0.01,0.5,641.9
6,0.01,0.75,1036.9
7,0.01,0.95,1353.2
8,0.001,0.25,246.7
9,0.001,0.5,641.8


In [55]:
best_cv = p_tab.loc[p_tab['miss'].idxmin()]
print(best_cv)

gamma      0.0001
nu         0.2500
miss     246.1000
Name: 12, dtype: float64


In [56]:
clf = OneClassSVM(kernel='rbf', gamma=best_cv[0], nu=best_cv[1])
clf.fit(X_train)

y_pred = clf.predict(X_test)

error = (y_test - y_pred)
error = 100 * (error[error != 0].size / len(error))

In [57]:
print('Miss_classification error = %.2f %%' % error)

Miss_classification error = 15.49 %


In [58]:
# Testing on all the data
X_train, X_test, y_train, y_test = train_test_split(df[features], df['Target'], test_size=0.33, random_state=42)   

clf = OneClassSVM(kernel='rbf', gamma=best_cv[0], nu=best_cv[1])
clf.fit(X_train)

y_pred = clf.predict(X_test)

error = (y_test - y_pred)
error = 100 * (error[error != 0].size / len(error))

In [59]:
print('Miss_classification error = %.2f %%' % error)

Miss_classification error = 14.89 %


In [89]:
accepted_outliers = y_test - y_pred
accepted_outliers = 100 * (accepted_outliers[accepted_outliers == -2].size) / len(accepted_outliers)

print('Accepted Outliers = %.2f %%' % accepted_outliers)

Accepted Outliers = 0.00 %
