<font color='blue'>The Preprocess Operations:</font> 

In [2]:
import numpy as np
import pandas as pd
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.model_selection import ShuffleSplit

# Read Data and split them to Train and Test sets
Xy = pd.read_csv("Data.csv")

#Generate a smaller resample of dataset
#Xy = resample(Xy, n_samples=10000, random_state=0)

#find out if there is any Inf value and then replcae them with NaN
#print(np.where(X.values[738] >= np.finfo(np.float64).max)) 
Xy = Xy.replace([np.inf, -np.inf], -1) 

#Find out if there is any NaN in the dataframe and then remove it and reset the indexes 
#print(np.where(np.isnan(X))) 
Xy = Xy.dropna()
Xy = Xy.reset_index(drop=True)

#Separate examples and labels
y = Xy['marker']
X = Xy.drop(labels='marker', axis=1) #Removes the marker column from the dataframe
print('The Number of Examples: ', X.shape[0])
print('The Number of Features: ', X.shape[1])

#dreate sclaed data
scaler = StandardScaler()
scaledX = scaler.fit_transform(X)
scaledy = y

#Create oversampled data
attackClass = Xy[Xy['marker'] == 'Attack']  # separate classes
noEvClass = Xy[Xy['marker'] == 'NoEvents']  # separate classes
naturalClass = Xy[Xy['marker'] == 'Natural']  # separate classes
oversampledNoEvClass = resample(noEvClass, replace=True, n_samples=attackClass.shape[0], random_state=27)
overampledNaturalClass = resample(naturalClass, replace=True, n_samples=attackClass.shape[0], random_state=27)
oversampledXy = pd.concat([pd.concat([attackClass, oversampledNoEvClass]), overampledNaturalClass])
oversampledy = oversampledXy['marker']
oversampledX = oversampledXy.drop(labels='marker', axis=1) #Removes the marker column from the dataframe

#Create subsampled data
attackClass = Xy[Xy['marker'] == 'Attack']  # separate classes
noEvClass = Xy[Xy['marker'] == 'NoEvents']  # separate classes
naturalClass = Xy[Xy['marker'] == 'Natural']  # separate classes
subsamplesAttackClass = resample(attackClass, replace=True, n_samples=naturalClass.shape[0], random_state=27)
subsampledXy = pd.concat([pd.concat([subsamplesAttackClass, noEvClass]), naturalClass])
subsampledy = subsampledXy['marker']
subsampledX = subsampledXy.drop(labels='marker', axis=1) #Removes the marker column from the dataframe

#Create scaled oversampled data
scaler = StandardScaler()
scaledOversampledX = scaler.fit_transform(oversampledX)
scaledOversampledy = oversampledy

#Create scaled undersampled data
scaler = StandardScaler()
scaledSubsampledX = scaler.fit_transform(subsampledX)
scaledSubsampledy = subsampledy

#Model Initialization
crossNumber = 2     #The number of folds in cross validation
kFold = ShuffleSplit(n_splits=crossNumber, test_size=0.3, random_state=0) #the split methos in cross validation

The Number of Examples:  78377
The Number of Features:  128


In [3]:
print(oversampledX.shape)

(166989, 128)


<font color='blue'>Using unscaled data and SVC method:</font> 

In [5]:
#Scenario 1
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit

# Intitialization
score1 = np.array([])

#The Classifier
Cl = SVC(kernel='rbf')

#Cross Validation
output1 = cross_validate(Cl, X, y, cv=kFold, scoring='f1_macro', return_estimator=True)

#Calculating the scores and show the results
score1 = np.append(score1, output1['test_score'].mean())
print('The Mean Fit Time: ', np.mean(output1['fit_time']))
print('The Mean Score Time: ', np.mean(output1['score_time']))
print('The averaged F1 Score: ', np.mean(output1['test_score']))



The Mean Fit Time:  4096.359447002411
The Mean Score Time:  231.83962523937225
The averaged F1 Score:  0.29416603541778796


<font color='blue'>SVC method, Scaled data:</font> 

In [7]:
#Scenario 2
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

# Intitialization
score2 = np.array([])

#The Classifier
Cl = SVC(kernel='rbf')

#Cross Validation
output2 = cross_validate(Cl, scaledX, scaledy, cv=kFold, scoring='f1_macro', return_estimator=True)

#Calculating the scores and show the results
score2 = np.append(score2, output2['test_score'].mean())
print('The Mean Fit Time: ', np.mean(output2['fit_time']))
print('The Mean Score Time: ', np.mean(output2['score_time']))
print('The averaged F1 Score: ', np.mean(output2['test_score']))



The Mean Fit Time:  694.760036110878
The Mean Score Time:  108.71733283996582
The averaged F1 Score:  0.3395184594310598


<font color='blue'> SVC Method, Scaling, Feature Selection (L1 norm)</font> 

In [8]:
#Scenario 3
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

# Intitialization
score3 = np.array([])

#Feature Selection (Sparse Features)
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(scaledX, scaledy)
model = SelectFromModel(lsvc, prefit=True)
fSelX = model.transform(scaledX)          #feature selected X
fSely = scaledy
print('\nThe numer of features after feture sparsing: ',fSelX.shape[1])

#The Classifier
Cl = SVC(kernel='rbf')

#Cross Validation
output3 = cross_validate(Cl, fSelX, fSely, cv=kFold, scoring='f1_macro', return_estimator=True)

#Calculating the scores and show the results
score3 = np.append(score3, output3['test_score'].mean())
print('The Mean Fit Time: ', np.mean(output3['fit_time']))
print('The Mean Score Time: ', np.mean(output3['score_time']))
print('The averaged F1 Score: ', np.mean(output3['test_score']))




The numer of features after feture sparsing:  91




The Mean Fit Time:  521.1354594230652
The Mean Score Time:  76.78919219970703
The averaged F1 Score:  0.3421889255632894


<font color='blue'> SVC Method, Scaling, Feature Selection (L1 norm), Oversampling</font> 

In [9]:
#Scenario 4
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

# Intitialization
score4 = np.array([])

#Feature Selection (Sparse Features)
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(scaledOversampledX, scaledOversampledy)
model = SelectFromModel(lsvc, prefit=True)
fSelX = model.transform(scaledOversampledX)          #feature selected X
fSely = scaledOversampledy
print('\nThe numer of features after feture sparsing: ',fSelX.shape[1])

#The Classifier
Cl = SVC(kernel='rbf')

#Cross Validation
output4 = cross_validate(Cl, fSelX, fSely, cv=kFold, scoring='f1_macro', return_estimator=True)

#Calculating the scores and show the results
score4 = np.append(score4, output4['test_score'].mean())
print('The Mean Fit Time: ', np.mean(output4['fit_time']))
print('The Mean Score Time: ', np.mean(output4['score_time']))
print('The averaged F1 Score: ', np.mean(output4['test_score']))




The numer of features after feture sparsing:  117




The Mean Fit Time:  2436.443232178688
The Mean Score Time:  510.4253033399582
The averaged F1 Score:  0.6853253666723588


<font color='blue'> SVC Method, Scaling, Feature Selection (L1 norm), Weigh the classes</font> 

In [11]:
#Scenario 5
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

# Intitialization
score5 = np.array([])

#Feature Selection (Sparse Features)
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(scaledX, scaledy)
model = SelectFromModel(lsvc, prefit=True)
fSelX = model.transform(scaledX)          #feature selected X
fSely = scaledy
print('\nThe numer of features after feture sparsing: ',fSelX.shape[1])

#The Classifier
Cl = SVC(kernel='rbf', class_weight='balanced')

#Cross Validation
output5 = cross_validate(Cl, fSelX, fSely, cv=kFold, scoring='f1_macro', return_estimator=True)

#Calculating the scores and show the results
score5 = np.append(score5, output5['test_score'].mean())
print('The Mean Fit Time: ', np.mean(output5['fit_time']))
print('The Mean Score Time: ', np.mean(output5['score_time']))
print('The averaged F1 Score: ', np.mean(output5['test_score']))




The numer of features after feture sparsing:  88




The Mean Fit Time:  590.6965724229813
The Mean Score Time:  108.55951690673828
The averaged F1 Score:  0.5274465321053812
