In [1]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import math

In [2]:
#load database

dataset = pd.read_csv('DiffData_classification_noise.csv')#.dropna()

# print (dataset.head(20)) #head of the data file with 20 rows
# print (dataset.tail(20))
print (dataset.describe().T) #simple statistics like mean, std, percentile

# edit database as needed
# reduce set of importance, rescale values etc...
df = dataset.copy()
df["target"] = [1 if label == "Dgb" else 0 for label in df["Diffusion_type"]]

# setup xdata which is input and y data which is target
#xInput = df.drop(["target"], axis = 1)
xInput = df[["distance","time"]]
yTarget = df[["target"]]

# print(xInput.head(10))
# print(yTarget.head(10))


          count        mean         std       min         25%         50%  \
time      980.0  300.000000  173.647540  0.000000  149.693252  300.000000   
distance  980.0    0.028601    0.016163 -0.000355    0.015133    0.026372   

                 75%         max  
time      450.306748  600.000000  
distance    0.041898    0.064679  


In [3]:
# split data for training and testing
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(xInput,yTarget, test_size=0.2, random_state=0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

from sklearn.svm import SVC
# Create a baseline SVM classifier
baseline_svm = SVC()
baseline_svm.fit(x_train, y_train.values.ravel())
y_pred = baseline_svm.predict(x_test)

from sklearn.metrics import confusion_matrix, accuracy_score,classification_report 
print ("\nConfusion Matrix:\n",confusion_matrix(y_test, y_pred))
print('\n')
print ("accuracy: ",accuracy_score(y_test, y_pred))
print('\n')
print(classification_report(y_test,y_pred))


#get CV score
from sklearn.model_selection import cross_val_score

scores = cross_val_score(baseline_svm, x_train, y_train.values.ravel(), scoring='accuracy', cv=5)
print(scores)
print("Final CV Score: {:5.2f}".format(np.mean(scores)))



Confusion Matrix:
 [[81 17]
 [ 0 98]]


accuracy:  0.9132653061224489


              precision    recall  f1-score   support

           0       1.00      0.83      0.91        98
           1       0.85      1.00      0.92        98

    accuracy                           0.91       196
   macro avg       0.93      0.91      0.91       196
weighted avg       0.93      0.91      0.91       196

[0.92356688 0.91719745 0.93630573 0.89808917 0.91025641]
Final CV Score:  0.92


In [4]:
# split data for training and testing
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(xInput,yTarget, test_size=0.2, random_state=0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

from sklearn.svm import SVC
# Create a baseline SVM classifier
baseline_svm = SVC()
baseline_svm.fit(x_train, y_train.values.ravel())
y_pred = baseline_svm.predict(x_test)

In [5]:
# hyper parameter tuning # grid
from sklearn.model_selection import GridSearchCV
# define the hyperparameter grid
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': [0.1, 1, 'scale', 'auto']
}

# create the GridSearchCV object
grid_search = GridSearchCV(estimator = baseline_svm, param_grid = param_grid, cv = 5, verbose = 1)

# fit the model with the grid of hyperparameters
grid_search.fit(x_train, y_train.values.ravel())

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [6]:
from sklearn.metrics import accuracy_score

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the best model
y_pred_best = best_model.predict(x_test)
accuracy_best = accuracy_score(y_test, y_pred_best)

print(f"Best SVM Accuracy: {accuracy_best:.2f}")
print(f"Best Hyperparameters: {best_params}")


Best SVM Accuracy: 0.93
Best Hyperparameters: {'C': 10, 'gamma': 1, 'kernel': 'rbf'}


In [7]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.svm import SVC
import numpy as np

# Define the hyperparameter distribution
param_dist = {
    'C': uniform(0.1, 10),  # Uniform distribution between 0.1 and 10
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'] + list(np.logspace(-3, 3, 5))
}

# Create the RandomizedSearchCV object
randomized_search = RandomizedSearchCV(estimator=SVC(), param_distributions=param_dist, n_iter=18, cv=5, verbose=3)

# Fit the model with the randomized grid of hyperparameters
randomized_search.fit(x_train, y_train.values.ravel())


Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END C=1.4073512554586243, gamma=auto, kernel=rbf;, score=0.936 total time=   0.0s
[CV 2/5] END C=1.4073512554586243, gamma=auto, kernel=rbf;, score=0.904 total time=   0.0s
[CV 3/5] END C=1.4073512554586243, gamma=auto, kernel=rbf;, score=0.936 total time=   0.0s
[CV 4/5] END C=1.4073512554586243, gamma=auto, kernel=rbf;, score=0.898 total time=   0.0s
[CV 5/5] END C=1.4073512554586243, gamma=auto, kernel=rbf;, score=0.910 total time=   0.0s
[CV 1/5] END C=1.5437226368137391, gamma=1000.0, kernel=rbf;, score=0.771 total time=   0.0s
[CV 2/5] END C=1.5437226368137391, gamma=1000.0, kernel=rbf;, score=0.777 total time=   0.0s
[CV 3/5] END C=1.5437226368137391, gamma=1000.0, kernel=rbf;, score=0.803 total time=   0.0s
[CV 4/5] END C=1.5437226368137391, gamma=1000.0, kernel=rbf;, score=0.796 total time=   0.0s
[CV 5/5] END C=1.5437226368137391, gamma=1000.0, kernel=rbf;, score=0.776 total time=   0.0s
[CV 1/5] END C=9.61

In [8]:
from sklearn.metrics import accuracy_score

# Get the best hyperparameters and model
best_params_rand = randomized_search.best_params_
best_model_rand = randomized_search.best_estimator_

# Evaluate the best model
y_pred_best_rand = best_model_rand.predict(x_test)
accuracy_best_rand = accuracy_score(y_test, y_pred_best_rand)

print(f"Best SVM Accuracy: {accuracy_best_rand:.2f}")
print(f"Best Hyperparameters: {best_params_rand}")


Best SVM Accuracy: 0.91
Best Hyperparameters: {'C': 2.695563349892356, 'gamma': 'scale', 'kernel': 'rbf'}
