<a href="https://colab.research.google.com/github/KORIAnfal/Harth/blob/main/SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SVM

## Data Preprocessing

In [1]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import seaborn as sns
from joblib import dump, load
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

In [2]:
# Combine all CSV files of the working directory into one dataframe.
# glob returns paths in arbitrary, filesystem-dependent order, so they are
# sorted: the row order feeds the seeded sample/split cells further down, and
# those are only reproducible when the concatenation order is fixed.
csv_files = sorted(glob.glob('*.csv'))
if not csv_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No CSV files found in the working directory")

# Build the list in one pass so `df` is not rebound to every single file.
dfs = [pd.read_csv(csv_file) for csv_file in csv_files]

df = pd.concat(dfs, ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6117434 entries, 0 to 6117433
Data columns (total 10 columns):
 #   Column      Dtype  
---  ------      -----  
 0   timestamp   object 
 1   back_x      float64
 2   back_y      float64
 3   back_z      float64
 4   thigh_x     float64
 5   thigh_y     float64
 6   thigh_z     float64
 7   label       int64  
 8   Unnamed: 0  float64
 9   index       float64
dtypes: float64(8), int64(1), object(1)
memory usage: 466.7+ MB


In [3]:
# Shape of the combined dataframe as (rows, columns).
df.shape

(6117434, 10)

In [4]:
# Drop the 'Unnamed: 0' and 'index' columns, which are not used as features.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it once the columns are gone is a no-op rather
# than a KeyError.
df = df.drop(columns=['Unnamed: 0', 'index'], errors='ignore')
df.head()

Unnamed: 0,timestamp,back_x,back_y,back_z,thigh_x,thigh_y,thigh_z,label
0,2019-01-12 00:00:00.000,-0.306885,-0.724121,-0.303223,-4.640381,-0.52124,-1.580811,6
1,2019-01-12 00:00:00.020,-0.97998,-0.288574,-0.335693,1.295166,-1.83667,-0.389893,6
2,2019-01-12 00:00:00.040,-1.36792,0.059814,-0.255371,-0.798584,-0.893799,0.170898,6
3,2019-01-12 00:00:00.060,-0.915039,-0.089355,-0.291016,-1.01001,0.139648,0.00293,6
4,2019-01-12 00:00:00.080,-0.539795,-0.039307,-0.209717,-1.092529,0.860596,0.0,6


In [5]:
# Parse the timestamp strings; values that cannot be parsed become NaT.
parsed_ts = pd.to_datetime(df['timestamp'], errors='coerce')

# Replace the raw timestamp column with one numeric column per component.
df = df.drop(columns=['timestamp']).assign(
    year=parsed_ts.dt.year,
    month=parsed_ts.dt.month,
    day=parsed_ts.dt.day,
    hour=parsed_ts.dt.hour,
    minute=parsed_ts.dt.minute,
    second=parsed_ts.dt.second,
    # Millisecond part, derived from the microsecond component.
    millisecond=parsed_ts.dt.microsecond // 1000,
)

df.head()


Unnamed: 0,back_x,back_y,back_z,thigh_x,thigh_y,thigh_z,label,year,month,day,hour,minute,second,millisecond
0,-0.306885,-0.724121,-0.303223,-4.640381,-0.52124,-1.580811,6,2019,1,12,0,0,0,0
1,-0.97998,-0.288574,-0.335693,1.295166,-1.83667,-0.389893,6,2019,1,12,0,0,0,20
2,-1.36792,0.059814,-0.255371,-0.798584,-0.893799,0.170898,6,2019,1,12,0,0,0,40
3,-0.915039,-0.089355,-0.291016,-1.01001,0.139648,0.00293,6,2019,1,12,0,0,0,60
4,-0.539795,-0.039307,-0.209717,-1.092529,0.860596,0.0,6,2019,1,12,0,0,0,80


In [6]:
# Put the time components first, then the sensor axes, with the target last.
time_cols = ['year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond']
sensor_cols = ['back_x', 'back_y', 'back_z', 'thigh_x', 'thigh_y', 'thigh_z']
new_order = time_cols + sensor_cols + ['label']
df = df.loc[:, new_order]
df.head()

Unnamed: 0,year,month,day,hour,minute,second,millisecond,back_x,back_y,back_z,thigh_x,thigh_y,thigh_z,label
0,2019,1,12,0,0,0,0,-0.306885,-0.724121,-0.303223,-4.640381,-0.52124,-1.580811,6
1,2019,1,12,0,0,0,20,-0.97998,-0.288574,-0.335693,1.295166,-1.83667,-0.389893,6
2,2019,1,12,0,0,0,40,-1.36792,0.059814,-0.255371,-0.798584,-0.893799,0.170898,6
3,2019,1,12,0,0,0,60,-0.915039,-0.089355,-0.291016,-1.01001,0.139648,0.00293,6
4,2019,1,12,0,0,0,80,-0.539795,-0.039307,-0.209717,-1.092529,0.860596,0.0,6


In [7]:
# Outlier check

# Sensor columns in which outliers are detected.
columns_to_check = [
    'back_x', 'back_y', 'back_z',
    'thigh_x', 'thigh_y', 'thigh_z',
]

def count_outliers_zscore(data, threshold=3):
    """Count the values whose absolute z-score exceeds ``threshold``.

    Every column of ``data`` is standardised with its own mean and standard
    deviation and the per-column counts are added up, so a row that is extreme
    in several columns contributes once per such column.
    """
    def column_hits(values):
        # Number of entries of one column lying beyond the threshold.
        spread = np.abs((values - values.mean()) / values.std())
        return int((spread > threshold).sum())

    return sum(column_hits(data[name]) for name in data.columns)



# Count the outliers found in the sensor columns only.
outlier_count = count_outliers_zscore(df.loc[:, columns_to_check])
print("Number of outliers detected in specified columns:", outlier_count)

Number of outliers detected in specified columns: 619832


In [8]:
def remove_outliers_zscore(data, threshold=3, columns=None):
    """Return a copy of ``data`` without the rows that are z-score outliers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    threshold : float, default 3
        A row is dropped when the absolute z-score of any checked column is
        greater than this value.
    columns : list of str, optional
        Columns to check. When omitted, the notebook-level
        ``columns_to_check`` list is used.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that pass the check in every column.

    Notes
    -----
    Every z-score is computed from the mean and standard deviation of the
    unfiltered input. Filtering column by column and recomputing the
    statistics in between would make the result depend on the column order
    and disagree with ``count_outliers_zscore``.
    """
    if columns is None:
        columns = columns_to_check

    # Positional boolean mask: independent of the frame's index labels.
    keep = np.ones(len(data), dtype=bool)
    for column in columns:
        std = data[column].std()
        if not std > 0:
            # Constant (or too short) column: z-scores would all be NaN and
            # the comparison below would discard every row.
            continue
        z_scores = np.abs((data[column] - data[column].mean()) / std)
        # NaN z-scores compare False, so rows with a missing value in a
        # checked column are dropped.
        keep &= (z_scores <= threshold).to_numpy()
    return data[keep].copy()

# Filter the z-score outliers of the sensor columns out of the dataframe.
df_without_outliers = remove_outliers_zscore(data=df)

## Modeling

To train the SVM model, we'll encode the labels into 12 classes in order and then proceed to train the model in two stages:

1. **Sampling**: Given the size of our dataset, training the SVM model demands substantial resources and takes a long time, so we first train on a random sample of the data to reduce the running time.

2. **Undersampling**: We then evaluate the SVM model's performance on a balanced dataset to gauge its effectiveness.

In [9]:
# Encoder used to map the raw `label` values to consecutive integer class ids.
label_encoder = LabelEncoder()


In [10]:
# Feature sets: sensor readings only, and time components plus sensor readings.
sensor_features = ['back_x', 'back_y', 'back_z', 'thigh_x', 'thigh_y', 'thigh_z']
time_features = ['year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond']

X = df_without_outliers[sensor_features]
Xt = df_without_outliers[time_features + sensor_features]

# Encode the raw labels as consecutive integers and show the resulting classes.
y = label_encoder.fit_transform(df_without_outliers['label'])
print(np.unique(y))

# Hold out 20% for testing, then set 30% of the remainder aside for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42
)


[ 0  1  2  3  4  5  6  7  8  9 10 11]


### Sampling

In [11]:
# Draw a seeded 10% random sample of the outlier-free data.
df_sample=df_without_outliers.sample(frac=0.1,random_state=42)

In [12]:
# Features and target of the 10% sample (sensor readings only)
X2 = df_sample[['back_x','back_y','back_z','thigh_x','thigh_y','thigh_z']]
# Reuse the encoder fitted on the full label set instead of re-fitting it:
# fitting on the sample would silently renumber the classes if the sample
# happened to miss one, and it would overwrite the shared encoder's mapping.
y2 = label_encoder.transform(df_sample['label'])
# Split data into train and test sets
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(X2, y2, test_size=0.2, random_state=42)


#### Hyperparameter Tuning

In [None]:
# Hyperparameter tuning
# Define the parameter grid to search.
# decision_function_shape is deliberately not searched: SVC always trains
# one-vs-one internally and the option only changes the shape of
# decision_function(), so it cannot change predict() or the accuracy score and
# would only double the number of fits.
param_grid = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [1, 12, 100],
}

# Create SVM classifier (prototype; GridSearchCV fits clones of it)
svm_classifier = SVC()

# Create GridSearchCV object; n_jobs=-1 runs the independent fits in parallel
grid_search = GridSearchCV(
    estimator=svm_classifier,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

# Run the search on the training part of the 10% sample
grid_search.fit(X_train_sample, y_train_sample)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Predict on the test set using the best model
best_svm_classifier = grid_search.best_estimator_
y_pred = best_svm_classifier.predict(X_test_sample)

# Calculate accuracy
accuracy = accuracy_score(y_test_sample, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test_sample, y_pred))

# Output support vectors of the best model
print("\nSupport vectors:")
print(best_svm_classifier.support_vectors_)


#### Without Timestamp

In [None]:
# Linear-kernel SVM with C=10, trained on the sensor-only sample.
# fit() returns the estimator itself, so construction and training are chained.
svm_model_s = SVC(kernel='linear', C=10, decision_function_shape='ovo').fit(
    X_train_sample, y_train_sample
)

# Evaluate on the held-out part of the sample.
y_pred_sample = svm_model_s.predict(X_test_sample)
accuracy = accuracy_score(y_test_sample, y_pred_sample)
print("Accuracy:", accuracy)
print(classification_report(y_test_sample, y_pred_sample))

# Show the full hyperparameter set of the fitted model.
print("Parameters of the SVM model:")
print(svm_model_s.get_params())

In [None]:
# Persist the SVM trained on the sensor-only sample to disk with joblib.
dump(svm_model_s, 'SVM_sampeled.joblib')

#### With Timestamp

In [None]:
# Features (time components + sensor readings) and target of the 10% sample
X_tsample = df_sample[['year','month','day','hour','minute','second','millisecond','back_x','back_y','back_z','thigh_x','thigh_y','thigh_z']]
# Reuse the encoder fitted on the full label set instead of re-fitting it:
# fitting on the sample would silently renumber the classes if the sample
# happened to miss one, and it would overwrite the shared encoder's mapping.
y_tsample = label_encoder.transform(df_sample['label'])
# Split data into train and test sets
X_train_tsample, X_test_tsample, y_train_tsample, y_test_tsample = train_test_split(X_tsample, y_tsample, test_size=0.2, random_state=42)


In [None]:
# Linear-kernel SVM with C=1.0 on the sample that includes the time components.
svm_model_ts = SVC(kernel='linear', C=1.0, decision_function_shape='ovo')
svm_model_ts.fit(X_train_tsample, y_train_tsample)

# Evaluate on the held-out part of the sample.
y_pred_tsample = svm_model_ts.predict(X_test_tsample)
accuracy = accuracy_score(y_test_tsample, y_pred_tsample)
print("Accuracy:", accuracy)
print(classification_report(y_test_tsample, y_pred_tsample))

# Show the full hyperparameter set of the fitted model.
print("Parameters of the SVM model:")
print(svm_model_ts.get_params())

### Undersampling

#### Without Timestamp

In [None]:
# Define the RandomUnderSampler; the seed makes the drawn subset reproducible
under_sampler = RandomUnderSampler(random_state=42)

# Balance the classes of the full sensor-only dataset
X_usampled, y_usampled = under_sampler.fit_resample(X, y)
# y is already integer-encoded, so it is not passed through label_encoder
# again: that left the values unchanged but re-fitted the shared encoder on the
# encoded ids, discarding its mapping back to the original labels.
# Split the undersampled data into train and test sets
X_train_us, X_test_us, y_train_us, y_test_us = train_test_split(X_usampled, y_usampled, test_size=0.2, random_state=42)


In [None]:
# RBF-kernel SVM with C=1.0 for the class-balanced, sensor-only data.
svm_classifier_us = SVC(kernel='rbf', C=1.0, decision_function_shape='ovo')
svm_classifier_us.fit(X_train_us, y_train_us)

# Score the fitted model on the held-out part of the undersampled data.
y_pred = svm_classifier_us.predict(X_test_us)
accuracy = accuracy_score(y_test_us, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test_us, y_pred))

# Report the hyperparameters, followed by the learned support vectors.
print("Parameters of the SVM model:")
print(svm_classifier_us.get_params())
print("\nSupport vectors:")
print(svm_classifier_us.support_vectors_)


#### With Timestamp

In [None]:
# Define the RandomUnderSampler; the seed makes the drawn subset reproducible
under_sampler = RandomUnderSampler(random_state=42)

# Balance the classes of the full dataset that includes the time components
X_tusampled, y_tusampled = under_sampler.fit_resample(Xt, y)
# y is already integer-encoded, so it is not passed through label_encoder
# again: that left the values unchanged but re-fitted the shared encoder on the
# encoded ids, discarding its mapping back to the original labels.
# Split the undersampled data into train and test sets
X_train_tus, X_test_tus, y_train_tus, y_test_tus = train_test_split(X_tusampled, y_tusampled, test_size=0.2, random_state=42)


In [None]:
# RBF-kernel SVM with C=10 for the class-balanced data with time components.
# fit() returns the estimator itself, so construction and training are chained.
svm_classifier_tus = SVC(kernel='rbf', C=10, decision_function_shape='ovo').fit(
    X_train_tus, y_train_tus
)

# Score the fitted model on the held-out part of the undersampled data.
y_pred = svm_classifier_tus.predict(X_test_tus)
accuracy = accuracy_score(y_test_tus, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test_tus, y_pred))

# Report the hyperparameters, followed by the learned support vectors.
print("Parameters of the SVM model:")
print(svm_classifier_tus.get_params())
print("\nSupport vectors:")
print(svm_classifier_tus.support_vectors_)


In [None]:
# Save the model trained in the previous cell.
# `svm_classifier` is only the unfitted SVC() prototype handed to GridSearchCV
# (which fits clones of it), so dumping it would persist an untrained model.
# NOTE(review): assumes 'SVM_all' is meant to hold the undersampled model that
# uses all features (svm_classifier_tus) - confirm.
dump(svm_classifier_tus, 'SVM_all.joblib')