<a href="https://colab.research.google.com/github/Natalkina/DataScience/blob/main/HW5_SVM_RF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import zipfile
from google.colab import drive
import os


drive.mount('/content/drive', force_remount=True)

zip_path = '/content/drive/MyDrive/GOIT/data_hw5.zip'
extracted_path = '/content/extracted_data'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_path)


Mounted at /content/drive


In [37]:
def def_dataframe_stat(csv_path):
    df = pd.read_csv(csv_path)
    stats = df.agg(['min', 'max', 'mean', 'std']).stack()
    stats.index = [f"{col}_{stat}" for col, stat in stats.index]
    return stats.to_frame().T
activity_folders = ['walking', 'idle', 'running', 'stairs']
combined_dfs = []

for activity in activity_folders:
    folder_path = f'extracted_data/data/{activity}'
    csv_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.csv')]
    dfs = []

    for csv_path in csv_paths:
        result_df = def_dataframe_stat(csv_path)
        dfs.append(result_df)

    combined_df = pd.concat(dfs, axis=0, ignore_index=True)
    combined_df['activity'] = activity
    combined_dfs.append(combined_df)
final_combined_df = pd.concat(combined_dfs, axis=0, ignore_index=True)

final_combined_df.head(10)

Unnamed: 0,min_accelerometer_X,min_accelerometer_Y,min_accelerometer_Z,max_accelerometer_X,max_accelerometer_Y,max_accelerometer_Z,mean_accelerometer_X,mean_accelerometer_Y,mean_accelerometer_Z,std_accelerometer_X,std_accelerometer_Y,std_accelerometer_Z,activity
0,-7.445967,-28.495789,-26.532543,7.546524,5.272033,9.107543,0.236388,-9.757968,-0.525607,4.062555,6.647374,6.78124,walking
1,-16.333244,-21.35628,-16.127342,0.766145,4.783615,18.679562,-4.333026,-9.368032,-2.459005,4.449873,5.53451,6.185075,walking
2,-9.801862,-16.25184,-10.271125,1.982399,-2.461239,23.467964,-3.638708,-9.450872,-0.654255,3.134902,3.85784,5.816441,walking
3,-14.537593,-20.254946,-19.546263,3.323152,6.191406,32.3313,-3.713726,-9.123983,-1.98192,5.00035,6.332666,8.938634,walking
4,-14.84405,-17.506401,-19.129671,4.223372,0.842759,20.135237,-3.934152,-9.681034,-1.664928,3.972373,4.990199,6.443429,walking
5,-9.849746,-17.0946,-6.574477,0.727837,6.713342,11.128249,-3.260264,-7.202397,-0.472456,2.473124,6.371377,3.392147,walking
6,-13.268666,-19.139248,-19.182344,1.35033,6.502652,5.147534,-4.022578,-9.266678,-2.618459,3.923001,5.873142,5.076592,walking
7,-7.101202,-19.359514,-8.772355,6.727707,13.891158,12.344504,0.781627,-8.497022,-1.670833,3.49064,6.436539,4.574111,walking
8,-11.650186,-26.695349,-11.650186,2.312799,0.234632,12.411542,-2.93593,-11.292172,-1.20572,3.487147,6.614729,5.1702,walking
9,-11.75553,-19.378668,-19.038692,16.132132,-3.907337,26.743233,2.47321,-11.117396,0.075976,5.295236,5.014467,8.091427,walking


In [38]:
final_combined_df.shape

(6462, 13)

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report

In [46]:
X = final_combined_df.drop('activity', axis=1)
y = final_combined_df['activity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [50]:
# Using method SVM
svm_mod = svm.SVC()
svm_mod.fit(X_train, y_train)
svm_pred = svm_mod.predict(X_test)
print(f"SVM method result: \n {classification_report(y_test, svm_pred)}")

SVM method result: 
               precision    recall  f1-score   support

        idle       1.00      1.00      1.00       205
     running       1.00      1.00      1.00       700
      stairs       0.74      0.61      0.67        28
     walking       0.97      0.98      0.98       360

    accuracy                           0.99      1293
   macro avg       0.93      0.90      0.91      1293
weighted avg       0.99      0.99      0.99      1293



In [55]:
# Define best parameters for SVM method
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': [1, 0.1, 0.01, 0.001],
}

# Create a SVM Classifier
svm_classifier = svm.SVC()

# Perform GridSearch to find the best hyperparameters
grid_search = GridSearchCV(svm_classifier, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_params

{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}

In [56]:
# Changing our model according to the best params
svm_mod = svm.SVC(C=100, kernel='rbf', gamma=0.01)
svm_mod.fit(X_train, y_train)
svm_pred = svm_mod.predict(X_test)
print(f"SVM method result after search of best param: \n {classification_report(y_test, svm_pred)}")

SVM method result after search of best param: 
               precision    recall  f1-score   support

        idle       1.00      1.00      1.00       205
     running       1.00      1.00      1.00       700
      stairs       1.00      1.00      1.00        28
     walking       1.00      1.00      1.00       360

    accuracy                           1.00      1293
   macro avg       1.00      1.00      1.00      1293
weighted avg       1.00      1.00      1.00      1293



In [52]:
# Using method RF
rf_mod = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
rf_mod.fit(X_train, y_train)
rf_pred = rf_mod.predict(X_test)
print(f"RandomForest method result: \n {classification_report(y_test, rf_pred)}")

RandomForest method result: 
               precision    recall  f1-score   support

        idle       1.00      1.00      1.00       205
     running       1.00      1.00      1.00       700
      stairs       1.00      0.93      0.96        28
     walking       0.99      1.00      1.00       360

    accuracy                           1.00      1293
   macro avg       1.00      0.98      0.99      1293
weighted avg       1.00      1.00      1.00      1293



In [54]:
# Define best parameters for RandomForest method
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 5, 10, 15],
}

# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=0)

# Perform GridSearch to find the best hyperparameters
grid_search = GridSearchCV(rf_classifier, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_params

{'max_depth': None, 'n_estimators': 150}

In [57]:
# Changing our model according to the best params
rf_mod = RandomForestClassifier(n_estimators=150, random_state=0)
rf_mod.fit(X_train, y_train)
rf_pred = rf_mod.predict(X_test)
print(f"RandomForest method result after searching best param: \n {classification_report(y_test, rf_pred)}")

RandomForest method result after searching best param: 
               precision    recall  f1-score   support

        idle       1.00      1.00      1.00       205
     running       1.00      1.00      1.00       700
      stairs       1.00      1.00      1.00        28
     walking       1.00      1.00      1.00       360

    accuracy                           1.00      1293
   macro avg       1.00      1.00      1.00      1293
weighted avg       1.00      1.00      1.00      1293



Як бачимо після покращення параметрів обидва методи показують відмінний результат