In [47]:
!pip install numpy pandas matplotlib seaborn scikit-learn tensorflow keras torch opencv-python labelme statsmodels scipy missingno



In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import find_peaks
from scipy import interpolate

import os
import warnings

In [49]:
warnings.filterwarnings('ignore')

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.float_format = '{:.7f}'.format

# **Collecting Data**

**Define columns, file to read and what class should be used for that file**
* Since my student ID ends with 9, thus I will be doing L5 (x,y,z) and T12 (x,y,z)
* Boning dataset will have a class of '0'
* Slicing dataset will have a class of '1'
* There will also be a 'Frame' column as well

In [50]:
contents_to_read = {
    'boning': {
        'fName': 'ampc2/Boning.csv',
        'class': 0
    },
    'slicing': {
        'fName': 'ampc2/Slicing.csv',
        'class': 1
    }
}
columns_to_read = [f'L5 {k}' for k in ['x', 'y', 'z']] + [f'T12 {k}' for k in ['x', 'y', 'z']] + ['Frame']

In [51]:
boning_df = pd.read_csv(contents_to_read['boning']['fName'], usecols=columns_to_read)
boning_df['class'] = contents_to_read['boning']['class']
slicing_df = pd.read_csv(contents_to_read['slicing']['fName'], usecols=columns_to_read)
slicing_df['class'] = contents_to_read['slicing']['class']

In [52]:
print(f"Shape of boning: {boning_df.shape}")
print(f"Shape of slicing: {slicing_df.shape}")

Shape of boning: (54180, 8)
Shape of slicing: (17880, 8)


Combine 2 dataset and save it

In [53]:
concatenated_df = pd.concat([boning_df, slicing_df], ignore_index=True)
concatenated_df.to_csv("new_processed_data/combined_data.csv", index=False)
concatenated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72060 entries, 0 to 72059
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Frame   72060 non-null  int64  
 1   L5 x    72060 non-null  float64
 2   L5 y    72060 non-null  float64
 3   L5 z    72060 non-null  float64
 4   T12 x   72060 non-null  float64
 5   T12 y   72060 non-null  float64
 6   T12 z   72060 non-null  float64
 7   class   72060 non-null  int64  
dtypes: float64(6), int64(2)
memory usage: 4.4 MB


# **Creating Composite Columns**

In [54]:
concatenated_df.head()

Unnamed: 0,Frame,L5 x,L5 y,L5 z,T12 x,T12 y,T12 z,class
0,0,0.0526543,0.039386,-0.0770018,0.0994581,0.0743958,-0.1454479,0
1,1,-0.0535254,0.1172787,0.1502453,-0.1035941,0.2266579,0.2843477,0
2,2,0.0739294,-0.0223806,0.0327013,0.1371887,-0.0367908,0.0617175,0
3,3,-0.0372945,-0.0099753,0.0158462,-0.0281503,0.017239,0.0612578,0
4,4,0.0917448,-0.0144044,0.0541684,0.1729826,-0.0230234,0.1022738,0


Calculating Root-Mean-Square value, Roll and Pitch

In [55]:
def calc_rmsq_for_cols(df: pd.DataFrame, cols: list[str]):
  return np.sqrt(np.mean(df[cols] ** 2, axis=1))

def calc_roll_for_col(df: pd.DataFrame, col: str):
  return 180 * np.arctan2(df[f'{col} y'], np.sqrt(df[f'{col} x'] ** 2 + df[f'{col} z'] ** 2)) / np.pi

def calc_pitch_for_col(df: pd.DataFrame, col: str):
  return 180 * np.arctan2(df[f'{col} x'], np.sqrt(df[f'{col} y'] ** 2 + df[f'{col} z'] ** 2)) / np.pi

Calculate L5 composite features

In [56]:
concatenated_df['l5_xy_rmsq'] = calc_rmsq_for_cols(concatenated_df, ['L5 x', 'L5 y'])
concatenated_df['l5_yz_rmsq'] = calc_rmsq_for_cols(concatenated_df, ['L5 y', 'L5 z'])
concatenated_df['l5_xz_rmsq'] = calc_rmsq_for_cols(concatenated_df, ['L5 x', 'L5 z'])
concatenated_df['l5_xyz_rmsq'] = calc_rmsq_for_cols(concatenated_df, ['L5 x', 'L5 y', 'L5 z'])

concatenated_df['l5_roll'] = calc_roll_for_col(concatenated_df, 'L5')
concatenated_df['l5_pitch'] = calc_pitch_for_col(concatenated_df, 'L5')

Calculate T12 composite features

In [57]:
concatenated_df['t12_xy_rmsq'] = calc_rmsq_for_cols(concatenated_df, ['T12 x', 'T12 y'])
concatenated_df['t12_yz_rmsq'] = calc_rmsq_for_cols(concatenated_df, ['T12 y', 'T12 z'])
concatenated_df['t12_xz_rmsq'] = calc_rmsq_for_cols(concatenated_df, ['T12 x', 'T12 z'])
concatenated_df['t12_xyz_rmsq'] = calc_rmsq_for_cols(concatenated_df, ['T12 x', 'T12 y', 'T12 z'])

concatenated_df['t12_roll'] = calc_roll_for_col(concatenated_df, 'T12')
concatenated_df['t12_pitch'] = calc_pitch_for_col(concatenated_df, 'T12')

In [58]:
concatenated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72060 entries, 0 to 72059
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Frame         72060 non-null  int64  
 1   L5 x          72060 non-null  float64
 2   L5 y          72060 non-null  float64
 3   L5 z          72060 non-null  float64
 4   T12 x         72060 non-null  float64
 5   T12 y         72060 non-null  float64
 6   T12 z         72060 non-null  float64
 7   class         72060 non-null  int64  
 8   l5_xy_rmsq    72060 non-null  float64
 9   l5_yz_rmsq    72060 non-null  float64
 10  l5_xz_rmsq    72060 non-null  float64
 11  l5_xyz_rmsq   72060 non-null  float64
 12  l5_roll       72060 non-null  float64
 13  l5_pitch      72060 non-null  float64
 14  t12_xy_rmsq   72060 non-null  float64
 15  t12_yz_rmsq   72060 non-null  float64
 16  t12_xz_rmsq   72060 non-null  float64
 17  t12_xyz_rmsq  72060 non-null  float64
 18  t12_roll      72060 non-nu

In [59]:
concatenated_df.to_csv("new_processed_data/composited_data.csv", index=False)
concatenated_df.head()

Unnamed: 0,Frame,L5 x,L5 y,L5 z,T12 x,T12 y,T12 z,class,l5_xy_rmsq,l5_yz_rmsq,l5_xz_rmsq,l5_xyz_rmsq,l5_roll,l5_pitch,t12_xy_rmsq,t12_yz_rmsq,t12_xz_rmsq,t12_xyz_rmsq,t12_roll,t12_pitch
0,0,0.0526543,0.039386,-0.0770018,0.0994581,0.0743958,-0.1454479,0,0.0464959,0.0611577,0.0659612,0.0584608,22.8904439,31.3326261,0.0878255,0.1155202,0.1245933,0.110426,22.8904439,31.3326265
1,1,-0.0535254,0.1172787,0.1502453,-0.1035941,0.2266579,0.2843477,0,0.0911572,0.1347738,0.1127799,0.1142992,36.3275373,-15.6862032,0.176218,0.2571258,0.2139922,0.2182958,36.8316995,-15.9016284
2,2,0.0739294,-0.0223806,0.0327013,0.1371887,-0.0367908,0.0617175,0,0.0546189,0.0280202,0.0571618,0.048428,-15.4749671,61.8084519,0.1004348,0.0508065,0.1063715,0.0894117,-13.7429105,62.3571236
3,3,-0.0372945,-0.0099753,0.0158462,-0.0281503,0.017239,0.0612578,0,0.0272982,0.0132402,0.0286529,0.0240935,-13.8296942,-63.3400863,0.0233412,0.0449984,0.0476706,0.0401752,14.3437502,-23.8624959
4,4,0.0917448,-0.0144044,0.0541684,0.1729826,-0.0230234,0.1022738,0,0.0656681,0.039634,0.075337,0.0620721,-7.6996265,58.5774103,0.1233958,0.0741283,0.1420966,0.1167804,-6.535875,58.7827535


# **Data pre-processing and Feature computation**

In [60]:
composited_df = concatenated_df.copy()

**Create statistical features for the 18 columns per minute (1 min = 60fpm)**

These includes:

* Mean
* Standard deviation
* Min
* Max
* Area under the curve (AUC)
* Number of peaks

In [61]:
new_cols = {}
FPM = 60
num_of_min = len(composited_df) // FPM
print(num_of_min)

for column in concatenated_df.columns:
  if column not in ['Frame', 'class']:
    values = {
      'mean': [],
      'max': [],
      'min': [],
      'std': [],
      'auc': [],
      'peak': []
    }
    for i in range(num_of_min):
      start, end = i * FPM, (i + 1) * FPM
      values['mean'].append(np.mean(concatenated_df[column][start:end]))
      values['max'].append(np.max(concatenated_df[column][start:end]))
      values['min'].append(np.min(concatenated_df[column][start:end]))
      values['std'].append(np.std(concatenated_df[column][start:end]))
      values['auc'].append(np.trapz(concatenated_df[column][start:end]))
      peaks, _ = find_peaks(concatenated_df[column][start:end])
      values['peak'].append(len(peaks))
    new_cols[f'{column}_mean'] = values['mean']
    new_cols[f'{column}_max'] = values['max']
    new_cols[f'{column}_min'] = values['min']
    new_cols[f'{column}_std'] = values['std']
    new_cols[f'{column}_auc'] = values['auc']
    new_cols[f'{column}_peak'] = values['peak']

# print(new_cols)

new_features_df = pd.DataFrame(new_cols)
new_features_df['class'] = composited_df['class'][::FPM].reset_index(drop=True)
new_features_df["Minute"] = range(1, num_of_min + 1)

1201


In [62]:
new_features_df.to_csv("new_processed_data/full_processed_data.csv", index=False)

In [63]:
print("Shape: ", new_features_df.shape)
print("Info: ", new_features_df.info())

Shape:  (1201, 110)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1201 entries, 0 to 1200
Columns: 110 entries, L5 x_mean to Minute
dtypes: float64(90), int64(20)
memory usage: 1.0 MB
Info:  None


# **Training**

In [64]:
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [65]:
all_data_df = new_features_df.copy()

In [66]:
print(all_data_df.head())
print("Info: ", all_data_df.info())

   L5 x_mean  L5 x_max   L5 x_min  L5 x_std   L5 x_auc  L5 x_peak  L5 y_mean  \
0  0.0114493 0.3751575 -0.3151581 0.1327415  0.6096795         16 -0.0122954   
1 -0.0315213 1.3709298 -0.8050507 0.3482058 -1.5056593         16  0.0111132   
2  0.0224071 1.2923841 -1.3034076 0.6003772  1.5930189         18  0.0255545   
3  0.0350186 1.3752680 -1.1076449 0.5635564  1.9601368         13 -0.1378316   
4 -0.0722189 1.8447772 -1.8701316 0.6936417 -3.9543328         14  0.0276975   

   L5 y_max   L5 y_min  L5 y_std   L5 y_auc  L5 y_peak  L5 z_mean  L5 z_max  \
0 0.3200817 -0.2765721 0.1208335 -0.7624985         17  0.0059038 0.2449787   
1 0.4042456 -0.5005891 0.1996808  0.6701410         15  0.0596023 1.4091193   
2 0.8720929 -0.8778820 0.4277349  2.1984654         16 -0.0528891 0.4049394   
3 1.0463692 -1.1207336 0.4760387 -8.7487254         17 -0.0053414 1.0672191   
4 3.1460309 -2.4863657 0.9055351  1.2359550         15  0.0397556 1.5277607   

    L5 z_min  L5 z_std   L5 z_auc  L5 z_peak

test split, with 30% of test data

In [67]:
X_vals = all_data_df.drop(['class', 'Minute'], axis=1)
Y_vals = all_data_df['class']

X_train, X_test, Y_train, Y_test = train_test_split(X_vals, Y_vals, test_size=0.3, random_state=42)

Predict with Support Vector Machine

In [68]:
svc = svm.SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc = accuracy_score(Y_test, Y_pred)

print(f"Accuracy of the SVM is: {acc * 100:2f}")

Accuracy of the SVM is: 77.562327


10-fold cross validation mean accuracy of SVM

In [69]:
svc = svm.SVC()
cross_val = cross_val_score(svc, X_vals, Y_vals, cv = 10)
print(f"10-fold cross validation mean accuracy score: {cross_val.mean()*100:2f}")

10-fold cross validation mean accuracy score: 75.020661


Find the best set of values for the model using GridSearchCV

In [70]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

grid = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(X_train, Y_train)

print("Best params to fit: ", grid.best_params_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.744 total time=   0.1s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.744 total time=   0.1s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.744 total time=   0.1s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.744 total time=   0.1s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.738 total time=   0.1s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.744 total time=   0.1s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.744 total time=   0.1s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.744 total time=   0.1s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.744 total time=   0.1s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.738 total time=   0.1s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.744 total time=   0.1s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

SVM training and predicting with hyperparameter tuning

In [71]:
svc_with_hyp = svm.SVC(C=grid.best_params_['C'], gamma=grid.best_params_['gamma'], kernel=grid.best_params_['kernel'])
svc_with_hyp.fit(X_train, Y_train)

y_pred_with_hyp = svc_with_hyp.predict(X_test)

accuracy_score_with_hyp = accuracy_score(Y_test, y_pred_with_hyp)

print(f"Accuracy of the SVM with hyperparameters tuning: {accuracy_score_with_hyp * 100:2f}")

Accuracy of the SVM with hyperparameters tuning: 77.285319


10-fold cross validation mean accuracy of SVM with hyperparameter tuning

In [72]:
cv_scores_with_hyp = cross_val_score(svc_with_hyp, X_vals, Y_vals, cv = 10)
print(f"10-fold cross validation mean accuracy score with hyperparameter tuning: {cv_scores_with_hyp.mean()*100:2f}")


10-fold cross validation mean accuracy score with hyperparameter tuning: 75.187328


Select features and split based on the selected features using SelectKBest

In [73]:
selector = SelectKBest(f_classif, k=100)
X_selected = selector.fit_transform(X_vals, Y_vals)

X_train, X_test, Y_train, Y_test = train_test_split(X_selected, Y_vals, test_size=0.3, random_state=42)

SVM training and predicting with feature selection + hyperparameter tuning

In [74]:
svc_with_hyp.fit(X_train, Y_train)
y_pred_hyp_selected = svc_with_hyp.predict(X_test)
accuracy_score_with_hyp_selected = accuracy_score(Y_test, y_pred_hyp_selected)
print(f"Accuracy with hyperparameter + selected feature:  {accuracy_score_with_hyp_selected * 100:2f}")

Accuracy with hyperparameter + selected feature:  77.285319


10-fold cross validation mean accuracy of SVM with hyperparameter tuning + features selection

In [75]:
cv_scores_with_hyp_selected = cross_val_score(svc_with_hyp, X_selected, Y_vals, cv = 10)
print(f"10-fold cross validation accuracy with hyperparameter + selected feature:  {cv_scores_with_hyp_selected.mean() * 100:2f}")


10-fold cross validation accuracy with hyperparameter + selected feature:  75.187328


Perform PCA to reduce dimensionality

In [76]:
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_vals)

X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y_vals, test_size=0.2, random_state=1)

SVM training and predicting with PCA + hyperparameter tuning

In [77]:
svc_with_hyp.fit(X_train, Y_train)
y_pred_hyp_pca = svc_with_hyp.predict(X_test)
accuracy_score_with_hyp_pca = accuracy_score(Y_test, y_pred_hyp_pca)
print(f"Accuracy of the model after PCA: {accuracy_score_with_hyp_pca*100:2f}")

Accuracy of the model after PCA: 79.253112


10-fold cross validation mean accuracy of SVM with hyperparameter tuning + PCA

In [78]:
cv_score_with_hyp_pca = cross_val_score(svc_with_hyp, X_pca, Y_vals, cv = 10)
print(f"10-fold cross validation accuracy after PCA: {cv_score_with_hyp_pca.mean() *100:2f}")

10-fold cross validation accuracy after PCA: 75.187328


# **Model selection**

* SGD
* RandomForest
* MLPClassifier

In [79]:
X_data = all_data_df.drop(['class', 'Minute'], axis=1)
Y_data = all_data_df['class']

X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.3, random_state=42)

SGD implementation

In [80]:
sgd = SGDClassifier(random_state=42)
sgd.fit(X_train, Y_train)
y_pred_sgd = sgd.predict(X_test)
accuracy_score_sgd = accuracy_score(Y_test, y_pred_sgd)
print(f"Accuracy of the SGD model: {accuracy_score_sgd * 100:2f}")

Accuracy of the SGD model: 82.825485


In [81]:
cv_score_sgd = cross_val_score(sgd, X_data, Y_data, cv=10)
cv_score_sgd_mean = cv_score_sgd.mean()
print(f"10-fold cross validation accuracy of the SGD model: {cv_score_sgd_mean * 100:2f}")

10-fold cross validation accuracy of the SGD model: 72.770661


RandomForest implementation

In [82]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, Y_train)
y_pred_rf = rf.predict(X_test)
accuracy_score_rf = accuracy_score(Y_test, y_pred_rf)
print(f"Accuracy of the Random Forest model: {accuracy_score_rf * 100:2f}")

Accuracy of the Random Forest model: 86.703601


In [83]:
cv_score_rf = cross_val_score(rf, X_data, Y_data, cv=10)
cv_score_rf_mean = cv_score_rf.mean()
print(f"10-fold cross validation accuracy of the Random Forest model: {cv_score_rf_mean * 100:2f}")

10-fold cross validation accuracy of the Random Forest model: 84.517218


MLPClassifier Implemetation

In [84]:
mlp = MLPClassifier(random_state=42)
mlp.fit(X_train, Y_train)
y_pred_mlp = mlp.predict(X_test)
accuracy_score_mlp = accuracy_score(Y_test, y_pred_mlp)
print(f"Accuracy of the MLP model: {accuracy_score_mlp * 100:2f}")

Accuracy of the MLP model: 81.440443


In [85]:
cv_score_mlp = cross_val_score(mlp, X_data, Y_data, cv=10)
cv_score_mlp_mean = cv_score_mlp.mean()
print(f"10-fold cross validation accuracy of the MLP model: {cv_score_mlp_mean * 100:2f}")

10-fold cross validation accuracy of the MLP model: 76.604683
