# PathPilot ML training
##### Author: [Joseph Selva Raj]

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from micromlgen import port

### Loading the data

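Load the LIDAR measurement data from the text file. Each row holds 240 LIDAR readings (columns 0–239) followed by the command label in the last column.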

In [2]:
# Location of the raw LIDAR recording. The default is the author's local copy;
# set the PATHPILOT_DATA environment variable to point at the file on another
# machine instead of editing this cell.
DEFAULT_DATA_PATH = 'C:\\Users\\josep\\Documents\\Github Repo\\PathPilot\\PathPilot\\MASTERDATA240CIRCLE.TXT'
DATA_PATH = os.environ.get('PATHPILOT_DATA', DEFAULT_DATA_PATH)

# header=None: the file is read without a header row, so pandas numbers the columns.
data = pd.read_csv(DATA_PATH, header=None)
print(data.head())

   0    1    2    3    4    5    6    7    8    9    ...  231  232  233  234  \
0  221  224  225  227  229  230  232  234  236  239  ...  211  212  212  212   
1  218  221  222  222  225  227  229  231  233  235  ...  211  211  216  217   
2  212  218  215  215  217  218  220  222  224  224  ...  202  202  202  203   
3  193  196  197  199  200  202  204  205  207  209  ...  183  183  184  184   
4  185  187  188  190  191  193  195  197  197  201  ...  202  195  192  189   

   235  236  237  238  239  240  
0  212  213  214  214  215    F  
1  210  211  211  212  213    F  
2  203  204  204  204  205    F  
3  185  185  186  187  187    F  
4  187  184  182  179  177    R  

[5 rows x 241 columns]


### Data cleaning

Rename the last column to "Label" and clean the data by removing every row that is not annotated with one of the forward command labels.

The processed data should only contain the LIDAR measurements and the corresponding command labels:
- F - forward
- R - forward right
- L - forward left

In [3]:
# Command labels kept for training: F = forward, R = forward right, L = forward left.
FORWARD_LABELS = ['F', 'R', 'L']

# Give the last column an explicit name so it can be addressed as data['Label'].
data = data.rename(columns={data.columns[-1]: 'Label'})
print(f"Label counts before cleaning the data: \n {data['Label'].value_counts()}")

# Keep rows by whitelist rather than listing every label to drop: any label
# outside FORWARD_LABELS (including unexpected or missing ones) is removed,
# so nothing unlisted can slip through into training.
data = data[data['Label'].isin(FORWARD_LABELS)].reset_index(drop=True)
print(f"Label counts after cleaning the data: \n {data['Label'].value_counts()}")

Label counts before cleaning the data: 
 Label
F    17384
R     7946
L     4937
s      425
D      212
r       86
l       71
b       20
H        2
Name: count, dtype: int64
Label counts after cleaning the data: 
 Label
F    17384
R     7946
L     4937
Name: count, dtype: int64


### Split data into train and test sets
Separate X and Y as the input and output data and divide them into train and test sets with train_test_split. 
Label encoder is used to convert the labels from character to number format to interface with the classifier.

In [4]:
# Inputs: every column except the last one. Output: the last column (command label).
X = data.iloc[:, :-1]

# Convert the character labels into integer class ids for the classifier.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data.iloc[:, -1])

# Print the character -> integer mapping so the Arduino side can decode predictions.
known_labels = label_encoder.classes_
label_mapping = dict(zip(known_labels, label_encoder.transform(known_labels)))
print(f"Label encoding mapping for motor control in Arduino code: {label_mapping}")

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Label encoding mapping for motor control in Arduino code: {'F': 0, 'L': 1, 'R': 2}


### Data selection

Feature selection is performed after the split into train and test sets and is fitted on the training set only; otherwise information from the test set would leak into the choice of features. It is performed with SelectKBest from the scikit-learn package. The number of selected features is defined as K. The selected features are the LIDAR measurement angles that are most important for the classification, and the same angles have to be used in the Arduino code for the real-time classification.

In [5]:
# Number of LIDAR angles to keep.
k = 80

# Fit the selector on the training split only, scoring features with f_classif.
k_best = SelectKBest(score_func=f_classif, k=k).fit(X_train, y_train)

# Positional indices of the kept columns; reused below to slice X_train / X_test.
selected_feature_indices = k_best.get_support(indices=True)
selected_angles = X.columns[selected_feature_indices]
print("Format of selected features (Angles) to be copied into Arduino code:\n", selected_angles)

Format of selected features (Angles) to be copied into Arduino code:
 Index([ 66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
       135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
       149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162,
       163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176,
       177, 178, 179, 180, 181, 182, 184, 186, 200, 202, 204, 205, 206, 207,
       208, 209, 210, 211, 212, 213, 214, 215, 216, 218],
      dtype='object')


### Training the model
Training the model is a straightforward process, thanks to all the libraries available in Python. The outcome of the training process depends on the dataset and the preceding steps. Post-training, accuracy will be computed using the test set, and a higher accuracy is desirable.

In [16]:
# Restrict both splits to the columns picked by the feature selection step.
X_train_selected = X_train.iloc[:, selected_feature_indices]
X_test_selected = X_test.iloc[:, selected_feature_indices]

# Depth-limited random forest with a fixed seed so training is repeatable.
clf = RandomForestClassifier(max_depth=6, random_state=42)
clf.fit(X_train_selected, y_train)

# Evaluate on the held-out test split.
y_pred = clf.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Per-class precision/recall, reported under the original character labels.
class_names = label_encoder.classes_
report = classification_report(
    y_test,
    y_pred,
    target_names=class_names,
    zero_division=0,
)
print('Classification Report:\n', report)

Accuracy: 76.79%
Classification Report:
               precision    recall  f1-score   support

           F       0.76      0.86      0.81      3425
           L       0.73      0.54      0.62      1024
           R       0.81      0.72      0.76      1605

    accuracy                           0.77      6054
   macro avg       0.77      0.71      0.73      6054
weighted avg       0.77      0.77      0.76      6054



### Exporting the Classifier

Export the trained classifier to a C code header file via the micromlgen library, so that it can be used in the Arduino code for real-time classification.
**TODO:** Describe the relationship between the sketch size and the accuracy. The Arduino IDE shows the sketch size.

In [17]:
# Generate the C source before touching the file, so a failure inside port()
# cannot leave an empty or truncated randomForest.h behind.
header_source = port(clf)

# The context manager closes the file even if the write fails. Write-only mode
# is enough here, and the explicit encoding keeps the output independent of the
# platform default.
with open("randomForest.h", mode="w", encoding="utf-8") as arduino_code:
    arduino_code.write(header_source)

print("Import selected features to Arduino code:\n", X.columns[selected_feature_indices])

Import selected features to Arduino code:
 Index([ 66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
       135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
       149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162,
       163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176,
       177, 178, 179, 180, 181, 182, 184, 186, 200, 202, 204, 205, 206, 207,
       208, 209, 210, 211, 212, 213, 214, 215, 216, 218],
      dtype='object')
