# PathPilot ML training
##### Author: [Joseph Selva Raj]

In [17]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from micromlgen import port

### Loading the data

Load the LIDAR measurement data from the text file.

In [18]:
# Location of the recorded LIDAR log. The file has no header row, so pandas
# assigns integer column names.
# NOTE(review): this is an absolute, machine-specific path - consider making it
# relative to the repository before sharing the notebook.
lidar_log_path = 'C:\\Users\\josep\\Documents\\Github Repo\\PathPilot\\PathPilot\\DATARIGHT1.TXT'
data = pd.read_csv(lidar_log_path, header=None)

# Preview the first rows to confirm the file was parsed as expected.
preview = data.head()
print(preview)

   0    1    2    3    4    5    6    7    8    9    ...  231  232  233  234  \
0    0    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   
1    0    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   
2    0    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   
3    0    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   
4    0    0    0    0    0    0    0    0    0    0  ...    0    0    0    0   

   235  236  237  238  239  240  
0    0    0    0    0    0    D  
1    0    0    0    0    0    D  
2    0    0    0    0    0    D  
3    0    0    0    0    0    D  
4    0    0    0    0    0    D  

[5 rows x 241 columns]


### Data cleaning

Rename the last column to "Label" and clean the data by removing every row that is not annotated with one of the "Forward" command labels.

The processed data should only contain the LIDAR measurements and the corresponding command labels:
- F - forward
- R - forward right
- L - forward left

In [19]:
# Give the trailing column an explicit name so the command annotation can be
# addressed by label instead of by position.
data = data.rename(columns={data.columns[-1]: 'Label'})
print(f"Label counts before cleaning the data: \n {data['Label'].value_counts()}")

# Keep only the three forward-driving commands (F, R, L) using an allow-list.
# The earlier deny-list of `!=` comparisons let through any label it did not
# enumerate - including missing values, since NaN != 'x' evaluates to True -
# so unexpected annotations would have reached the label encoder.
forward_labels = ['F', 'R', 'L']
data = data[data['Label'].isin(forward_labels)].reset_index(drop=True)
print(f"Label counts after cleaning the data: \n {data['Label'].value_counts()}")

Label counts before cleaning the data: 
 Label
F    2401
R    2023
L     567
s     313
D     107
b      10
l       1
Name: count, dtype: int64
Label counts after cleaning the data: 
 Label
F    2401
R    2023
L     567
Name: count, dtype: int64


### Split data into train and test sets
Separate X and Y as the input and output data and divide them into train and test sets with train_test_split. 
Label encoder is used to convert the labels from character to number format to interface with the classifier.

In [20]:
# Every column except the last is an input feature; the last column is the
# command label.
last_column = data.shape[1] - 1
X = data.iloc[:, :last_column]
y = data.iloc[:, last_column]

# Convert the character labels to integer codes for the classifier.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y = label_encoder.transform(y)

# Report which integer each command was mapped to.
class_labels = label_encoder.classes_
class_codes = label_encoder.transform(class_labels)
label_mapping = {label: code for label, code in zip(class_labels, class_codes)}
print(f"Label encoding mapping for motor control in Arduino code: {label_mapping}")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Label encoding mapping for motor control in Arduino code: {'F': 0, 'L': 1, 'R': 2}


### Data selection

Data (feature) selection is performed after splitting into train and test sets; doing it before the split would leak information from the test set into the selection. The selection is performed with SelectKBest from the sklearn package, and the number of selected features is defined as K. The selected features are the LIDAR measurement angles that are most important for the classification, and the same angles have to be used in the Arduino code for the real-time classification.

In [21]:
# Number of feature columns to keep.
k = 80

# Score the features on the training split only and keep the k best.
k_best = SelectKBest(score_func=f_classif, k=k).fit(X_train, y_train)

# Positional indices of the retained columns, and their column names.
selected_feature_indices = k_best.get_support(indices=True)
selected_angles = X.columns[selected_feature_indices]
print("Format of selected features (Angles) to be copied into Arduino code:\n", selected_angles)

Format of selected features (Angles) to be copied into Arduino code:
 Index([ 10,  14,  30,  32,  34,  35,  36,  38,  60,  62,  64,  66,  68,  69,
        70,  71,  72,  73,  74,  75,  76,  77,  78,  79, 116, 120, 122, 128,
       134, 144, 146, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
       159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172,
       173, 174, 176, 178, 180, 182, 184, 186, 198, 200, 201, 202, 203, 204,
       205, 206, 207, 208, 209, 210, 211, 212, 213, 214],
      dtype='object')


### Training the model
Training the model is a straightforward process, thanks to all the libraries available in Python. The outcome of the training process depends on the dataset and the preceding steps. Post-training, accuracy will be computed using the test set, and a higher accuracy is desirable.

In [22]:
# Restrict both splits to the columns chosen by the feature selector.
X_train_selected = X_train.iloc[:, selected_feature_indices]
X_test_selected = X_test.iloc[:, selected_feature_indices]

# Fit a random forest with a depth limit of 3 and a fixed seed.
clf = RandomForestClassifier(max_depth=3, random_state=42)
clf.fit(X_train_selected, y_train)

# Predict on the held-out split and report the overall accuracy.
y_pred = clf.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Per-class precision/recall, labelled with the original command characters.
class_names = label_encoder.classes_
report = classification_report(
    y_test,
    y_pred,
    target_names=class_names,
    zero_division=0,
)
print('Classification Report:\n', report)

Accuracy: 0.7787787787787788
Classification Report:
               precision    recall  f1-score   support

           F       0.78      0.77      0.78       496
           L       0.87      0.52      0.65       100
           R       0.77      0.85      0.80       403

    accuracy                           0.78       999
   macro avg       0.80      0.71      0.74       999
weighted avg       0.78      0.78      0.78       999



### Exporting the Classifier

Export the trained classifier to a C code header file via the micromlgen library, so that it can be used in the Arduino code for real-time classification.


In [23]:
# Generate the C source first, so a failure inside port() cannot leave behind a
# freshly truncated, empty header file.
classifier_source = port(clf)

# The context manager closes the file even if the write raises; the original
# open()/close() pair leaked the handle on error.
with open("randomForest.h", mode="w") as arduino_code:
    arduino_code.write(classifier_source)

# Repeat the selected column names next to the export step, since the message
# says they are to be imported into the Arduino code.
print("Import selected features to Arduino code:\n", X.columns[selected_feature_indices])

Import selected features to Arduino code:
 Index([ 10,  14,  30,  32,  34,  35,  36,  38,  60,  62,  64,  66,  68,  69,
        70,  71,  72,  73,  74,  75,  76,  77,  78,  79, 116, 120, 122, 128,
       134, 144, 146, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
       159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172,
       173, 174, 176, 178, 180, 182, 184, 186, 198, 200, 201, 202, 203, 204,
       205, 206, 207, 208, 209, 210, 211, 212, 213, 214],
      dtype='object')
