# PathPilot ML training
##### Author: [Joseph Selva Raj]

In [6]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from micromlgen import port

### Loading the data

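Load the LIDAR measurement data from the text file. Each row holds 240 measurements (columns 0–239) followed by a command label in the last column.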

In [7]:
# Location of the raw LIDAR log. The original local path is kept as the default;
# set the PATHPILOT_DATA_FILE environment variable to load the file from another
# location without editing the notebook.
DATA_FILE = os.environ.get(
    'PATHPILOT_DATA_FILE',
    'C:\\Users\\josep\\Documents\\Github Repo\\PathPilot\\PathPilot\\all.TXT',
)

# header=None stops pandas from treating the first row as column names, so the
# columns are numbered 0, 1, 2, ... instead.
data = pd.read_csv(DATA_FILE, header=None)
print(data.head())

     0       1       2       3       4       5       6      7       8    \
0  466.0  468.50  471.00  473.75  480.25  483.50  487.25  492.0  496.50   
1  450.0  451.75  456.50  460.25  463.50  467.75  477.50  481.5  486.75   
2  383.0  385.50  391.50  452.25  398.25  402.00  462.25  466.5  447.00   
3  372.5  374.50  380.75  452.25  386.25  389.50  397.25  466.5  406.25   
4  342.5  344.50  348.50  452.25  353.75  356.25  359.00  362.0  365.25   

      9    ...     231     232    233     234     235     236     237     238  \
0  501.75  ...  457.00  457.00  457.0  457.00  457.75  458.50  459.25  462.25   
1  492.00  ...  381.50  387.25  390.5  394.00  397.75  398.75  398.75  442.75   
2  449.50  ...  366.75  367.25  368.0  370.25  397.75  372.75  374.25  378.00   
3  447.25  ...  360.00  360.00  368.0  361.25  362.75  365.00  374.25  368.50   
4  369.25  ...  360.00  334.00  334.0  334.75  335.25  335.75  337.00  339.50   

      239  240  
0  463.50    F  
1  444.75    l  
2  444.75  

### Data cleaning

Rename the last column to "Label" and clean the data by removing every row that is not annotated with one of the forward command labels.

The processed data should only contain the LIDAR measurements and the corresponding command labels:
- F - forward
- R - forward right
- L - forward left

In [8]:
# Give the final column an explicit name so the command label can be referenced by it.
data.rename(columns={data.columns[-1]: 'Label'}, inplace=True)
print(f"Label counts before cleaning the data: \n {data['Label'].value_counts()}")

# Keep only the three forward commands the model is trained on
# (F = forward, L = forward left, R = forward right).
# An allow-list is used instead of enumerating the unwanted labels so that any
# unexpected label -- or a missing one -- is dropped rather than silently
# passed on to the label encoder and classifier.
FORWARD_LABELS = ['F', 'L', 'R']
data = data[data['Label'].isin(FORWARD_LABELS)].reset_index(drop=True)
print(f"Label counts after cleaning the data: \n {data['Label'].value_counts()}")

Label counts before cleaning the data: 
 Label
F    26256
L    12544
R     7411
l      410
r      242
H       19
s        2
Name: count, dtype: int64
Label counts after cleaning the data: 
 Label
F    26256
L    12544
R     7411
Name: count, dtype: int64


### Split data into train and test sets
Separate the data into the inputs `X` and the outputs `y`, then divide them into train and test sets with `train_test_split`. 
A `LabelEncoder` converts the labels from characters to numbers so that they can be passed to the classifier.

In [9]:
# Inputs are every column except the last one; the last column is the label.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Convert the character labels into integer class ids for the classifier.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Record which integer stands for which command label.
label_mapping = {
    label: code
    for label, code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}
print(f"Label encoding mapping for motor control in Arduino code: {label_mapping}")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Label encoding mapping for motor control in Arduino code: {'F': 0, 'L': 1, 'R': 2}


### Training the model
Training the model is straightforward thanks to the libraries available in Python: a `RandomForestClassifier` limited to `max_depth=3` is fitted on the training set. The outcome of the training process depends on the dataset and the preceding steps. After training, accuracy is computed on the test set; a higher accuracy is desirable.

In [10]:
# Fit a shallow random forest (max_depth=3) with a fixed seed on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
clf = RandomForestClassifier(max_depth=3, random_state=42).fit(X_train, y_train)

# Predict the held-out rows and score the predictions against the true labels.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Per-class precision/recall/F1, labelled with the original character labels.
class_names = label_encoder.classes_
report = classification_report(
    y_test,
    y_pred,
    target_names=class_names,
    zero_division=0,
)
print('Classification Report:\n', report)

Accuracy: 0.6668830466298821
Classification Report:
               precision    recall  f1-score   support

           F       0.65      0.90      0.76      5231
           L       0.72      0.43      0.54      2534
           R       0.73      0.24      0.36      1478

    accuracy                           0.67      9243
   macro avg       0.70      0.52      0.55      9243
weighted avg       0.68      0.67      0.63      9243



### Exporting the Classifier

Export the trained classifier to a C code header file via the micromlgen library, so that it can be used in the Arduino code for real-time classification.
