MACHINE LEARNING FOR VEHICLE MODEL
# Author: [Mustafa Oudah Hani ALSAEDI]

.

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from micromlgen import port

### Loading the data

Next, we load the data from a text file.

The last column of each row is a label letter. There are only five possible letters:
- F - forward
- I - forward right
- R - right
- G - forward left
- L - left

In [19]:
# Load the recorded samples into a DataFrame.
# The path is a raw string so its backslashes are never interpreted as escape
# sequences ("\g", "\M" and "\d" are invalid escapes in a normal string literal
# and trigger a warning on recent Python versions).
# header=None makes pandas treat the first line as data and number the columns itself.
# NOTE(review): this is an absolute, machine-specific path - adjust it to where data1.txt lives.
data = pd.read_csv(r'D:\github\Machine-Learning-Robot\data\data1.txt', header=None)
print(data.head())

   0    1    2    3    4    5    6    7    8    9    ...  231  232  233  234  \
0  235  237  239  241  243  246  248  251  253  256  ...  222  222  223  223   
1  299  308  316  320  327  334  342  350  359  368  ...  248  251  254  257   
2  294  302  308  317  322  329  336  344  353  362  ...  243  246  249  252   
3  218  225  230  234  239  244  244  255  261  267  ...  182  184  187  189   
4  188  194  172  170  172  176  179  183  187  192  ...  182  184  187  167   

   235  236  237  238  239  240  
0  224  225  226  227  228    R  
1  260  264  267  271  275    F  
2  255  258  262  266  270    F  
3  191  194  196  199  202    F  
4  165  167  170  172  174    G  

[5 rows x 241 columns]


### Data cleaning
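We name the last column `Label`, then drop every row labelled `L`, `R`, `H` or `J`, printing the label counts before and after.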

In [20]:
# Give the final column (the class letter) a readable name.
data = data.rename(columns={data.columns[-1]: 'Label'})
print(f"Label counts before cleaning the data: \n {data['Label'].value_counts()}")

# Keep only the rows whose label is not one of the discarded classes,
# then renumber the rows from 0 so the index has no gaps.
discarded_labels = ['L', 'R', 'H', 'J']
keep_mask = ~data['Label'].isin(discarded_labels)
data = data[keep_mask].reset_index(drop=True)
print(f"Label counts after cleaning the data: \n {data['Label'].value_counts()}")

Label counts before cleaning the data: 
 Label
F    1176
G     298
I     291
L      16
R       5
Name: count, dtype: int64
Label counts after cleaning the data: 
 Label
F    1176
G     298
I     291
Name: count, dtype: int64


Now we separate the input data (`X`) from the output labels (`y`) and split them into training and test sets with `train_test_split`. A `LabelEncoder` converts the letters in the label column to numbers so that the classifier can work with them.

In [24]:
# Inputs are every column except the last one; the last column holds the label.
label_encoder = LabelEncoder()
X = data.iloc[:, :-1]
# Encode the label letters as integers; label_encoder keeps the mapping for later reporting.
y = label_encoder.fit_transform(data.iloc[:, -1])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Data selection

.

In [26]:
# Number of top-scoring features to keep.
k = 80

# Score each feature with f_classif, fitting on the training split only.
k_best = SelectKBest(score_func=f_classif, k=k).fit(X_train, y_train)

# Positional indices of the k features that were kept.
selected_feature_indices = k_best.get_support(indices=True)
# Printing the pandas Index shows the values separated by commas,
# which makes them easy to copy and paste into the Arduino IDE.
print("selected features: ", X.columns[selected_feature_indices])

selected_feature_indices:  [135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
 171 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201
 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219
 220 221 222 223 224 225 226 227]
selected features:  Index([135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
       149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162,
       163, 164, 165, 166, 167, 168, 169, 170, 171, 185, 186, 187, 188, 189,
       190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203,
       204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217,
       218, 219, 220, 221, 222, 223, 224, 225, 226, 227],
      dtype='object')


### Training the model
.

In [28]:
# Restrict both splits to the columns chosen during feature selection.
train_features = X_train.iloc[:, selected_feature_indices]
test_features = X_test.iloc[:, selected_feature_indices]

# Shallow random forest with a fixed seed so the run is repeatable.
clf = RandomForestClassifier(max_depth=3, random_state=42)
clf.fit(train_features, y_train)
y_pred = clf.predict(test_features)

# Overall accuracy on the held-out test split.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Per-class metrics, reported with the original label letters instead of the encoded integers.
class_names = label_encoder.classes_
report = classification_report(
    y_test,
    y_pred,
    target_names=class_names,
    zero_division=0,
)
print('Classification Report:\n', report)

Accuracy: 0.7818696883852692
Classification Report:
               precision    recall  f1-score   support

           F       0.79      0.93      0.85       241
           G       0.76      0.55      0.64        58
           I       0.74      0.37      0.49        54

    accuracy                           0.78       353
   macro avg       0.76      0.62      0.66       353
weighted avg       0.78      0.78      0.76       353



### Exporting the Classifier

.

In [30]:
# Generate the ported source text first, so a failure in port() cannot
# truncate a previously exported header file.
ported_source = port(clf)

# The context manager closes the file even if write() raises; plain "w" mode
# is enough because the file is never read back.
with open("randomForest10.h", mode="w") as arduino_code:
    arduino_code.write(ported_source)

# Print the selected feature columns again next to the export step.
print("selected features: ", X.columns[selected_feature_indices])

selected features:  Index([135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
       149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162,
       163, 164, 165, 166, 167, 168, 169, 170, 171, 185, 186, 187, 188, 189,
       190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203,
       204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217,
       218, 219, 220, 221, 222, 223, 224, 225, 226, 227],
      dtype='object')
