## This is my project in which I compare different machine learning models. The model predicts which class (A, B, C, D, where A is the best) of body performance a person belongs to, based on personal data and exercise performance.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import classification_report, accuracy_score 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder

### Data from kaggle

In [2]:
df = pd.read_csv('bodyPerformance.csv')

In [3]:
df.head()

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
0,27.0,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C
1,25.0,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A
2,31.0,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C
3,32.0,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B
4,28.0,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13393 entries, 0 to 13392
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      13393 non-null  float64
 1   gender                   13393 non-null  object 
 2   height_cm                13393 non-null  float64
 3   weight_kg                13393 non-null  float64
 4   body fat_%               13393 non-null  float64
 5   diastolic                13393 non-null  float64
 6   systolic                 13393 non-null  float64
 7   gripForce                13393 non-null  float64
 8   sit and bend forward_cm  13393 non-null  float64
 9   sit-ups counts           13393 non-null  float64
 10  broad jump_cm            13393 non-null  float64
 11  class                    13393 non-null  object 
dtypes: float64(10), object(2)
memory usage: 1.2+ MB


### data cleaning

In [5]:
df.isna().sum()

age                        0
gender                     0
height_cm                  0
weight_kg                  0
body fat_%                 0
diastolic                  0
systolic                   0
gripForce                  0
sit and bend forward_cm    0
sit-ups counts             0
broad jump_cm              0
class                      0
dtype: int64

In [6]:
df['gender'].describe()

count     13393
unique        2
top           M
freq       8467
Name: gender, dtype: object

In [7]:
df = pd.get_dummies(df, columns=['gender'])

In [8]:
df.head()

Unnamed: 0,age,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class,gender_F,gender_M
0,27.0,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C,False,True
1,25.0,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A,False,True
2,31.0,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C,False,True
3,32.0,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B,False,True
4,28.0,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B,False,True


In [9]:
df['gender_F'] = df['gender_F'].map({True:1, False:0}) 
df['gender_M'] = df['gender_M'].map({True:1, False:0}) 

In [10]:
df.head()

Unnamed: 0,age,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class,gender_F,gender_M
0,27.0,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C,0,1
1,25.0,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A,0,1
2,31.0,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C,0,1
3,32.0,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B,0,1
4,28.0,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B,0,1


### I have divided the data into two sets: training and testing. Also, I scaled the data for better results.

In [11]:
y = df['class']
X = df.drop('class', axis =1)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
X_train.shape[0]

10714

In [15]:
X_test.shape[0]

2679

### Logistic regression

In [16]:
logistic_model = LogisticRegression()

In [17]:
logistic_model.fit(X_train, y_train)

In [18]:
logistic_predicts = logistic_model.predict(X_test)

In [19]:
cr = classification_report(y_test, logistic_predicts) 
accuracy = accuracy_score(y_test, logistic_predicts)

In [20]:
print(cr)
print('accuracy: ',accuracy)

              precision    recall  f1-score   support

           A       0.70      0.71      0.71       685
           B       0.45      0.44      0.45       662
           C       0.52      0.53      0.53       650
           D       0.79      0.78      0.78       682

    accuracy                           0.62      2679
   macro avg       0.62      0.62      0.62      2679
weighted avg       0.62      0.62      0.62      2679

accuracy:  0.6188876446435237


### Bayes classificator

In [21]:
bayes_model = GaussianNB().fit(X_train, y_train)

In [22]:
bayes_predicts = bayes_model.predict(X_test)

In [23]:
report = classification_report(y_test, bayes_predicts)
accuracy = accuracy_score(y_test, bayes_predicts)

In [24]:
print(report)
print('accuracy: ',accuracy)

              precision    recall  f1-score   support

           A       0.59      0.74      0.66       685
           B       0.41      0.30      0.34       662
           C       0.46      0.46      0.46       650
           D       0.68      0.69      0.68       682

    accuracy                           0.55      2679
   macro avg       0.53      0.55      0.54      2679
weighted avg       0.54      0.55      0.54      2679

accuracy:  0.5487122060470325


### KNN Classificator

In [25]:
KNN = KNeighborsClassifier(n_neighbors=8).fit(X_train, y_train)

In [26]:
knn_predicts = KNN.predict(X_test)

In [27]:
report = classification_report(y_test, knn_predicts)
accuracy = accuracy_score(y_test, knn_predicts)

In [28]:
print(report)
print('accuracy: ',accuracy)

              precision    recall  f1-score   support

           A       0.62      0.82      0.71       685
           B       0.43      0.47      0.45       662
           C       0.56      0.51      0.54       650
           D       0.91      0.64      0.75       682

    accuracy                           0.61      2679
   macro avg       0.63      0.61      0.61      2679
weighted avg       0.64      0.61      0.61      2679

accuracy:  0.6114221724524076


### Decision tree

In [29]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [30]:
tree_model = DecisionTreeClassifier(max_depth=9).fit(X_train,y_train)

In [31]:
tree_predicts = tree_model.predict(X_test)

In [32]:
report = classification_report(y_test, tree_predicts)
accuracy = accuracy_score(y_test, tree_predicts)

In [33]:
print(report)
print('accuracy: ',accuracy)

              precision    recall  f1-score   support

           0       0.69      0.82      0.75       685
           1       0.54      0.57      0.56       662
           2       0.66      0.59      0.62       650
           3       0.88      0.76      0.81       682

    accuracy                           0.69      2679
   macro avg       0.69      0.68      0.69      2679
weighted avg       0.69      0.69      0.69      2679

accuracy:  0.68682344158268


### Neural network

In [34]:
model = keras.Sequential(
    [
        keras.Input(shape=(12,)),
        layers.Dense(128, activation="relu"),
        layers.Dense(128, activation="relu"),
        layers.Dense(128, activation="relu"),
        layers.Dense(4, activation="softmax"),
    ]
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               1664      
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 dense_2 (Dense)             (None, 128)               16512     
                                                                 
 dense_3 (Dense)             (None, 4)                 516       
                                                                 
Total params: 35,204
Trainable params: 35,204
Non-trainable params: 0
_________________________________________________________________


In [35]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X_train, y_train, batch_size=128, epochs=50, validation_split=0.1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1be54f42e30>

In [36]:
neural_predicts = model.predict(X_test)



In [37]:
neural_predicts_labels = np.argmax(neural_predicts, axis=1)

In [38]:
report = classification_report(y_test, neural_predicts_labels)
accuracy = accuracy_score(y_test, neural_predicts_labels)

In [39]:
print(report)
print('accuracy: ',accuracy)

              precision    recall  f1-score   support

           0       0.76      0.80      0.78       685
           1       0.60      0.69      0.64       662
           2       0.75      0.63      0.69       650
           3       0.89      0.84      0.86       682

    accuracy                           0.74      2679
   macro avg       0.75      0.74      0.74      2679
weighted avg       0.75      0.74      0.74      2679

accuracy:  0.7413213885778276


### After comparing the results, I have reached the following conclusions:

### Regression: The model performs reasonably well, but it is not the best in any metric. Regularization is an option for improving the result.
### Bayes: Performed the weakest.
### KNN: The model performed very similarly to regression. Increasing the number of neighbors could help achieve a better result, but it may also lead to overfitting.
### Tree: The model performed better than the previous ones, especially in the case of class D.
### Neural Network: This is the best model for this data, though it is comparable to the decision tree.