# KNN Model for predicting Age with Energy - Vibration

### Import libraries

In [1]:
import os
import glob
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy.io import wavfile
from scipy.fft import fft
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from scipy.signal import find_peaks
import librosa
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score
# import openpyxl

# Main

In [2]:
df_combined=pd.read_csv("../../All_Data_combined.csv")
df_combined.head

<bound method NDFrame.head of                                             Raw Signal  \
0    [[-0.01573961]\n [-0.02470204]\n [-0.0184555 ]...   
1    [[ 0.10185837]\n [ 0.10090781]\n [ 0.08882211]...   
2    [[-0.02701054]\n [-0.01506063]\n [-0.02510942]...   
3    [[-0.05687327]\n [-0.06352719]\n [-0.05049093]...   
4    [[-0.08674804]\n [-0.04994775]\n [-0.05334261]...   
..                                                 ...   
790  [[ 0.02609318]\n [ 0.0115658 ]\n [ 0.01821852]...   
791  [[ 0.00491307]\n [-0.00717045]\n [ 0.00532038]...   
792  [[-0.01273702]\n [ 0.00532038]\n [-0.01409472]...   
793  [[-0.02101899]\n [-0.02590671]\n [-0.0394837 ]...   
794  [[ 0.03478245]\n [ 0.01278773]\n [ 0.02079815]...   

                                              Spectrum  \
0    [ 5415.43531949  3187.67362463  2567.48180365 ...   
1    [ 3574.00394563  2253.1726773   2189.48902092 ...   
2    [ 4336.31750598  2636.18307151  2297.29013228 ...   
3    [7218.04796598 4371.60331682 3234.47

In [10]:
# Determine the minimum number of samples for each age group
min_samples_per_age = df_combined['Age'].value_counts().min()

# Create a new DataFrame with equal samples for each age group
df_sorted_homogeneous = (
    df_combined.groupby('Age')
    .apply(lambda x: x.sample(n=min_samples_per_age, random_state=42))
    .reset_index(drop=True)
)

df_sorted_homogeneous

# Verify the distribution
print(df_sorted_homogeneous['Age'].value_counts())


# Combine features into X
X = df_sorted_homogeneous["Enveloppe"].to_numpy()
X = np.array([np.fromstring(e.strip('[]'), sep=', ') for e in X])
# Normalize X
X = (X - X.mean(axis=0)) / X.std(axis=0)

# Encode string labels into integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_sorted_homogeneous["Age"])

# # print(y)
# print(len(X),len(y))

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X,y)


Age
1    216
2    216
3    216
Name: count, dtype: int64
[[-1.30800198 -1.9030978  -1.06542905 -0.92105464 -0.77740909 -0.73114023]
 [ 0.0692612  -1.14991037  0.63834347 -0.3334012  -0.73530638 -0.57476807]
 [-1.86698891 -1.71681149 -1.44441224 -1.01683296 -0.98704462 -0.80640209]
 ...
 [ 1.49668023 -0.36478088 -0.28196934 -0.91634833  0.59700198 -0.20719339]
 [ 1.08170702 -0.33630147  0.22734256 -0.38854584 -0.665755   -0.57318171]
 [ 1.52014609 -0.19128955 -0.06833439 -0.70212935  0.47504085  0.50947145]] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [11]:
# Définir les bonnes valeurs trouvé
n_neighbors = 10  # Nombre de voisins
weights_options = ['uniform']  # Méthode de pondération
metric_options = ['manhattan']  # Métriques


# Définir les plages de valeurs pour les hyperparamètres
n_neighbors_range = range(1, 21)  # Nombre de voisins
weights_options = ['uniform', 'distance']  # Méthode de pondération
metric_options = ['euclidean', 'manhattan', 'chebyshev', 'minkowski']  # Métriques

# Liste pour stocker les résultats
results = []


# Tester toutes les combinaisons d'hyperparamètres
for n_neighbors in n_neighbors_range:
    for weights in weights_options:
        for metric in metric_options:
            # Créer et entraîner le modèle KNN
            knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, metric=metric)
            knn.fit(X_train, y_train)

            # Évaluer sur l'ensemble de test
            y_pred = knn.predict(X_test)
            accuracy_test = accuracy_score(y_test, y_pred)

            # Évaluer sur l'ensemble d'entraînement
            y_train_pred = knn.predict(X_train)
            accuracy_train = accuracy_score(y_train, y_train_pred)

            # Ajouter les résultats à la liste
            results.append({
                'n_neighbors': n_neighbors,
                'weights': weights,
                'metric': metric,
                'accuracy_train': accuracy_train,
                'accuracy_test': accuracy_test
            })

# Convertir les résultats en DataFrame
results_df = pd.DataFrame(results)



print(results_df)

     n_neighbors   weights     metric  accuracy_train  accuracy_test
0              1   uniform  euclidean        1.000000       0.707692
1              1   uniform  manhattan        1.000000       0.746154
2              1   uniform  chebyshev        1.000000       0.700000
3              1   uniform  minkowski        1.000000       0.707692
4              1  distance  euclidean        1.000000       0.707692
..           ...       ...        ...             ...            ...
155           20   uniform  minkowski        0.687259       0.638462
156           20  distance  euclidean        1.000000       0.684615
157           20  distance  manhattan        1.000000       0.700000
158           20  distance  chebyshev        1.000000       0.646154
159           20  distance  minkowski        1.000000       0.684615

[160 rows x 5 columns]
