In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

In [8]:
# Read the raw fitness dataset from its CSV file (path is relative to the notebook).
df = pd.read_csv('../datasets/fitness_dataset.csv')
# Last expression of the cell: number of missing values in each column.
df.isna().sum()

age                    0
height_cm              0
weight_kg              0
heart_rate             0
blood_pressure         0
sleep_hours          160
nutrition_quality      0
activity_index         0
smokes                 0
gender                 0
is_fit                 0
dtype: int64

In [9]:
# Fill the missing 'sleep_hours' entries with the column median.
# NOTE(review): the median is computed over every row, before any train/test split;
# consider deriving it from the training rows only.
sleep_median = df['sleep_hours'].median()
df['sleep_hours'] = df['sleep_hours'].fillna(sleep_median)
# Re-check the per-column missing-value counts after imputation.
df.isna().sum()

age                  0
height_cm            0
weight_kg            0
heart_rate           0
blood_pressure       0
sleep_hours          0
nutrition_quality    0
activity_index       0
smokes               0
gender               0
is_fit               0
dtype: int64

In [11]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

# Clean the 'smokes' column.
# Encoding this column directly with LabelEncoder produced four classes (0/1/2/3),
# so the values are first cast to normalized text (trimmed, lower-cased) and then
# collapsed onto a single binary flag through an explicit lookup table.
smokes_as_text = df['smokes'].astype(str).str.strip().str.lower()

# Lookup table: textual and numeric spellings map to the same 0/1 flag.
map_smokes = {
    'yes': 1,
    'no': 0,
    '1': 1,
    '0': 0
}

smokes_encoded = smokes_as_text.map(map_smokes)

# Series.map yields NaN for any value absent from the table; stop here with a clear
# message instead of letting NaNs flow silently into scaling and model fitting.
unmapped_values = smokes_as_text[smokes_encoded.isna()].unique()
if len(unmapped_values) > 0:
    raise ValueError(f"Unexpected values in 'smokes': {sorted(unmapped_values)}")

# Every value mapped, so the column can safely be stored as integers.
df['smokes'] = smokes_encoded.astype(int)

# Encode 'gender' as integer labels.
# The fitted encoder stays available as `le` so the label <-> integer mapping can be reused.
le = LabelEncoder().fit(df['gender'])
df['gender'] = le.transform(df['gender'])

In [12]:
# Define the target (y) and the feature matrix (X).
# X is kept as the unscaled feature frame; only the train/test splits are standardized.
y = df['is_fit']
X = df.drop(columns=['is_fit'])

# Split BEFORE scaling so the held-out rows never influence the scaler's statistics
# (fitting the scaler on the full dataset leaks test-set information into training).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply that same transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
# Train a logistic-regression classifier (default hyper-parameters) on the training split.
logi_reg = LogisticRegression().fit(X_train, y_train)

# Predict on the held-out split and compute the evaluation metrics.
pred = logi_reg.predict(X_test)
acc = accuracy_score(y_test, pred)
prec = precision_score(y_test, pred)
f1 = f1_score(y_test, pred)
rec = recall_score(y_test, pred)

# Report each metric rounded to four decimals.
print(f"Accuracy: {acc:.4f}")
print(f"precision: {prec:.4f}")
print(f"f1_score: {f1:.4f}")
print(f"recall: {rec:.4f}")

Accuracy: 0.7625
precision: 0.7431
f1_score: 0.6926
recall: 0.6485


In [14]:
import joblib

# The classifier was trained on standardized, label-encoded features, so it is only
# usable together with the fitted preprocessing objects: persist them alongside it.
joblib.dump({'scaler': scaler, 'gender_encoder': le}, 'preprocessing.pkl')

# Persist the trained classifier (kept as the last expression so the cell still
# displays the path of the model file).
joblib.dump(logi_reg, 'logistic_reg.pkl')

['logistic_reg.pkl']