# Библиотеки


In [None]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

## Загрузка данных


In [None]:
# Load the raw CSV into a DataFrame; the path is relative to the
# working directory the notebook is run from.
df = pd.read_csv(
    filepath_or_buffer="../data/raw/Lifestyle_and_Health_Risk_Prediction_Synthetic_Dataset.csv",
)

## Обработка пропусков

In [None]:
# Numeric features: replace missing values with the column median.
num_cols = ['Age', 'BMI', 'SleepHours', 'StressLevel']
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Categorical features: replace missing values with the most frequent value.
cat_cols = ['Gender', 'Smoking', 'Alcohol', 'PhysicalActivity']
for col in cat_cols:
    # Assign the filled column back rather than calling
    # `df[col].fillna(..., inplace=True)`: the in-place call operates on an
    # intermediate object, is deprecated in pandas 2.x, and leaves `df`
    # unchanged once Copy-on-Write is enabled.
    df[col] = df[col].fillna(df[col].mode()[0])

## Важность признаков

In [None]:
# Plot the random forest's per-feature importance scores as a bar chart.
# NOTE(review): `rf` and `X` are only assigned in later cells of this
# notebook, and `sns` / `plt` are not imported anywhere in the visible file
# (presumably seaborn and matplotlib.pyplot - confirm). Run top to bottom,
# this cell raises NameError; it needs to execute after the model has been
# trained and with those two imports available.
importances = rf.feature_importances_
features = X.columns

# One bar per feature: importance on the x axis, feature name on the y axis.
sns.barplot(x=importances, y=features)
plt.title("Важность признаков")
plt.show()

## Кодирование категориальных признаков


In [None]:
# Encode each categorical column as integer codes.
# A fresh LabelEncoder is fitted per column and stored in `encoders`, keyed by
# column name, so every column's value -> code mapping stays available
# (e.g. for `inverse_transform` or for encoding new data the same way).
# Reusing one encoder across the loop would keep only the last column's mapping.
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

## Масштабирование числовых признаков


In [None]:
# Standardise the numeric columns with a StandardScaler fitted on them.
# NOTE(review): the scaler is fitted on the full DataFrame, i.e. before the
# train/test split performed later in the notebook, so test rows contribute
# to the scaling statistics - consider fitting on the training split only.
scaler = StandardScaler().fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

## Разделение данных


In [None]:
# Target is the 'HealthRisk' column; every remaining column is a feature.
y = df['HealthRisk']
X = df.drop(columns=['HealthRisk'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Модели


In [None]:
# Candidate classifiers: a random forest with 100 trees and a fixed seed,
# and a logistic regression with the library's default settings.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
lr = LogisticRegression()

## Обучение


In [None]:
# Fit both candidate models on the same training split.
for estimator in (lr, rf):
    estimator.fit(X_train, y_train)

## Оценка качества


In [None]:
# Score every fitted model on the held-out test split and print its
# accuracy followed by the per-class classification report.
models = dict(LogReg=lr, RandomForest=rf)
for name in models:
    model = models[name]
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(name)
    print("Accuracy:", acc)
    print(report)

## Сохранение лучшей модели


In [None]:
# Persist the model that actually scored best on the held-out test split,
# rather than a hard-coded choice. Ties go to the first entry in `models`.
best_name = max(
    models,
    key=lambda candidate: accuracy_score(y_test, models[candidate].predict(X_test)),
)

model_path = Path("../models/best_model.pkl")
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(models[best_name], model_path)
print("Saved model:", best_name)