# ML-модель
#### Модель определения болезни по имеющимся симптомам

## Загрузка и маппинг датасета

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('..//data//raw//Disease_symptom_and_patient_profile_dataset.csv')

# Pin explicit dtypes up front so the category lookups below operate on strings.
data = data.astype({
    "Disease": "string",
    "Fever": "string",
    "Cough": "string",
    "Fatigue": "string",
    "Difficulty Breathing": "string",
    "Age": "int64",
    "Gender": "string",
    "Blood Pressure": "string",
    "Cholesterol Level": "string",
    "Outcome Variable": "string"
})

# Integer codes shared by several columns.
yes_no_codes = {'Yes': 1, 'No': 0}
level_codes = {'High': 2, 'Normal': 1, 'Low': 0}

# Column -> {category label: integer code}. 'Disease' stays a string because it
# is the prediction target and is label-encoded later in the notebook.
column_codes = {
    'Gender': {'Male': 1, 'Female': 0},
    # Symptoms
    'Fever': yes_no_codes,
    'Cough': yes_no_codes,
    'Fatigue': yes_no_codes,
    'Difficulty Breathing': yes_no_codes,
    # Measurements
    'Blood Pressure': level_codes,
    'Cholesterol Level': level_codes,
    # Test outcome
    'Outcome Variable': {'Positive': 1, 'Negative': 0},
}

for column, codes in column_codes.items():
    encoded = data[column].map(codes)
    # Labels missing from the mapping come back as NaN; report them by name
    # instead of failing later with an opaque integer-cast error.
    unknown = data.loc[encoded.isna(), column].unique()
    if len(unknown) > 0:
        raise ValueError(
            f"Column {column!r} contains unmapped values: {list(unknown)}"
        )
    data[column] = encoded.astype('int64')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Disease               349 non-null    string
 1   Fever                 349 non-null    int64 
 2   Cough                 349 non-null    int64 
 3   Fatigue               349 non-null    int64 
 4   Difficulty Breathing  349 non-null    int64 
 5   Age                   349 non-null    int64 
 6   Gender                349 non-null    int64 
 7   Blood Pressure        349 non-null    int64 
 8   Cholesterol Level     349 non-null    int64 
 9   Outcome Variable      349 non-null    int64 
dtypes: int64(9), string(1)
memory usage: 27.4 KB


***Дублируем записи редких болезней, добавляя небольшой шум к возрасту***

In [3]:
# Oversample rare diseases: every disease that occurs at most twice gets two
# extra copies of its rows, each copy with its own small jitter on 'Age'.
class_counts = data['Disease'].value_counts()
rare_classes = class_counts[class_counts <= 2].index  # diseases seen once or twice

# Seeded generator so that Restart & Run All reproduces the same augmented set
# (the rest of the notebook already fixes random_state=42).
rng = np.random.default_rng(42)

# NOTE(review): augmentation runs before the train/test split, so jittered
# copies of the same patient row may end up in both train and test, which
# would make test metrics optimistic — confirm whether this is intended.
augmented_parts = [data]
for rare_class in rare_classes:
    rare_rows = data[data['Disease'] == rare_class]
    for copy_idx in range(2):
        noisy_copy = rare_rows.copy()
        # Integer noise in [-2, 2]; drawn separately per copy so the two
        # added copies are not exact duplicates of each other.
        noisy_copy['Age'] += rng.integers(-2, 3, size=len(noisy_copy))
        augmented_parts.append(noisy_copy)

# Concatenate once instead of growing the frame inside the loop.
data_augmented = pd.concat(augmented_parts, ignore_index=True)

***Отделяем целевую переменную (target) от признаков (features)***

In [4]:
# Target: the disease name, converted to integer class ids by the label encoder.
le = LabelEncoder()
y = data_augmented['Disease']
y_encoded = le.fit(y).transform(y)

# Features: every column except the target.
x = data_augmented.drop('Disease', axis=1)

## Разделение датасета на test и train

In [5]:
# Hold out 30% of the rows for testing; stratify on the encoded target so the
# class proportions are kept in both parts, and fix the seed for repeatability.
split_options = dict(test_size=0.3, random_state=42, stratify=y_encoded)
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, **split_options)

***Масштабируем признаки с помощью StandardScaler***

In [6]:
# Learn the scaling statistics from the training features only, then apply the
# same fitted scaler to both the training and the test features.
scaler = StandardScaler().fit(x_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (x_train, x_test)
)

## Кросс-валидация модели RandomForest

In [7]:
# RandomForestClassifier is already imported in the notebook's import cell,
# so it is not re-imported here.

# Random forest with 200 unrestricted-depth trees and a fixed seed, fitted on
# the scaled training features.
rf_model = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# 2-fold cross-validated accuracy on the training split.
# NOTE(review): cv=2 is presumably forced by the rarest classes having very few
# rows in the training split — confirm before raising the number of folds.
rf_cv_scores = cross_val_score(rf_model, X_train_scaled, y_train, cv=2, scoring='accuracy')
print(f"RandomForest средняя точность (cross-validation): {rf_cv_scores.mean():.4f}")

RandomForest средняя точность (cross-validation): 0.5605


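***Оценка на тестовой выборке и матрица ошибок*** (код в ячейке ниже закомментирован: сейчас в нём только accuracy и classification report, матрица ошибок пока не строится)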

In [8]:
# NOTE(review): this evaluation cell is entirely commented out. It refers to
# `best_rf_model`, which is not defined in the visible part of the notebook
# (only `rf_model` is) — presumably it was meant to come from a grid-search
# step; confirm before re-enabling.
# y_pred_rf = best_rf_model.predict(X_test_scaled) 

# # Accuracy evaluation
# acc_rf = accuracy_score(y_test, y_pred_rf)
# print(f"Accuracy на тесте (RandomForest): {acc_rf:.4f}")

# # Detailed report
# print("Classification Report (RandomForest):")
# print(classification_report(y_test, y_pred_rf, target_names=le.classes_))
