<a href="https://colab.research.google.com/github/KA1exe1AK/AI_course/blob/main/%D0%9B%D0%B0%D0%B1%D0%B03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [76]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Name of the column the models will learn to predict.
target = 'math score'

# Load the exam results from the CSV file next to the notebook.
data = pd.read_csv("exams.csv")

# Separate the target column from the remaining feature columns.
y = data[target]
X = data.drop(columns=[target])
data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,male,group A,high school,standard,completed,67,67,63
1,female,group D,some high school,free/reduced,none,40,59,55
2,male,group E,some college,free/reduced,none,59,60,50
3,male,group B,high school,standard,none,77,78,68
4,male,group E,associate's degree,standard,completed,78,73,68
...,...,...,...,...,...,...,...,...
995,male,group C,high school,standard,none,73,70,65
996,male,group D,associate's degree,free/reduced,completed,85,91,92
997,female,group C,some high school,free/reduced,none,32,35,41
998,female,group C,some college,standard,none,73,74,82


In [77]:
print(f"Тип данных целевой переменной: {y.dtype}")
# The target is numeric (see the printed dtype), so this is treated as a regression task.

Тип данных целевой переменной: int64


## Разделение на тренировочную и тестовую выборки

In [78]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.
# A fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Проверка и обработка пропусков в данных

In [79]:
# Record which feature columns are categorical (object dtype) and which are
# numeric (int64/float64); later cells scale and encode them separately.
categorical_features = X_train.select_dtypes(include=['object']).columns
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns

# Report the number of missing values per feature column, then in the target.
print("Пропуски:", X.isna().sum(), y.isna().sum(), sep="\n")


Пропуски:
gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
reading score                  0
writing score                  0
dtype: int64
0


## Нормализация численных переменных

In [80]:
# Standardise the numeric columns. The scaler's statistics are learned from the
# training split only and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train[numeric_features])
X_train[numeric_features] = scaler.transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])


## Кодирование категориальных признаков

In [81]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode every categorical column.
# One encoder per column is fitted on the training split and kept in
# `label_encoders` so the same mapping can be reused later.
# NOTE(review): LabelEncoder assigns arbitrary integer order to nominal
# categories; scikit-learn documents it as a target encoder — consider
# OrdinalEncoder/OneHotEncoder for features. Behaviour is left unchanged here.
label_encoders = {
    col: LabelEncoder().fit(X_train[col]) for col in categorical_features
}
for col, encoder in label_encoders.items():
    X_train[col] = encoder.transform(X_train[col])
    X_test[col] = encoder.transform(X_test[col])

# Show the first rows of both splits to check the result.
print(X_train.head(), X_test.head(), sep="\n")


     gender  race/ethnicity  parental level of education  lunch  \
29        0               0                            5      0   
535       0               3                            1      0   
695       1               1                            5      1   
557       1               2                            4      0   
836       0               2                            5      0   

     test preparation course  reading score  writing score  
29                         1      -1.028768      -1.081265  
535                        1       1.233956       1.447820  
695                        0      -0.411661      -0.367933  
557                        1      -0.068825      -0.367933  
836                        0       0.068310      -0.173388  
     gender  race/ethnicity  parental level of education  lunch  \
521       0               2                            1      1   
737       1               3                            0      1   
740       1               1   

# Обучение моделей


## Линейная регрессия

In [82]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Fit a linear regression on the training split (fit() returns the estimator),
# then predict the held-out test split.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)


## Дерево решений

In [83]:
from sklearn.tree import DecisionTreeRegressor

# Train a single regression tree limited to depth 5 and predict the test split.
# random_state is fixed (same seed as the random forest below) so that ties
# between equally good splits are broken identically on every run.
dt = DecisionTreeRegressor(max_depth=5, random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)


## K-ближайших соседей


In [84]:
from sklearn.neighbors import KNeighborsRegressor

# Fit a k-nearest-neighbours regressor with k = 5 and predict the test split.
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)


## Случайный лес


In [86]:
from sklearn.ensemble import RandomForestRegressor

# Fit a seeded random forest of 100 trees and predict the test split.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)


# Оценка моделей

In [88]:
# Model evaluation helper
def evaluate_model(y_true, y_pred, model_name, best_params=None):
    """Print MAE, RMSE and MAPE for one model's predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label used as the prefix of the printed line.
        best_params: Optional hyperparameters to print on a second line;
            nothing extra is printed when it is ``None`` or empty.

    Returns:
        None. The metrics are only printed.
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    # NOTE: MAPE divides by y_true, so a zero target value makes it infinite.
    mape = (abs((y_true - y_pred) / y_true)).mean() * 100
    print(f"{model_name} - MAE: {mae:.2f}, RMSE: {rmse:.2f}, MAPE: {mape:.2f}%")

    if best_params:
        print(f"Лучшие параметры для {model_name}: {best_params}")

# Predictions on the training split. They are computed here from the fitted
# models so this cell does not depend on variables from unsaved cells.
y_train_pred_lr = lr.predict(X_train)
y_train_pred_dt = dt.predict(X_train)
y_train_pred_knn = knn.predict(X_train)
y_train_pred_rf = rf.predict(X_train)

# Metrics on the training split
print("Тренировочные данные:")
evaluate_model(y_train, y_train_pred_lr, "Линейная регрессия")
evaluate_model(y_train, y_train_pred_dt, "Дерево решений")
evaluate_model(y_train, y_train_pred_knn, "KNN")
evaluate_model(y_train, y_train_pred_rf, "Случайный лес")

# Metrics on the test split
print("\nТестовые данные:")
evaluate_model(y_test, y_pred_lr, "Линейная регрессия")
evaluate_model(y_test, y_pred_dt, "Дерево решений")
evaluate_model(y_test, y_pred_knn, "KNN")
evaluate_model(y_test, y_pred_rf, "Случайный лес")


Тренировочные данные:
Линейная регрессия - MAE: 4.64, RMSE: 5.79, MAPE: 7.69%
Дерево решений - MAE: 4.76, RMSE: 5.94, MAPE: 7.73%
KNN - MAE: 4.73, RMSE: 5.92, MAPE: 8.05%
Случайный лес - MAE: 1.93, RMSE: 2.44, MAPE: 3.23%

Тестовые данные:
Линейная регрессия - MAE: 4.66, RMSE: 5.66, MAPE: 7.32%
Дерево решений - MAE: 5.49, RMSE: 7.10, MAPE: 8.93%
KNN - MAE: 5.41, RMSE: 6.62, MAPE: 8.68%
Случайный лес - MAE: 5.02, RMSE: 6.12, MAPE: 8.04%


#### Линейная регрессия показала лучший результат на тестовой выборке (MAE 4.66)
#### Случайный лес переобучен: MAE 1.93 на тренировочной выборке против 5.02 на тестовой

# Grid Search и кросс-валидация

In [92]:
from sklearn.preprocessing import StandardScaler

# The numeric features were already standardised in the normalisation cell
# above. Refitting a scaler on the scaled values would leave X_train/X_test
# numerically unchanged but would overwrite `scaler` with one whose statistics
# describe already-scaled data, so the step is not repeated. Only the
# precondition is checked here.
assert X_train[numeric_features].mean().abs().lt(1e-8).all(), \
    "numeric features must be standardised before running the grid search"

from sklearn.model_selection import GridSearchCV
import numpy as np

# Helper that reports regression metrics and, optionally, tuned hyperparameters
def evaluate_model(y_true, y_pred, model_name, best_params=None):
    """Print MAE, RMSE and MAPE for a set of predictions.

    When ``best_params`` is given and non-empty, it is printed on a second
    line. The function returns ``None``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    relative_error = (y_true - y_pred) / y_true
    mape = abs(relative_error).mean() * 100
    print(f"{model_name} - MAE: {mae:.2f}, RMSE: {rmse:.2f}, MAPE: {mape:.2f}%")

    # Nothing more to report when no hyperparameters were supplied.
    if not best_params:
        return
    print(f"Лучшие параметры для {model_name}: {best_params}")

# Tune the decision tree with 5-fold cross-validated grid search (selected by
# negative MSE), then evaluate the refitted best estimator on the test split.
param_grid_dt = {
    'max_depth': [3, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# random_state is fixed so that the selected parameters and the reported
# metrics are reproducible between runs.
grid_dt = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid_dt,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_dt.fit(X_train, y_train)
best_dt = grid_dt.best_estimator_
y_pred_dt = best_dt.predict(X_test)
evaluate_model(y_test, y_pred_dt, "Дерево решений", best_params=grid_dt.best_params_)

# Tune KNN over the neighbour count and the weighting scheme with 5-fold
# cross-validation, then score the best estimator on the test split.
param_grid_knn = dict(
    n_neighbors=[3, 5, 7, 10],
    weights=['uniform', 'distance'],
)
grid_knn = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param_grid_knn,
    cv=5,
    scoring='neg_mean_squared_error',
).fit(X_train, y_train)
best_knn = grid_knn.best_estimator_
y_pred_knn = best_knn.predict(X_test)
evaluate_model(y_test, y_pred_knn, "KNN", best_params=grid_knn.best_params_)

# Tune the random forest with 5-fold cross-validated grid search (selected by
# negative MSE), then evaluate the refitted best estimator on the test split.
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}
# The forest is seeded (same seed as the baseline forest) so the chosen
# parameters and the reported metrics are reproducible. The grid has 162
# combinations x 5 folds, so the fits are spread over all available cores.
grid_rf = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid_rf,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)
evaluate_model(y_test, y_pred_rf, "Случайный лес", best_params=grid_rf.best_params_)


Дерево решений - MAE: 5.24, RMSE: 6.69, MAPE: 8.30%
Лучшие параметры для Дерево решений: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}
KNN - MAE: 5.19, RMSE: 6.45, MAPE: 8.37%
Лучшие параметры для KNN: {'n_neighbors': 10, 'weights': 'uniform'}
Случайный лес - MAE: 4.77, RMSE: 5.92, MAPE: 7.66%
Лучшие параметры для Случайный лес: {'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
