<a href="https://colab.research.google.com/github/KA1exe1AK/AI_course/blob/main/%D0%9B%D0%B0%D0%B1%D0%B03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the exam results and name the column the models will predict.
data = pd.read_csv("exams.csv")
target = 'math score'

# Pull out the target first, then keep every remaining column as a feature.
y = data[target]
X = data.drop(columns=target)

# Leave the loaded frame as the cell's displayed value.
data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,male,group A,high school,standard,completed,67,67,63
1,female,group D,some high school,free/reduced,none,40,59,55
2,male,group E,some college,free/reduced,none,59,60,50
3,male,group B,high school,standard,none,77,78,68
4,male,group E,associate's degree,standard,completed,78,73,68
...,...,...,...,...,...,...,...,...
995,male,group C,high school,standard,none,73,70,65
996,male,group D,associate's degree,free/reduced,completed,85,91,92
997,female,group C,some high school,free/reduced,none,32,35,41
998,female,group C,some college,standard,none,73,74,82


In [26]:
# Report the dtype of the target column to decide how it should be modelled.
print(f"Тип данных целевой переменной: {y.dtype}")
# The target is numeric, so the notebook treats this as a regression task.

Тип данных целевой переменной: int64


## Разделение на тренировочную и тестовую выборки

In [9]:
# Keep 80% of the rows for training and hold out 20% for testing;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Проверка и обработка пропусков в данных

In [37]:
# Partition the feature columns by dtype: object columns are categorical,
# int64/float64 columns are numeric.
categorical_features = X_train.select_dtypes(include=['object']).columns
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns

# Report missing-value counts: per feature column first, then for the target.
print("Пропуски:")
for part in (X, y):
    print(part.isna().sum())


Пропуски:
gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
reading score                  0
writing score                  0
dtype: int64
0


## Нормализация численных переменных

In [39]:
# Standardize the numeric columns. The scaler is fitted on the training split
# only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train[numeric_features])
for split in (X_train, X_test):
    split[numeric_features] = scaler.transform(split[numeric_features])


## Кодирование категориальных признаков

In [42]:
from sklearn.preprocessing import LabelEncoder
# Fit one LabelEncoder per categorical column on the training split and keep
# each fitted encoder under its column name.
label_encoders = {col: LabelEncoder().fit(X_train[col]) for col in categorical_features}

# Replace the string categories with the fitted integer codes in both splits.
# NOTE(review): integer codes give these categories an order that the linear
# and KNN models will treat as a magnitude; OneHotEncoder is imported at the
# top of the notebook but unused - confirm label encoding is intended here.
for col, encoder in label_encoders.items():
    X_train[col] = encoder.transform(X_train[col])
    X_test[col] = encoder.transform(X_test[col])

# Print the first rows of each encoded split as a sanity check.
print(X_train.head())
print(X_test.head())


     reading score  writing score  gender_female  gender_male  \
29       -1.028768      -1.081265           True        False   
535       1.233956       1.447820           True        False   
695      -0.411661      -0.367933          False         True   
557      -0.068825      -0.367933          False         True   
836       0.068310      -0.173388           True        False   

     race/ethnicity_group A  race/ethnicity_group B  race/ethnicity_group C  \
29                     True                   False                   False   
535                   False                   False                   False   
695                   False                    True                   False   
557                   False                   False                    True   
836                   False                   False                    True   

     race/ethnicity_group D  race/ethnicity_group E  \
29                    False                   False   
535                    T

# Обучение моделей


## Линейная регрессия

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Fit the linear model on the training split (fit returns the estimator itself).
lr = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows.
y_pred_lr = lr.predict(X_test)


## Дерево решений

In [27]:
from sklearn.tree import DecisionTreeRegressor

# Train a single regression tree capped at depth 5.
# The seed is fixed because the tree permutes features at every split, so
# ties between equally good splits can otherwise resolve differently on each
# run; 42 matches the seed used for the split and the random forest.
dt = DecisionTreeRegressor(max_depth=5, random_state=42)
dt.fit(X_train, y_train)

# Predict the target for the held-out rows.
y_pred_dt = dt.predict(X_test)


## K-ближайших соседей


In [28]:
from sklearn.neighbors import KNeighborsRegressor

# Fit a k-nearest-neighbours regressor with k=5, then predict for the test split.
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)


## Случайный лес


In [29]:
from sklearn.ensemble import RandomForestRegressor

# Fit a seeded forest of 100 trees, then predict for the test split.
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)


# Оценка моделей

In [30]:
def evaluate_model(y_true, y_pred, model_name):
    """Print MAE, RMSE and MAPE for one model's predictions.

    Args:
        y_true: Observed target values (array-like of numbers).
        y_pred: Predicted values, matched to ``y_true`` position by position.
        model_name: Label printed in front of the metrics.

    Returns:
        None. The three metrics are written to stdout on a single line.
    """
    import numpy as np  # local import: numpy is not imported at the top of this notebook

    # Work on plain float arrays so the comparison is positional and does not
    # depend on any pandas index carried by the inputs.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(actual, predicted)
    # Take the square root explicitly: the `squared=False` keyword was
    # deprecated in scikit-learn 1.4 and removed in 1.6, where passing it
    # raises TypeError.
    rmse = mean_squared_error(actual, predicted) ** 0.5

    # A true value of zero makes the percentage error infinite, so MAPE is
    # averaged over the non-zero targets only (NaN when there are none).
    nonzero = actual != 0
    if nonzero.any():
        relative_errors = np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])
        mape = relative_errors.mean() * 100
    else:
        mape = float("nan")

    print(f"{model_name} - MAE: {mae:.2f}, RMSE: {rmse:.2f}, MAPE: {mape:.2f}%")

# Score every fitted model against the same held-out targets.
for model_label, predictions in (
    ("Linear Regression", y_pred_lr),
    ("Decision Tree", y_pred_dt),
    ("KNN", y_pred_knn),
    ("Random Forest", y_pred_rf),
):
    evaluate_model(y_test, predictions, model_label)


Linear Regression - MAE: 4.56, RMSE: 5.50, MAPE: 7.17%
Decision Tree - MAE: 5.43, RMSE: 7.07, MAPE: 8.83%
KNN - MAE: 5.37, RMSE: 6.60, MAPE: 8.70%
Random Forest - MAE: 4.88, RMSE: 5.98, MAPE: 7.81%




# Grid Search и кросс-валидация

In [32]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

# Search space for the decision tree: depth cap and split-quality criterion.
param_grid_dt = dict(
    max_depth=[3, 5, 10, 15],
    criterion=['squared_error', 'absolute_error'],
)

# Search space for KNN: neighbourhood size and neighbour weighting scheme.
param_grid_knn = dict(
    n_neighbors=[3, 5, 10, 15],
    weights=['uniform', 'distance'],
)

# Search space for the random forest: number of trees and depth cap.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
)

# Helper that tunes one estimator with a grid search and hands back the winner.
def grid_search_and_evaluate(model, param_grid, X_train, y_train):
    """Grid-search ``model`` over ``param_grid`` and return the best estimator.

    The search uses 5-fold cross-validation scored by negative mean absolute
    error and runs on all available cores. The winning parameter combination
    is printed before returning.

    Args:
        model: Unfitted estimator to tune.
        param_grid: Mapping of parameter name to the list of values to try.
        X_train: Training features.
        y_train: Training targets.

    Returns:
        The ``best_estimator_`` of the fitted search.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_absolute_error',
        cv=5,
        n_jobs=-1,
    ).fit(X_train, y_train)
    print(f"Best Parameters: {grid_search.best_params_}")
    return grid_search.best_estimator_

# Tune each model family on the training split.
# The tree and the forest are seeded so the selected parameters and the CV
# scores below are the same on every run; 42 matches the seed used for the
# train/test split. KNN has no random component.
best_dt = grid_search_and_evaluate(DecisionTreeRegressor(random_state=42), param_grid_dt, X_train, y_train)
best_knn = grid_search_and_evaluate(KNeighborsRegressor(), param_grid_knn, X_train, y_train)
best_rf = grid_search_and_evaluate(RandomForestRegressor(random_state=42), param_grid_rf, X_train, y_train)

# Report the 5-fold cross-validated MAE of each tuned model on the training split.
# NOTE(review): this scores the winners on the same training data the grid
# search used to pick them, so the figures are likely optimistic; the held-out
# test split is not used here - confirm that is intended.
for model, name in zip([best_dt, best_knn, best_rf], ['Decision Tree', 'KNN', 'Random Forest']):
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
    # The scorer returns negated MAE, so flip the sign for display.
    print(f"{name} CV MAE: {-scores.mean():.2f}")


Best Parameters: {'criterion': 'squared_error', 'max_depth': 5}
Best Parameters: {'n_neighbors': 15, 'weights': 'uniform'}
Best Parameters: {'max_depth': 5, 'n_estimators': 200}
Decision Tree CV MAE: 5.69
KNN CV MAE: 5.46
Random Forest CV MAE: 5.07
