In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [2]:
# Load the exam results and prepare them in a single pipeline:
#   1. normalise column names (lower case, spaces -> underscores),
#   2. drop the student_id column,
#   3. replace every missing value with 0.
exam_data = (
    pd.read_csv('jamb_exam_results.csv')
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
    .drop(columns=['student_id'])
    .fillna(0)
)

# Separate the target (y) from the remaining feature columns (X)
y_target = exam_data['jamb_score']
X_features = exam_data.drop(columns=['jamb_score'])



In [3]:
# Two-stage split: first hold out 40% of the rows, then cut that hold-out
# in half to obtain the validation and test parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_features, y_target, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)


def _as_records(frame):
    """Return the rows of ``frame`` as a list of ``{column: value}`` dicts."""
    return frame.to_dict(orient='records')


# Fit the DictVectorizer on the training rows only, then reuse it unchanged
# for the validation and test rows.
vectorizer = DictVectorizer(sparse=True)
X_train_transformed = vectorizer.fit_transform(_as_records(X_train))
X_val_transformed = vectorizer.transform(_as_records(X_val))
X_test_transformed = vectorizer.transform(_as_records(X_test))

# A tree limited to depth 1 performs exactly one split
tree_model = DecisionTreeRegressor(max_depth=1, random_state=42)
tree_model.fit(X_train_transformed, y_train)

# Node 0 is the root; map its feature index back to the vectorizer's name
root_feature_index = tree_model.tree_.feature[0]
first_split_feature = vectorizer.feature_names_[root_feature_index]
print(f"Feature used for the first split: {first_split_feature}")

Feature used for the first split: study_hours_per_week


In [4]:
# Baseline random forest with 10 trees
forest_model = RandomForestRegressor(
    n_estimators=10,
    random_state=42,
    n_jobs=-1,
)
forest_model.fit(X_train_transformed, y_train)

# Score the baseline on the validation split (RMSE = sqrt of the MSE)
y_val_predicted = forest_model.predict(X_val_transformed)
mse_val = mean_squared_error(y_val, y_val_predicted)
rmse_val = np.sqrt(mse_val)
print(f"Validation RMSE: {rmse_val}")

Validation RMSE: 42.173933892868


In [5]:
# Search for the best number of trees.
#
# warm_start=True keeps the trees that are already fitted and only grows the
# missing ones whenever n_estimators is raised, so one sweep trains 200 trees
# in total instead of refitting 10 + 20 + ... + 200 = 2100 trees from scratch.
# NOTE(review): with a fixed integer random_state the grown forest is expected
# to match a forest of the same size trained from scratch, so the reported
# RMSE values should be unchanged — confirm on the installed scikit-learn.
rmse_results = {}
forest_model = RandomForestRegressor(random_state=42, n_jobs=-1, warm_start=True)
for n_trees in range(10, 201, 10):
    forest_model.set_params(n_estimators=n_trees)
    forest_model.fit(X_train_transformed, y_train)
    y_val_predicted = forest_model.predict(X_val_transformed)
    rmse_results[n_trees] = np.sqrt(mean_squared_error(y_val, y_val_predicted))

# Report the validation RMSE for every forest size
for n_trees, rmse_score in rmse_results.items():
    print(f"n_estimators: {n_trees}, RMSE: {rmse_score:.3f}")


# Search for the best max_depth: for each candidate depth, average the
# validation RMSE over the same grid of forest sizes.
max_depth_values = [10, 15, 20, 25]
depth_performance = {}

for max_depth in max_depth_values:
    # A fresh warm-start forest per depth; it is grown step by step below
    forest_model = RandomForestRegressor(
        max_depth=max_depth, random_state=42, n_jobs=-1, warm_start=True
    )
    rmse_scores_list = []
    for n_trees in range(10, 201, 10):
        forest_model.set_params(n_estimators=n_trees)
        forest_model.fit(X_train_transformed, y_train)
        y_val_predicted = forest_model.predict(X_val_transformed)
        rmse_scores_list.append(np.sqrt(mean_squared_error(y_val, y_val_predicted)))
    depth_performance[max_depth] = np.mean(rmse_scores_list)

# Pick the depth with the lowest mean RMSE (ties resolve to the first one seen)
best_max_depth = min(depth_performance, key=depth_performance.get)
print(f"Best max_depth: {best_max_depth}")

n_estimators: 10, RMSE: 42.174
n_estimators: 20, RMSE: 41.528
n_estimators: 30, RMSE: 41.551
n_estimators: 40, RMSE: 41.509
n_estimators: 50, RMSE: 41.309
n_estimators: 60, RMSE: 41.225
n_estimators: 70, RMSE: 41.132
n_estimators: 80, RMSE: 41.091
n_estimators: 90, RMSE: 41.010
n_estimators: 100, RMSE: 41.087
n_estimators: 110, RMSE: 41.094
n_estimators: 120, RMSE: 41.047
n_estimators: 130, RMSE: 40.974
n_estimators: 140, RMSE: 40.981
n_estimators: 150, RMSE: 40.961
n_estimators: 160, RMSE: 40.957
n_estimators: 170, RMSE: 40.933
n_estimators: 180, RMSE: 40.927
n_estimators: 190, RMSE: 40.911
n_estimators: 200, RMSE: 40.910
Best max_depth: 10


In [6]:
# Train the final random forest (10 trees, depth capped at 20)
final_forest_model = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=42,
    n_jobs=-1,
)
final_forest_model.fit(X_train_transformed, y_train)

# Look up the feature with the largest importance score; positions in
# feature_importances_ line up with the vectorizer's feature names.
feature_importances = final_forest_model.feature_importances_
top_feature_index = feature_importances.argmax()
most_important_feature = vectorizer.feature_names_[top_feature_index]
print(f"Most important feature: {most_important_feature}")


Most important feature: study_hours_per_week


In [7]:
# Wrap the training and validation matrices in DMatrix objects for XGBoost
dtrain = xgb.DMatrix(X_train_transformed, label=y_train)
dval = xgb.DMatrix(X_val_transformed, label=y_val)

# Evaluation sets whose metrics are reported after every boosting round
watchlist = [(dtrain, 'train'), (dval, 'eval')]

# Parameters shared by both runs; only eta is varied below
xgb_params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=42,
    verbosity=1,
)

# Train once per learning rate: eta = 0.3 first, then eta = 0.1.
# xgb_params is updated in place, so it holds eta = 0.1 after the loop.
boosters_by_eta = {}
for learning_rate in (0.3, 0.1):
    xgb_params['eta'] = learning_rate
    boosters_by_eta[learning_rate] = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=100,
        evals=watchlist,
        early_stopping_rounds=10,
    )

xgb_model_03 = boosters_by_eta[0.3]
xgb_model_01 = boosters_by_eta[0.1]

# Report the best score recorded for each run
print(f"Best validation RMSE with eta=0.3: {xgb_model_03.best_score}")
print(f"Best validation RMSE with eta=0.1: {xgb_model_01.best_score}")

[0]	train-rmse:42.71579	eval-rmse:45.17452
[1]	train-rmse:39.82920	eval-rmse:43.33307
[2]	train-rmse:37.76334	eval-rmse:42.21338
[3]	train-rmse:36.28364	eval-rmse:41.85801
[4]	train-rmse:35.07326	eval-rmse:41.33436
[5]	train-rmse:34.19555	eval-rmse:41.08693
[6]	train-rmse:33.44294	eval-rmse:40.91599
[7]	train-rmse:32.67600	eval-rmse:40.88081
[8]	train-rmse:32.01974	eval-rmse:40.97827
[9]	train-rmse:31.53993	eval-rmse:40.88101
[10]	train-rmse:31.06525	eval-rmse:40.93403
[11]	train-rmse:30.73302	eval-rmse:40.99632
[12]	train-rmse:30.37063	eval-rmse:40.99476
[13]	train-rmse:29.88920	eval-rmse:41.01079
[14]	train-rmse:29.51041	eval-rmse:40.97999
[15]	train-rmse:29.18156	eval-rmse:41.04403
[16]	train-rmse:29.07860	eval-rmse:41.05910
[17]	train-rmse:28.82466	eval-rmse:41.12942
[0]	train-rmse:45.50472	eval-rmse:47.29604
[1]	train-rmse:44.14512	eval-rmse:46.35598
[2]	train-rmse:42.95150	eval-rmse:45.45687
[3]	train-rmse:41.89408	eval-rmse:44.71852
[4]	train-rmse:40.93352	eval-rmse:44.07986
[5]