In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scripts.semifinal_validation import validate_model, end_to_model
from scripts.semifinal_model import Model
# Apply seaborn's default context and palette together with the 'ticks' axes style.
sns.set(style='ticks')
# Install a global filter that hides every warning raised from here on.
warnings.filterwarnings(action='ignore')

In [2]:
# Read the three Excel workbooks shipped with the task, in a fixed order:
# the training table, the test table and the description workbook.
# NOTE(review): `descr` is not referenced by any cell visible here.
train, test, descr = (
    pd.read_excel(workbook)
    for workbook in (
        'semifinal_data/train.xlsx',
        'semifinal_data/test.xlsx',
        'semifinal_data/description.xlsx',
    )
)

# Основная задача #

Подбор гиперпараметров:

In [3]:
# Hyperparameters grouped per base learner; the whole mapping is handed to the
# project helpers as-is.
# NOTE(review): no random seed is supplied here — unless the project helpers fix
# one internally, the reported scores may differ between runs. TODO confirm.
# NOTE(review): 'silent' is an older XGBoost flag; verify it is still accepted by
# the installed version.
model_params = dict(
    xgb_params=dict(silent=True, n_jobs=-1, n_estimators=100, max_depth=10),
    knn_params=dict(n_neighbors=10),
    forest_params=dict(
        n_estimators=400,
        n_jobs=-1,
        max_depth=10,
        min_samples_leaf=3,
        max_features='sqrt',
    ),
)
# Third positional argument False: the recorded output shows the main-task
# validation header for this call. The helper prints per-iteration train/test
# MAE for each base model and for the meta model.
validate_model(train, model_params, False)

ВАЛИДАЦИЯ ОСНОВНОЙ ЗАДАЧИ
ITERATION 0
MAE of xgboost on train:  0.6199166687438499
MAE of xgboost on test:  5.072825899319145 

MAE of knn on train:  16.987507555913762
MAE of knn on test:  18.513940370668813 

MAE of linear regression on train:  6.284081468252732
MAE of linear regression on test:  6.127145232053183 

MAE of random_forest on train:  5.881754639784644
MAE of random_forest on test:  7.296118626745542 

MAE of meta model on train:  0.5968154176763203
MAE of meta model on test:  5.0932602140337035 

Mean absolute error on validation:  5.60165967716033 


ITERATION 1
MAE of xgboost on train:  0.5207003941286676
MAE of xgboost on test:  5.337888208922602 

MAE of knn on train:  17.23748992747784
MAE of knn on test:  18.424496373892026 

MAE of linear regression on train:  6.352632784708785
MAE of linear regression on test:  6.216985796528041 

MAE of random_forest on train:  5.847220082410925
MAE of random_forest on test:  7.640813030387894 

MAE of meta model on train:  0.4

Обучение всей модели и предсказание:

In [4]:
# Run the project's end-to-end routine on the full training table and the test
# table (per the section heading: train the whole model and predict), then
# write the returned predictions to CSV with a header row and without the index.
model, answer = end_to_model(train, test, model_params)
answer.to_csv('Pred_main.csv', header=True, index=False)

MAE of xgboost on train:  0.8154835896427219
MAE of knn on train:  15.472150593089221
MAE of linear regression on train:  5.966582909423486
MAE of random_forest on train:  5.8010618400101075
MAE of meta model on train:  0.7859276561087563


# Дополнительная задача №2 #

Сделаем всё то же самое, но используя только ограниченный набор признаков: белки, жиры, углеводы и краткое описание продукта.

In [5]:
# Reduced feature set used for extra task №2.
red_cols = ['Protein_(g)', 'Lipid_Tot_(g)', 'Carbohydrt_(g)', 'Shrt_Desc']
# The training frame keeps 'Energ_Kcal' in addition to the reduced features
# (presumably the prediction target — TODO confirm); the test frame holds the
# features only. Both are independent copies of the selected columns.
train_red = train.loc[:, [*red_cols, 'Energ_Kcal']].copy()
test_red = test.loc[:, red_cols].copy()

# Same hyperparameter values as for the main task, redefined here so this cell
# sets `model_params` on its own.
model_params = dict(
    xgb_params=dict(silent=True, n_jobs=-1, n_estimators=100, max_depth=10),
    knn_params=dict(n_neighbors=10),
    forest_params=dict(
        n_estimators=400,
        n_jobs=-1,
        max_depth=10,
        min_samples_leaf=3,
        max_features='sqrt',
    ),
)
# Third positional argument True: the recorded output shows the extra-task №2
# validation header for this call.
validate_model(train_red, model_params, True)

ВАЛИДАЦИЯ ДОПОЛНИТЕЛЬНОЙ ЗАДАЧИ №2
ITERATION 0
MAE of xgboost on train:  1.4305796193939626
MAE of xgboost on test:  6.9450427248629705 

MAE of knn on train:  10.984525488615756
MAE of knn on test:  11.460032232070912 

MAE of linear regression on train:  7.3924667176974035
MAE of linear regression on test:  7.137301667097775 

MAE of random_forest on train:  6.786235908618437
MAE of random_forest on test:  8.790694182703913 

MAE of meta model on train:  1.3559716261199533
MAE of meta model on test:  7.029481725153055 

Mean absolute error on validation:  7.249910592766098 


ITERATION 1
MAE of xgboost on train:  1.6180280461955416
MAE of xgboost on test:  7.296780720341119 

MAE of knn on train:  10.837651087832393
MAE of knn on test:  12.042304593070105 

MAE of linear regression on train:  7.660273911771995
MAE of linear regression on test:  7.093112404357367 

MAE of random_forest on train:  6.96687114981452
MAE of random_forest on test:  9.159476944566638 

MAE of meta model on 

Обучение модели и предсказание для дополнительной задачи №2:

In [6]:
# Run the end-to-end routine on the reduced-feature frames with use_subset=True,
# then write the returned predictions to CSV with a header row and without the
# index. This rebinds `model` and `answer` from the main-task cell.
model, answer = end_to_model(
    train_red,
    test_red,
    model_params,
    use_subset=True,
)
answer.to_csv('Pred_extra_2.csv', header=True, index=False)

MAE of xgboost on train:  1.6396887190515093
MAE of knn on train:  9.690033522434243
MAE of linear regression on train:  7.3140942741585055
MAE of random_forest on train:  6.986232251202972
MAE of meta model on train:  1.5433626094456354
