Let's load the datasets from the previous parts:

In [78]:
%%time

import pandas as pd
import numpy as np


def _read_indexed(csv_path):
    """Read one of the saved datasets, using the row identifier column as index."""
    return pd.read_csv(csv_path, index_col='ID_LAT_LON_YEAR_WEEK')


# Shared settings written by an earlier notebook; the seed is stored in row 0.
global_variables = pd.read_csv('global_variables.csv', index_col=0)
SEED = global_variables.loc[0, 'SEED']

# Feature tables saved by parts 1 and 2.
train_from_part_1 = _read_indexed('new_datasets/train_from_part_1.csv')
test_from_part_1 = _read_indexed('new_datasets/test_from_part_1.csv')
train_from_part_2 = _read_indexed('new_datasets/train_from_part_2.csv')
test_from_part_2 = _read_indexed('new_datasets/test_from_part_2.csv')

# Predictions saved by part 1.
train_predictions_part_1 = _read_indexed('new_datasets/train_predictions_part_1.csv')
test_predictions_part_1 = _read_indexed('new_datasets/test_predictions_part_1.csv')

# The 'date' column comes back from CSV as text; parse it into datetimes.
for part_2_frame in (train_from_part_2, test_from_part_2):
    part_2_frame['date'] = pd.to_datetime(part_2_frame['date'])

# Three largest distinct Location_enc values and the locations that carry them.
top_three_values = (
    train_from_part_1['Location_enc']
    .drop_duplicates()
    .sort_values(ascending=False)
    .head(3)
)
in_top_three = train_from_part_1['Location_enc'].isin(top_three_values)
top_three_locations = train_from_part_1.loc[in_top_three, 'Location'].drop_duplicates()

# Empty results table; it is passed to get_score for every study below.
scores_df = pd.DataFrame(
    {column: [] for column in ['Comment', 'Train Score', 'Cross-val Score', 'Test RMSE']}
)

CPU times: total: 1.2 s
Wall time: 1.28 s


## 00. Naive stacking

In [79]:
# Stacking inputs: the part 1 predictions and the part 2 'emission_pred_03'
# predictions side by side, aligned on the shared row index. The training
# frame additionally carries the 'emission' column.
train_selected = pd.concat(
    [
        train_predictions_part_1['train_predictions_part_1'].rename('predictions_part_1'),
        train_from_part_2[['emission_pred_03', 'emission']].rename(
            columns={'emission_pred_03': 'predictions_part_2'}
        ),
    ],
    axis=1,
)

test_selected = pd.concat(
    [
        test_predictions_part_1['test_predictions_part_1'].rename('predictions_part_1'),
        test_from_part_2['emission_pred_03'].rename('predictions_part_2'),
    ],
    axis=1,
)

train_selected.head()

Unnamed: 0_level_0,predictions_part_1,predictions_part_2,emission
ID_LAT_LON_YEAR_WEEK,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ID_-0.510_29.290_2019_00,-8.325628,4.21355,3.750994
ID_-0.510_29.290_2019_01,-6.74187,4.275836,4.025176
ID_-0.510_29.290_2019_02,-25.21651,4.425881,4.231381
ID_-0.510_29.290_2019_03,-7.736105,4.508129,4.305286
ID_-0.510_29.290_2019_04,-8.036695,4.511833,4.347317


In [80]:
%%time

study_number = '00'

import xgboost as xgb
from functions.get_score import get_score

# XGBoost regressor with library defaults, seeded for reproducibility and
# allowed to use every available core.
model = xgb.XGBRegressor(n_jobs=-1, random_state=SEED)

# get_score is a project helper; here it returns the train score, the
# cross-validation score and its standard deviation, and a submission frame.
# NOTE(review): it is presumably also what fills scores_df - confirm.
train_score, cross_score, cross_scores_std, submission = get_score(
    global_variables,
    train_selected,
    test_selected,
    model, scores_df,
    comment="Naive Stacking",
)

# One submission file per study, numbered by study_number.
submission.to_csv(f'submissions/submission_3_{study_number}.csv', index=False)

CPU times: total: 1min 12s
Wall time: 6.92 s


In [81]:
# Test RMSE for this study is entered by hand.
# NOTE(review): presumably the score returned for the uploaded submission - confirm.
study_row = int(study_number)
scores_df.loc[study_row, 'Test RMSE'] = 37.46291
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291


## 01. Add Location+week_no_enc

In [82]:
%%time

study_number = '01'

# Append the new feature to the stacking inputs. Any copy left over from a
# previous run of this cell is dropped first, so re-running the cell replaces
# the column instead of adding a duplicate of it.
new_feature = 'Location+week_no_enc'
train_selected = pd.concat([train_selected.drop(columns=[new_feature], errors='ignore'),
                            train_from_part_1[new_feature]], axis=1)
test_selected = pd.concat([test_selected.drop(columns=[new_feature], errors='ignore'),
                           test_from_part_1[new_feature]], axis=1)

# Score the stacker on the extended feature set (same model object as before).
train_score, cross_score, cross_scores_std, submission = get_score(global_variables,
                                                                   train_selected,
                                                                   test_selected,
                                                                   model, scores_df,
                                                                   comment="+ Location+week_no_enc")

# One submission file per study, numbered by study_number.
submission.to_csv('submissions/submission_3_{}.csv'.format(study_number), index=False)

CPU times: total: 1min 15s
Wall time: 7.24 s


In [83]:
# No test RMSE value is recorded for this study; the cell is set to NaN.
study_row = int(study_number)
scores_df.loc[study_row, 'Test RMSE'] = np.nan
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291
1,+ Location+week_no_enc,6.51224,84.763073,


## 02. Add Location_enc

In [84]:
%%time

study_number = '02'

# Append the new feature to the stacking inputs. Any copy left over from a
# previous run of this cell is dropped first, so re-running the cell replaces
# the column instead of adding a duplicate of it.
new_feature = 'Location_enc'
train_selected = pd.concat([train_selected.drop(columns=[new_feature], errors='ignore'),
                            train_from_part_1[new_feature]], axis=1)
test_selected = pd.concat([test_selected.drop(columns=[new_feature], errors='ignore'),
                           test_from_part_1[new_feature]], axis=1)

# Score the stacker on the extended feature set (same model object as before).
train_score, cross_score, cross_scores_std, submission = get_score(global_variables,
                                                                   train_selected,
                                                                   test_selected,
                                                                   model, scores_df,
                                                                   comment="+ Location_enc")

# One submission file per study, numbered by study_number.
submission.to_csv('submissions/submission_3_{}.csv'.format(study_number), index=False)

CPU times: total: 1min 13s
Wall time: 7.39 s


In [85]:
# No test RMSE value is recorded for this study; the cell is set to NaN.
study_row = int(study_number)
scores_df.loc[study_row, 'Test RMSE'] = np.nan
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291
1,+ Location+week_no_enc,6.51224,84.763073,
2,+ Location_enc,6.032389,86.400346,


In [86]:
# Remove Location_enc again so the following studies run without it.
train_selected = train_selected.drop(columns=['Location_enc'])
test_selected = test_selected.drop(columns=['Location_enc'])

## 03. Add longitude

In [87]:
%%time

study_number = '03'

# Append the new feature to the stacking inputs. Any copy left over from a
# previous run of this cell is dropped first, so re-running the cell replaces
# the column instead of adding a duplicate of it.
new_feature = 'longitude'
train_selected = pd.concat([train_selected.drop(columns=[new_feature], errors='ignore'),
                            train_from_part_1[new_feature]], axis=1)
test_selected = pd.concat([test_selected.drop(columns=[new_feature], errors='ignore'),
                           test_from_part_1[new_feature]], axis=1)

# Score the stacker on the extended feature set (same model object as before).
train_score, cross_score, cross_scores_std, submission = get_score(global_variables,
                                                                   train_selected,
                                                                   test_selected,
                                                                   model, scores_df,
                                                                   comment="1+ longitude")

# One submission file per study, numbered by study_number.
submission.to_csv('submissions/submission_3_{}.csv'.format(study_number), index=False)

CPU times: total: 1min 13s
Wall time: 7.52 s


In [88]:
# No test RMSE value is recorded for this study; the cell is set to NaN.
study_row = int(study_number)
scores_df.loc[study_row, 'Test RMSE'] = np.nan
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291
1,+ Location+week_no_enc,6.51224,84.763073,
2,+ Location_enc,6.032389,86.400346,
3,1+ longitude,6.265567,84.504798,


## 04. Add year

In [89]:
%%time

study_number = '04'

# Append the new feature to the stacking inputs. Any copy left over from a
# previous run of this cell is dropped first, so re-running the cell replaces
# the column instead of adding a duplicate of it.
new_feature = 'year'
train_selected = pd.concat([train_selected.drop(columns=[new_feature], errors='ignore'),
                            train_from_part_1[new_feature]], axis=1)
test_selected = pd.concat([test_selected.drop(columns=[new_feature], errors='ignore'),
                           test_from_part_1[new_feature]], axis=1)

# Score the stacker on the extended feature set (same model object as before).
train_score, cross_score, cross_scores_std, submission = get_score(global_variables,
                                                                   train_selected,
                                                                   test_selected,
                                                                   model, scores_df,
                                                                   comment="+ year")

# One submission file per study, numbered by study_number.
submission.to_csv('submissions/submission_3_{}.csv'.format(study_number), index=False)

CPU times: total: 1min 18s
Wall time: 7.54 s


In [90]:
# No test RMSE value is recorded for this study; the cell is set to NaN.
study_row = int(study_number)
scores_df.loc[study_row, 'Test RMSE'] = np.nan
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291
1,+ Location+week_no_enc,6.51224,84.763073,
2,+ Location_enc,6.032389,86.400346,
3,1+ longitude,6.265567,84.504798,
4,+ year,5.982,84.583876,


In [91]:
# Remove year again so the following studies run without it.
train_selected = train_selected.drop(columns=['year'])
test_selected = test_selected.drop(columns=['year'])

## 05. Add week_no

In [92]:
%%time

study_number = '05'

# Append the new feature to the stacking inputs. Any copy left over from a
# previous run of this cell is dropped first, so re-running the cell replaces
# the column instead of adding a duplicate of it.
new_feature = 'week_no'
train_selected = pd.concat([train_selected.drop(columns=[new_feature], errors='ignore'),
                            train_from_part_1[new_feature]], axis=1)
test_selected = pd.concat([test_selected.drop(columns=[new_feature], errors='ignore'),
                           test_from_part_1[new_feature]], axis=1)

# Score the stacker on the extended feature set (same model object as before).
train_score, cross_score, cross_scores_std, submission = get_score(global_variables,
                                                                   train_selected,
                                                                   test_selected,
                                                                   model, scores_df,
                                                                   comment="3 + week_no")

# One submission file per study, numbered by study_number.
submission.to_csv('submissions/submission_3_{}.csv'.format(study_number), index=False)

CPU times: total: 1min 19s
Wall time: 7.67 s


In [93]:
# No test RMSE value is recorded for this study; the cell is set to NaN.
study_row = int(study_number)
scores_df.loc[study_row, 'Test RMSE'] = np.nan
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291
1,+ Location+week_no_enc,6.51224,84.763073,
2,+ Location_enc,6.032389,86.400346,
3,1+ longitude,6.265567,84.504798,
4,+ year,5.982,84.583876,
5,2 + week_no,5.703851,85.698933,


In [94]:
# Remove week_no again so the following study runs without it.
train_selected = train_selected.drop(columns=['week_no'])
test_selected = test_selected.drop(columns=['week_no'])

## 06. Add week_no_enc

In [95]:
%%time

study_number = '06'

# Append the new feature to the stacking inputs. Any copy left over from a
# previous run of this cell is dropped first, so re-running the cell replaces
# the column instead of adding a duplicate of it.
new_feature = 'week_no_enc'
train_selected = pd.concat([train_selected.drop(columns=[new_feature], errors='ignore'),
                            train_from_part_1[new_feature]], axis=1)
test_selected = pd.concat([test_selected.drop(columns=[new_feature], errors='ignore'),
                           test_from_part_1[new_feature]], axis=1)

# Score the stacker on the extended feature set (same model object as before).
train_score, cross_score, cross_scores_std, submission = get_score(global_variables,
                                                                   train_selected,
                                                                   test_selected,
                                                                   model, scores_df,
                                                                   comment="3 + week_no_enc")

# One submission file per study, numbered by study_number.
submission.to_csv('submissions/submission_3_{}.csv'.format(study_number), index=False)

CPU times: total: 1min 18s
Wall time: 8.02 s


In [97]:
# No test RMSE value is recorded for this study; the cell is set to NaN.
study_row = int(study_number)
scores_df.loc[study_row, 'Test RMSE'] = np.nan
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291
1,+ Location+week_no_enc,6.51224,84.763073,
2,+ Location_enc,6.032389,86.400346,
3,1+ longitude,6.265567,84.504798,
4,+ year,5.982,84.583876,
5,2 + week_no,5.703851,85.698933,
6,2 + week_no_enc,5.868567,85.300826,
