Let's load the datasets from the previous parts:

In [225]:
%%time

import pandas as pd
import numpy as np


# Settings shared across the notebooks of this project; SEED is reused below
# as the model's random_state.
global_variables = pd.read_csv('global_variables.csv', index_col=0)
SEED = global_variables.loc[0, 'SEED']

# Training-side artefacts of parts 1 and 2, all keyed by the row ID.
train_from_part_1 = pd.read_csv('new_datasets/train_from_part_1.csv', index_col='ID_LAT_LON_YEAR_WEEK')
train_from_part_2 = pd.read_csv('new_datasets/train_from_part_2.csv', index_col='ID_LAT_LON_YEAR_WEEK')
train_predictions_part_1 = pd.read_csv('new_datasets/train_predictions_part_1.csv', index_col='ID_LAT_LON_YEAR_WEEK')

# Matching test-side artefacts.
test_from_part_1 = pd.read_csv('new_datasets/test_from_part_1.csv', index_col='ID_LAT_LON_YEAR_WEEK')
test_from_part_2 = pd.read_csv('new_datasets/test_from_part_2.csv', index_col='ID_LAT_LON_YEAR_WEEK')
test_predictions_part_1 = pd.read_csv('new_datasets/test_predictions_part_1.csv', index_col='ID_LAT_LON_YEAR_WEEK')

# CSV stores dates as text; convert the part-2 `date` columns to datetimes.
train_from_part_2['date'] = train_from_part_2['date'].pipe(pd.to_datetime)
test_from_part_2['date'] = test_from_part_2['date'].pipe(pd.to_datetime)

# The three largest distinct `Location_enc` values and the locations carrying them.
top_three_values = (
    train_from_part_1['Location_enc']
    .drop_duplicates()
    .sort_values(ascending=False)
    .head(3)
)
top_three_locations = (
    train_from_part_1
    .loc[train_from_part_1['Location_enc'].isin(top_three_values), 'Location']
    .drop_duplicates()
)

# Empty results table; it gains one row per study further down the notebook.
scores_df = pd.DataFrame(
    {column: [] for column in ('Comment', 'Train Score', 'Cross-val Score', 'Test RMSE')}
)

CPU times: total: 1.31 s
Wall time: 1.33 s


## 00. Naive stacking

In [226]:
# Stacking inputs: the part-1 predictions next to the part-2 predictions
# (`emission_pred_03`). The training frame also keeps the `emission` target.
train_selected = pd.concat(
    [
        train_predictions_part_1['train_predictions_part_1'],
        train_from_part_2[['emission_pred_03', 'emission']],
    ],
    axis=1,
).set_axis(['predictions_part_1', 'predictions_part_2', 'emission'], axis=1)

test_selected = pd.concat(
    [
        test_predictions_part_1['test_predictions_part_1'],
        test_from_part_2['emission_pred_03'],
    ],
    axis=1,
).set_axis(['predictions_part_1', 'predictions_part_2'], axis=1)

train_selected.head()

Unnamed: 0_level_0,predictions_part_1,predictions_part_2,emission
ID_LAT_LON_YEAR_WEEK,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ID_-0.510_29.290_2019_00,-8.325628,4.21355,3.750994
ID_-0.510_29.290_2019_01,-6.74187,4.275836,4.025176
ID_-0.510_29.290_2019_02,-25.21651,4.425881,4.231381
ID_-0.510_29.290_2019_03,-7.736105,4.508129,4.305286
ID_-0.510_29.290_2019_04,-8.036695,4.511833,4.347317


In [227]:
%%time

import xgboost as xgb
from functions.get_score import get_score

study_number = '00'

# Baseline meta-model: XGBoost with library defaults, seeded with the shared SEED.
model = xgb.XGBRegressor(n_jobs=-1, random_state=SEED)

# Score the baseline. get_score's result is unpacked into the train score, the
# cross-validation score and its std, and the submission frame.
# NOTE(review): scores_df shows a new row after each call, so get_score
# presumably appends to it in place -- confirm in functions/get_score.py.
train_score, cross_score, cross_scores_std, submission = get_score(
    global_variables,
    train_selected,
    test_selected,
    model,
    scores_df,
    comment="Naive Stacking",
)

submission.to_csv('submissions/submission_3_{}.csv'.format(study_number), index=False)

CPU times: total: 1min 13s
Wall time: 7.1 s


In [228]:
# Test RMSE for this study is entered by hand (it is not computed in this
# notebook) -- presumably the score returned for the submission file; confirm.
scores_df.loc[int(study_number), 'Test RMSE'] = 37.46291
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291


## 01. Add Location+week_no_enc

In [229]:
%%time

study_number = '01'

# Attach the candidate feature by column assignment instead of pd.concat so the
# cell is idempotent: re-running it overwrites the column rather than appending
# a duplicate. Values are aligned on the ID index -- assumes the part-1 frames
# carry the same IDs as the stacking frames (TODO confirm).
train_selected = train_selected.assign(
    **{'Location+week_no_enc': train_from_part_1['Location+week_no_enc']})
test_selected = test_selected.assign(
    **{'Location+week_no_enc': test_from_part_1['Location+week_no_enc']})

# Re-score the same model on the extended feature set.
train_score, cross_score, cross_scores_std, submission = get_score(
    global_variables,
    train_selected,
    test_selected,
    model,
    scores_df,
    comment="+ Location+week_no_enc",
)

submission.to_csv('submissions/submission_3_{}.csv'.format(study_number), index=False)

CPU times: total: 1min 16s
Wall time: 7.18 s


In [230]:
# No test RMSE is recorded for this study; the cell is set to NaN.
scores_df.loc[int(study_number), 'Test RMSE'] = np.nan
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291
1,+ Location+week_no_enc,6.51224,84.763073,


## 02. Add Location_enc

In [231]:
%%time

study_number = '02'

# Attach the candidate feature by column assignment instead of pd.concat so the
# cell is idempotent: re-running it overwrites `Location_enc` rather than
# appending a duplicate. Values are aligned on the ID index -- assumes the
# part-1 frames carry the same IDs as the stacking frames (TODO confirm).
train_selected = train_selected.assign(Location_enc=train_from_part_1['Location_enc'])
test_selected = test_selected.assign(Location_enc=test_from_part_1['Location_enc'])

# Re-score the same model on the extended feature set.
train_score, cross_score, cross_scores_std, submission = get_score(
    global_variables,
    train_selected,
    test_selected,
    model,
    scores_df,
    comment="+ Location_enc",
)

submission.to_csv('submissions/submission_3_{}.csv'.format(study_number), index=False)

CPU times: total: 1min 15s
Wall time: 7.35 s


In [232]:
# No test RMSE is recorded for this study; the cell is set to NaN.
scores_df.loc[int(study_number), 'Test RMSE'] = np.nan
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291
1,+ Location+week_no_enc,6.51224,84.763073,
2,+ Location_enc,6.032389,86.400346,


In [233]:
# Take `Location_enc` back out of both frames before the next study
# (the cross-val score recorded for study 02 is worse than for study 01).
train_selected = train_selected.drop(columns=['Location_enc'])
test_selected = test_selected.drop(columns=['Location_enc'])

## 03. Add longitude

In [234]:
%%time

study_number = '03'

# Attach the candidate feature by column assignment instead of pd.concat so the
# cell is idempotent: re-running it overwrites `longitude` rather than
# appending a duplicate. Values are aligned on the ID index -- assumes the
# part-1 frames carry the same IDs as the stacking frames (TODO confirm).
train_selected = train_selected.assign(longitude=train_from_part_1['longitude'])
test_selected = test_selected.assign(longitude=test_from_part_1['longitude'])

# Re-score the same model on the extended feature set.
# NOTE(review): the "1+" label presumably means "study 01 features + longitude".
train_score, cross_score, cross_scores_std, submission = get_score(
    global_variables,
    train_selected,
    test_selected,
    model,
    scores_df,
    comment="1+ longitude",
)

submission.to_csv('submissions/submission_3_{}.csv'.format(study_number), index=False)

CPU times: total: 1min 18s
Wall time: 7.32 s


In [235]:
# No test RMSE is recorded for this study; the cell is set to NaN.
scores_df.loc[int(study_number), 'Test RMSE'] = np.nan
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291
1,+ Location+week_no_enc,6.51224,84.763073,
2,+ Location_enc,6.032389,86.400346,
3,1+ longitude,6.265567,84.504798,


## 04. Add year

In [236]:
%%time

study_number = '04'

# `year` is evaluated on top of the study 03 feature set (longitude is kept).
# Column assignment instead of pd.concat keeps the cell idempotent: re-running
# it overwrites `year` rather than appending a duplicate. Values are aligned on
# the ID index -- assumes the part-1 frames carry the same IDs (TODO confirm).
train_selected = train_selected.assign(year=train_from_part_1['year'])
test_selected = test_selected.assign(year=test_from_part_1['year'])

# Re-score the same model on the extended feature set.
train_score, cross_score, cross_scores_std, submission = get_score(
    global_variables,
    train_selected,
    test_selected,
    model,
    scores_df,
    comment="+ year",
)

submission.to_csv('submissions/submission_3_{}.csv'.format(study_number), index=False)

CPU times: total: 1min 19s
Wall time: 7.69 s


In [237]:
# No test RMSE is recorded for this study; the cell is set to NaN.
scores_df.loc[int(study_number), 'Test RMSE'] = np.nan
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291
1,+ Location+week_no_enc,6.51224,84.763073,
2,+ Location_enc,6.032389,86.400346,
3,1+ longitude,6.265567,84.504798,
4,+ year,5.982,84.583876,


In [238]:
# Take `year` back out of both frames before the next study
# (its cross-val score did not beat the one recorded for study 03).
train_selected = train_selected.drop(columns=['year'])
test_selected = test_selected.drop(columns=['year'])

## 05. Add week_no

In [239]:
%%time

study_number = '05'

# `week_no` is evaluated on top of the study 03 feature set.
# Column assignment instead of pd.concat keeps the cell idempotent: re-running
# it overwrites `week_no` rather than appending a duplicate. Values are aligned
# on the ID index -- assumes the part-1 frames carry the same IDs (TODO confirm).
train_selected = train_selected.assign(week_no=train_from_part_1['week_no'])
test_selected = test_selected.assign(week_no=test_from_part_1['week_no'])

# Re-score the same model on the extended feature set.
train_score, cross_score, cross_scores_std, submission = get_score(
    global_variables,
    train_selected,
    test_selected,
    model,
    scores_df,
    comment="3 + week_no",
)

submission.to_csv('submissions/submission_3_{}.csv'.format(study_number), index=False)

CPU times: total: 1min 20s
Wall time: 7.72 s


In [240]:
# No test RMSE is recorded for this study; the cell is set to NaN.
scores_df.loc[int(study_number), 'Test RMSE'] = np.nan
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291
1,+ Location+week_no_enc,6.51224,84.763073,
2,+ Location_enc,6.032389,86.400346,
3,1+ longitude,6.265567,84.504798,
4,+ year,5.982,84.583876,
5,3 + week_no,5.703851,85.698933,


In [241]:
# Take `week_no` back out of both frames before the next study
# (its cross-val score did not beat the one recorded for study 03).
train_selected = train_selected.drop(columns=['week_no'])
test_selected = test_selected.drop(columns=['week_no'])

## 06. Add week_no_enc

In [242]:
%%time

study_number = '06'

# `week_no_enc` is evaluated on top of the study 03 feature set.
# Column assignment instead of pd.concat keeps the cell idempotent: re-running
# it overwrites `week_no_enc` rather than appending a duplicate. Values are
# aligned on the ID index -- assumes the part-1 frames carry the same IDs
# (TODO confirm).
train_selected = train_selected.assign(week_no_enc=train_from_part_1['week_no_enc'])
test_selected = test_selected.assign(week_no_enc=test_from_part_1['week_no_enc'])

# Re-score the same model on the extended feature set.
train_score, cross_score, cross_scores_std, submission = get_score(
    global_variables,
    train_selected,
    test_selected,
    model,
    scores_df,
    comment="3 + week_no_enc",
)

submission.to_csv('submissions/submission_3_{}.csv'.format(study_number), index=False)

CPU times: total: 1min 22s
Wall time: 7.82 s


In [243]:
# No test RMSE is recorded for this study; the cell is set to NaN.
scores_df.loc[int(study_number), 'Test RMSE'] = np.nan
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291
1,+ Location+week_no_enc,6.51224,84.763073,
2,+ Location_enc,6.032389,86.400346,
3,1+ longitude,6.265567,84.504798,
4,+ year,5.982,84.583876,
5,3 + week_no,5.703851,85.698933,
6,3 + week_no_enc,5.868567,85.300826,


In [244]:
# Take `week_no_enc` back out of both frames before the next study
# (its cross-val score did not beat the one recorded for study 03).
train_selected = train_selected.drop(columns=['week_no_enc'])
test_selected = test_selected.drop(columns=['week_no_enc'])

# Confirm the training frame is back to the study 03 column set.
train_selected.info()

<class 'pandas.core.frame.DataFrame'>
Index: 79023 entries, ID_-0.510_29.290_2019_00 to ID_-3.299_30.301_2021_52
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   predictions_part_1    79023 non-null  float64
 1   predictions_part_2    79023 non-null  float64
 2   emission              79023 non-null  float64
 3   Location+week_no_enc  79023 non-null  float64
 4   longitude             79023 non-null  float64
dtypes: float64(5)
memory usage: 5.6+ MB


## 07. Add CarbonMonoxide_CO_column_number_density

In [245]:
%%time

study_number = '07'

# The CO column density is evaluated on top of the study 03 feature set.
# Column assignment instead of pd.concat keeps the cell idempotent: re-running
# it overwrites the column rather than appending a duplicate. Values are
# aligned on the ID index -- assumes the part-1 frames carry the same IDs
# (TODO confirm).
train_selected = train_selected.assign(
    CarbonMonoxide_CO_column_number_density=train_from_part_1['CarbonMonoxide_CO_column_number_density'])
test_selected = test_selected.assign(
    CarbonMonoxide_CO_column_number_density=test_from_part_1['CarbonMonoxide_CO_column_number_density'])

# Re-score the same model on the extended feature set.
train_score, cross_score, cross_scores_std, submission = get_score(
    global_variables,
    train_selected,
    test_selected,
    model,
    scores_df,
    comment="3 + CarbonMonoxide_CO_column_number_density",
)

submission.to_csv('submissions/submission_3_{}.csv'.format(study_number), index=False)

CPU times: total: 2min 25s
Wall time: 13.4 s


In [246]:
# No test RMSE is recorded for this study; the cell is set to NaN.
scores_df.loc[int(study_number), 'Test RMSE'] = np.nan
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291
1,+ Location+week_no_enc,6.51224,84.763073,
2,+ Location_enc,6.032389,86.400346,
3,1+ longitude,6.265567,84.504798,
4,+ year,5.982,84.583876,
5,3 + week_no,5.703851,85.698933,
6,3 + week_no_enc,5.868567,85.300826,
7,3 + CarbonMonoxide_CO_column_number_density,6.029976,86.348407,


In [247]:
# Take the CO column density back out of both frames before the next study
# (its cross-val score did not beat the one recorded for study 03).
train_selected = train_selected.drop(columns=['CarbonMonoxide_CO_column_number_density'])
test_selected = test_selected.drop(columns=['CarbonMonoxide_CO_column_number_density'])

# Confirm the training frame is back to the study 03 column set.
train_selected.info()

<class 'pandas.core.frame.DataFrame'>
Index: 79023 entries, ID_-0.510_29.290_2019_00 to ID_-3.299_30.301_2021_52
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   predictions_part_1    79023 non-null  float64
 1   predictions_part_2    79023 non-null  float64
 2   emission              79023 non-null  float64
 3   Location+week_no_enc  79023 non-null  float64
 4   longitude             79023 non-null  float64
dtypes: float64(5)
memory usage: 5.6+ MB


## 08. Add Ozone_O3_column_number_density

In [248]:
%%time

study_number = '08'

# The ozone column density is evaluated on top of the study 03 feature set.
# Column assignment instead of pd.concat keeps the cell idempotent: re-running
# it overwrites the column rather than appending a duplicate. Values are
# aligned on the ID index -- assumes the part-1 frames carry the same IDs
# (TODO confirm).
train_selected = train_selected.assign(
    Ozone_O3_column_number_density=train_from_part_1['Ozone_O3_column_number_density'])
test_selected = test_selected.assign(
    Ozone_O3_column_number_density=test_from_part_1['Ozone_O3_column_number_density'])

# Re-score the same model on the extended feature set.
train_score, cross_score, cross_scores_std, submission = get_score(
    global_variables,
    train_selected,
    test_selected,
    model,
    scores_df,
    comment="3 + Ozone_O3_column_number_density",
)

submission.to_csv('submissions/submission_3_{}.csv'.format(study_number), index=False)

CPU times: total: 2min 27s
Wall time: 13.6 s


In [249]:
# No test RMSE is recorded for this study; the cell is set to NaN.
scores_df.loc[int(study_number), 'Test RMSE'] = np.nan
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291
1,+ Location+week_no_enc,6.51224,84.763073,
2,+ Location_enc,6.032389,86.400346,
3,1+ longitude,6.265567,84.504798,
4,+ year,5.982,84.583876,
5,3 + week_no,5.703851,85.698933,
6,3 + week_no_enc,5.868567,85.300826,
7,3 + CarbonMonoxide_CO_column_number_density,6.029976,86.348407,
8,3 + Ozone_O3_column_number_density,5.911377,84.828404,


In [250]:
# Take the ozone column density back out of both frames before the next study
# (its cross-val score did not beat the one recorded for study 03).
train_selected = train_selected.drop(columns=['Ozone_O3_column_number_density'])
test_selected = test_selected.drop(columns=['Ozone_O3_column_number_density'])

# Confirm the training frame is back to the study 03 column set.
train_selected.info()

<class 'pandas.core.frame.DataFrame'>
Index: 79023 entries, ID_-0.510_29.290_2019_00 to ID_-3.299_30.301_2021_52
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   predictions_part_1    79023 non-null  float64
 1   predictions_part_2    79023 non-null  float64
 2   emission              79023 non-null  float64
 3   Location+week_no_enc  79023 non-null  float64
 4   longitude             79023 non-null  float64
dtypes: float64(5)
memory usage: 5.6+ MB


## 09. Add latitude

In [251]:
%%time

study_number = '09'

# `latitude` is evaluated on top of the study 03 feature set.
# Column assignment instead of pd.concat keeps the cell idempotent: re-running
# it overwrites `latitude` rather than appending a duplicate. Values are
# aligned on the ID index -- assumes the part-1 frames carry the same IDs
# (TODO confirm).
train_selected = train_selected.assign(latitude=train_from_part_1['latitude'])
test_selected = test_selected.assign(latitude=test_from_part_1['latitude'])

# Re-score the same model on the extended feature set.
train_score, cross_score, cross_scores_std, submission = get_score(
    global_variables,
    train_selected,
    test_selected,
    model,
    scores_df,
    comment="3 + latitude",
)

submission.to_csv('submissions/submission_3_{}.csv'.format(study_number), index=False)

CPU times: total: 1min 22s
Wall time: 7.68 s


In [252]:
# No test RMSE is recorded for this study; the cell is set to NaN.
scores_df.loc[int(study_number), 'Test RMSE'] = np.nan
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291
1,+ Location+week_no_enc,6.51224,84.763073,
2,+ Location_enc,6.032389,86.400346,
3,1+ longitude,6.265567,84.504798,
4,+ year,5.982,84.583876,
5,3 + week_no,5.703851,85.698933,
6,3 + week_no_enc,5.868567,85.300826,
7,3 + CarbonMonoxide_CO_column_number_density,6.029976,86.348407,
8,3 + Ozone_O3_column_number_density,5.911377,84.828404,
9,3 + latitude,6.03592,84.644241,


In [253]:
# Take `latitude` back out of both frames before tuning
# (its cross-val score did not beat the one recorded for study 03).
train_selected = train_selected.drop(columns=['latitude'])
test_selected = test_selected.drop(columns=['latitude'])

# Confirm the training frame is back to the study 03 column set.
train_selected.info()

<class 'pandas.core.frame.DataFrame'>
Index: 79023 entries, ID_-0.510_29.290_2019_00 to ID_-3.299_30.301_2021_52
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   predictions_part_1    79023 non-null  float64
 1   predictions_part_2    79023 non-null  float64
 2   emission              79023 non-null  float64
 3   Location+week_no_enc  79023 non-null  float64
 4   longitude             79023 non-null  float64
dtypes: float64(5)
memory usage: 5.6+ MB


## 10. XGBoost tuned

Tuning with a reduced number of estimators.

In [254]:
study_number = '10'

# Persist the selected feature frames for this study. File names are derived
# from study_number, like the submission and params paths, so the study id
# lives in one place (the resulting paths are unchanged: *_3_10.csv).
# NOTE(review): presumably these files feed the external tuning run whose
# result is loaded from studies/3_10_params.pkl -- confirm.
train_selected.to_csv('new_datasets/train_3_{}.csv'.format(study_number))
test_selected.to_csv('new_datasets/test_3_{}.csv'.format(study_number))

In [255]:
import pickle

# Load the hyperparameters stored for this study number.
# NOTE: pickle.load can execute arbitrary code from the file -- only load
# params files produced by this project.
with open('studies/3_' + study_number + '_params.pkl', 'rb') as params_file:
    best_params = pickle.load(params_file)

print("Best hyperparameters:", best_params)

Best hyperparameters: {'n_estimators': 100, 'max_depth': 3, 'max_leaves': 425, 'grow_policy': 'lossguide', 'learning_rate': 1.9522052031183201, 'booster': 'gbtree', 'tree_method': 'approx', 'gamma': 0.48475074870230545, 'min_child_weight': 0.3704222208609443, 'subsample': 0.5, 'colsample_bytree': 1.0}


In [256]:
%%time

# Build the regressor from the tuned hyperparameters; n_estimators is pinned to
# 100 regardless of the value stored in best_params.
model = xgb.XGBRegressor(
    random_state=SEED,
    n_jobs=-1,
    **{**best_params, 'n_estimators': 100},
)

# Score the tuned model on the study 03 feature set.
train_score, cross_score, cross_scores_std, submission = get_score(
    global_variables,
    train_selected,
    test_selected,
    model,
    scores_df,
    comment="3 + tuning",
)

submission.to_csv('submissions/submission_3_{}.csv'.format(study_number), index=False)

CPU times: total: 11.9 s
Wall time: 1.14 s


In [257]:
# No test RMSE is recorded for this study; the cell is set to NaN.
scores_df.loc[int(study_number), 'Test RMSE'] = np.nan
scores_df

Unnamed: 0,Comment,Train Score,Cross-val Score,Test RMSE
0,Naive Stacking,7.360223,86.042238,37.46291
1,+ Location+week_no_enc,6.51224,84.763073,
2,+ Location_enc,6.032389,86.400346,
3,1+ longitude,6.265567,84.504798,
4,+ year,5.982,84.583876,
5,3 + week_no,5.703851,85.698933,
6,3 + week_no_enc,5.868567,85.300826,
7,3 + CarbonMonoxide_CO_column_number_density,6.029976,86.348407,
8,3 + Ozone_O3_column_number_density,5.911377,84.828404,
9,3 + latitude,6.03592,84.644241,
