In [2]:
from IPython.display import display, HTML;  display(HTML('<style>.container { width:90% !important; }</style>'))

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb

In [3]:
train_df = pd.read_csv("data/train.csv")
train_df.head(5)

Unnamed: 0,id,date,feature_AA,feature_AB,feature_BA,feature_BB,feature_CA,feature_CB,Temperature
0,0,2016-07-01 00:00:00,5.827,2.009,1.599,0.462,4.203,1.34,30.531
1,1,2016-07-01 00:15:00,5.76,2.076,1.492,0.426,4.264,1.401,30.459999
2,2,2016-07-01 00:30:00,5.76,1.942,1.492,0.391,4.234,1.31,30.038
3,3,2016-07-01 00:45:00,5.76,1.942,1.492,0.426,4.234,1.31,27.013
4,4,2016-07-01 01:00:00,5.693,2.076,1.492,0.426,4.142,1.371,27.787001


In [4]:
train_df.isna().sum()

id             0
date           0
feature_AA     0
feature_AB     0
feature_BA     0
feature_BB     0
feature_CA     0
feature_CB     0
Temperature    0
dtype: int64

In [5]:
train_valid_X = train_df.drop(columns=['id', 'date', 'Temperature'])
train_valid_X.head(2)

Unnamed: 0,feature_AA,feature_AB,feature_BA,feature_BB,feature_CA,feature_CB
0,5.827,2.009,1.599,0.462,4.203,1.34
1,5.76,2.076,1.492,0.426,4.264,1.401


In [6]:
train_valid_y = train_df['Temperature']
train_valid_y.head()

0    30.531000
1    30.459999
2    30.038000
3    27.013000
4    27.787001
Name: Temperature, dtype: float64

In [7]:
X_train, X_valid, y_train, y_valid = train_test_split(train_valid_X, train_valid_y, 
                                                    test_size=0.25, random_state=1)

In [12]:
model = RandomForestRegressor()

params = {
    'n_estimators': [100, 200, 300],
    'n_jobs': [-1],
    'random_state': [1]
}

gs = GridSearchCV(model, params, cv=10)
gs.fit(X_train, y_train)

In [13]:
y_pred = gs.predict(X_valid)
mean_absolute_error(y_valid, y_pred)

4.038773701321856

In [14]:
gs.best_params_

{'n_estimators': 300, 'n_jobs': -1, 'random_state': 1}

In [None]:
## kernel interrupted everytime I tried any parameters here
model = RandomForestRegressor()

params = {
    'n_estimators': [500, 1000],
    'n_jobs': [-1],
    'random_state': [1]
}

gs = GridSearchCV(model, params, cv=10)
gs.fit(X_train, y_train)

In [8]:
model = RandomForestRegressor(n_estimators=1000, random_state=0, n_jobs=-1)
model.fit(X_train, y_train)

In [10]:
y_pred = model.predict(X_valid)
mean_absolute_error(y_valid, y_pred)

4.037001332376974

In [12]:
model = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01, max_features='sqrt', max_depth=2,
                                  random_state=0, loss='absolute_error')

model.fit(X_train, y_train)
y_pred = model.predict(X_valid)

mean_absolute_error(y_valid, y_pred)

5.561582642260577

In [13]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Using cached xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
Installing collected packages: xgboost
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
evalml 0.83.0 requires catboost>=1.1.1, which is not installed.
evalml 0.83.0 requires category-encoders<=2.5.1.post0,>=2.2.2, which is not installed.
evalml 0.83.0 requires kaleido==0.1.0, which is not installed.
evalml 0.83.0 requires lightgbm>=4.0.0, which is not installed.
evalml 0.83.0 requires lime>=0.2.0.1, which is not installed.
evalml 0.83.0 requires nlp-primitives>=2.9.0, which is not installed.
evalml 0.83.0 requires shap>=0.42.0, which is not installed.
evalml 0.83.0 requires sktime>=0.21.0, which is not installed.
evalml 0.83.0 requires texttable>=1.6.2, which is not installed.
evalml 0.8

In [17]:
model = xgb.XGBRegressor()

params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 500, 1000]
}

gs = GridSearchCV(model, params, cv=10)
gs.fit(X_train, y_train)

In [19]:
y_pred = gs.predict(X_valid)
mean_absolute_error(y_valid, y_pred)

4.228354991478789

### Best model was:
model = RandomForestRegressor(n_estimators=1000, random_state=0, n_jobs=-1)
model.fit(X_train, y_train)

In [8]:
model = RandomForestRegressor(n_estimators=300, random_state=0, n_jobs=-1)
model.fit(X_train, y_train)

In [11]:
test_df = pd.read_csv("data/test.csv")
test_df.head(2)

Unnamed: 0,id,date,feature_AA,feature_AB,feature_BA,feature_BB,feature_CA,feature_CB
0,64320,2018-05-02 00:00:00,14.133,2.076,9.346,-0.036,4.538,1.279
1,64321,2018-05-02 00:15:00,12.123,1.541,7.818,-0.746,4.538,1.249


In [12]:
test_X = test_df[[feature for feature in test_df.columns if feature.startswith("feature")]]
test_X.head(2)

Unnamed: 0,feature_AA,feature_AB,feature_BA,feature_BB,feature_CA,feature_CB
0,14.133,2.076,9.346,-0.036,4.538,1.279
1,12.123,1.541,7.818,-0.746,4.538,1.249


In [13]:
model.estimators_

[DecisionTreeRegressor(max_features=1.0, random_state=209652396),
 DecisionTreeRegressor(max_features=1.0, random_state=398764591),
 DecisionTreeRegressor(max_features=1.0, random_state=924231285),
 DecisionTreeRegressor(max_features=1.0, random_state=1478610112),
 DecisionTreeRegressor(max_features=1.0, random_state=441365315),
 DecisionTreeRegressor(max_features=1.0, random_state=1537364731),
 DecisionTreeRegressor(max_features=1.0, random_state=192771779),
 DecisionTreeRegressor(max_features=1.0, random_state=1491434855),
 DecisionTreeRegressor(max_features=1.0, random_state=1819583497),
 DecisionTreeRegressor(max_features=1.0, random_state=530702035),
 DecisionTreeRegressor(max_features=1.0, random_state=626610453),
 DecisionTreeRegressor(max_features=1.0, random_state=1650906866),
 DecisionTreeRegressor(max_features=1.0, random_state=1879422756),
 DecisionTreeRegressor(max_features=1.0, random_state=1277901399),
 DecisionTreeRegressor(max_features=1.0, random_state=1682652230),
 D

In [14]:
pred_y = [est.predict(test_X.values) for est in model.estimators_]
proba_preds = np.array(pred_y)
proba_preds.shape  # (num_predictors, num_samples)

(300, 5360)

In [18]:
proba_preds[0]

array([ 9.70800018, 35.9469986 , 15.125     , ...,  3.86899996,
        4.99499989, 35.17399979])

In [19]:
quantiles = [0.025,0.05,0.10,0.15,0.20,0.25,0.30,0.35,0.40,0.45,0.50,
             0.55,0.60,0.65,0.70,0.75,0.80,0.85,0.90,0.95,0.975]
print(quantiles, len(quantiles))

quantile_predictions = {"id": test_df.id.values}
quantile_predictions.update({q: None for q in quantiles})
for q in quantiles:
    quantile_predictions[q] = np.quantile(proba_preds, q, axis=0)
submission_df = pd.DataFrame.from_dict(quantile_predictions)
submission_df.head(2)

[0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975] 21


Unnamed: 0,id,0.025,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,...,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,0.975
0,64320,4.573,4.643,5.065,5.135,5.135,5.276,5.487,5.909,6.092,...,7.668,8.301,8.512,9.708,10.693,20.823,20.823,21.878,38.701552,39.887001
1,64321,2.532,2.532,5.065,7.0945,7.527,7.879,7.949,8.09,8.512,...,9.5144,10.13,10.13,10.271,13.788,15.3356,16.672001,34.400002,35.946999,42.278999
