In [None]:
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import pandas as pd
import numpy as np

In [None]:
# Colab only: mount Google Drive so the CSV files read below from
# /content/drive/MyDrive/... are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Training data: pull the target out of the frame, then discard the
# identifier / time columns that are not used as model features.
x_train = pd.read_csv('/content/drive/MyDrive/weather_data/X_station_coord_2nn_imputed_by_day.csv')
y_train = x_train.pop('ground_truth')
x_train = x_train.drop(columns=['date', 'timestamp', 'number_sta', 'Id'])

print(x_train.shape)


# Test data: keep the ids aside for the submission file, drop the station number.
x_test = pd.read_csv('/content/drive/MyDrive/weather_data/X_test_coord_2nn_imputed_by_day.csv')
test_ids = x_test.pop('Id')
x_test = x_test.drop(columns=['number_sta'])
print(x_test.shape)

# Shared random_state for every estimator built in this notebook.
seed = 1998

(172374, 10)
(96055, 10)


## Decision tree

In [None]:
def cross_validation_decision_tree(x, y, cv_type, n_random_test=10):
    """
    Tune a DecisionTreeRegressor with a cross-validated hyper-parameter search.

    The search is run on a random 15% subsample of (x, y) and candidates are
    ranked with the 'neg_mean_absolute_percentage_error' scorer.

    NOTE(review): the scores printed by earlier runs are around -2.7e15, which
    presumably comes from targets equal to 0 in the MAPE denominator - confirm
    that this scorer matches the metric actually being optimised.

    :param x: feature matrix; ``x.shape[1]`` is taken as the number of features
    :param y: target values aligned with ``x``
    :param cv_type: must be 'random' or 'gridsearch'
    :param n_random_test: number of parameter sets to try (random search only)
    :return: dict with the best 'max_depth', 'max_features' and
             'min_samples_leaf'; a fixed default dict if ``cv_type`` is invalid
    """
    # Reject an unknown search type up front, before any data is split.
    if cv_type not in ('random', 'gridsearch'):
        print("Argument cv_type was incorrect, default parameters are returned")
        return {'max_depth' : 10, 'max_features' : None, 'min_samples_leaf' : 1}

    n_features = x.shape[1]
    # Integer max_features must lie in (0, n_features]. The square-root option
    # is requested through the 'sqrt' keyword: a raw float such as
    # np.sqrt(10) = 3.16 is read as a fraction > 1 and every fit using it fails.
    max_features_grid = [m for m in (4, 6, 8, 10) if m <= n_features] + ['sqrt']
    params_to_tune = {'max_depth' : [5, 10, 20, 50],
                      'max_features' : max_features_grid,
                      'min_samples_leaf' : [10, 100, 500, 1000]
                      }

    # Search on a 15% subsample only; the train/validation folds are then
    # produced by the search object's own (default) cross-validation.
    x_cv, _, y_cv, _ = train_test_split(x, y, train_size=0.15, random_state=42)

    dt = DecisionTreeRegressor(random_state=seed)

    if cv_type == 'random':
        cv = RandomizedSearchCV(dt, params_to_tune, n_iter=n_random_test, random_state=0,
                                scoring='neg_mean_absolute_percentage_error', verbose=2)
    else:
        cv = GridSearchCV(dt, params_to_tune, scoring='neg_mean_absolute_percentage_error')

    result = cv.fit(x_cv, y_cv)
    print(result.best_params_)
    return result.best_params_


def decision_tree(x_train, y_train, x_test, cross_validation = True, cv_type='random', n_random_test=15):
    """
    Fit a DecisionTreeRegressor on the training data and predict ``x_test``.

    :param x_train: training features
    :param y_train: training targets
    :param x_test: features to predict on
    :param cross_validation: when True, hyper-parameters come from
                             ``cross_validation_decision_tree``; otherwise a
                             fixed parameter set is used
    :param cv_type: forwarded to ``cross_validation_decision_tree``
    :param n_random_test: forwarded to ``cross_validation_decision_tree``
    :return: predictions of the fitted tree for ``x_test``
    """
    if not cross_validation:
        # Fixed fallback: only max_depth is capped, the rest is left unconstrained.
        params = {'max_depth' : 100, 'max_features' : None, 'min_samples_leaf' : 1}
    else:
        params = cross_validation_decision_tree(x_train, y_train, cv_type, n_random_test)

    model = DecisionTreeRegressor(
        max_depth=params['max_depth'],
        max_features=params['max_features'],
        min_samples_leaf=params['min_samples_leaf'],
        random_state=seed,
    )
    # fit() returns the estimator itself, so the call can be chained.
    return model.fit(x_train, y_train).predict(x_test)

In [None]:
# Hold out 25% of the labelled data so the tuned tree can be scored on rows
# it was not fitted on.
x_train_train, x_train_test, y_train_train, y_train_test = train_test_split(x_train, y_train, train_size=0.75, random_state=42)

# Grid-search the hyper-parameters on the 75% part, then predict the held-out 25%.
y_predict = decision_tree(x_train_train, y_train_train, x_train_test, cross_validation=True, cv_type='gridsearch')

print(y_train_test)
print(y_predict)

# NOTE(review): y_train_test contains 0.0 values, which presumably explains the
# ~1e15 MAPE printed by this cell - confirm the metric is meant to be used on raw targets.
print(mean_absolute_percentage_error(y_true=y_train_test, y_pred=y_predict))

80 fits failed out of a total of 400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 1320, in fit
    X_idx_sorted=X_idx_sorted,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 308, in fit
    raise ValueError("max_features must be in (0, n_features]")
ValueError: max_features must be in (0, n_features]

 -2.68856640e+15 -2.69206621e+15 -2.69624752e+15 -2.70487516e+15
 -2.68118779e+15 -2.688

{'max_depth': 50, 'max_features': 10, 'min_samples_leaf': 10}
20017     0.2
127292    0.0
15827     0.0
83173     8.5
117058    1.8
         ... 
32680     3.2
2934      2.4
9965      0.2
46704     0.0
56427     0.0
Name: ground_truth, Length: 43094, dtype: float64
[0.84615385 1.2        0.43333333 ... 1.09090909 2.70909091 1.68125   ]
1976667417824640.2


In [None]:
# Refit on the full labelled set (hyper-parameters chosen by grid search) and
# predict the test file; y_predict is consumed by the submission cell below.
y_predict = decision_tree(x_train, y_train, x_test, cross_validation=True, cv_type='gridsearch')
# Left disabled: no y_test is defined in the visible cells, so the score
# cannot be computed for the test file.
# print(mean_absolute_percentage_error(y_true=y_test, y_pred=y_predict))

80 fits failed out of a total of 400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 1320, in fit
    X_idx_sorted=X_idx_sorted,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 308, in fit
    raise ValueError("max_features must be in (0, n_features]")
ValueError: max_features must be in (0, n_features]

 -2.78879055e+15 -2.80561421e+15 -2.82518328e+15 -2.81991035e+15
 -2.78873787e+15 -2.781

In [None]:
print(y_predict)
# Build the submission in one go: ids from the test file plus the shifted predictions.
# NOTE(review): predictions are exported as y_predict + 1 - presumably what the
# submission format expects; confirm.
dataframe = pd.DataFrame({'Id': test_ids, 'Prediction': y_predict + 1})
print(dataframe['Prediction'])

dataframe.to_csv('submission_dt2.csv', index = False, header=True)
# Spot-check a single id in the exported frame.
print(dataframe[dataframe["Id"]=="14066001_149"])

[16.75        0.84        1.32222222 ...  0.72330383  1.54666667
  0.29333333]
0        17.750000
1         1.840000
2         2.322222
3         3.480000
4        12.018182
           ...    
96050     1.083887
96051     1.670000
96052     1.723304
96053     2.546667
96054     1.293333
Name: Prediction, Length: 96055, dtype: float64
              Id  Prediction
66  14066001_149       10.43


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import numpy as np
seed = 1998

def cross_validation_random_forest(x, y, cv_type, n_random_test=10):
    """
    Tune a RandomForestRegressor with a cross-validated hyper-parameter search.

    The search is fitted on the whole of (x, y) - no subsample is taken here -
    and candidates are ranked with the 'neg_mean_absolute_percentage_error'
    scorer.

    :param x: feature matrix; ``x.shape[1]`` is taken as the number of features
    :param y: target values aligned with ``x``
    :param cv_type: must be 'random' or 'gridsearch'
    :param n_random_test: number of parameter sets to try (random search only)
    :return: dict with the best 'n_estimators', 'max_depth', 'max_features'
             and 'min_samples_leaf'; a fixed default dict if ``cv_type`` is
             invalid
    """
    # Reject an unknown search type up front, before any estimator is built.
    if cv_type not in ('random', 'gridsearch'):
        print("Argument cv_type was incorrect, default parameters are returned")
        return {'n_estimators':100, 'max_depth' : 50, 'max_features' : None, 'min_samples_leaf' : 1}

    n_features = x.shape[1]
    # Integer max_features must lie in (0, n_features]. The square-root option
    # is requested through the 'sqrt' keyword: a raw float such as
    # np.sqrt(10) = 3.16 is read as a fraction > 1 and every fit using it fails.
    max_features_grid = [m for m in (4, 6, 8, 10) if m <= n_features] + ['sqrt']
    params_to_tune = {'n_estimators': [50, 100, 200],
                      'max_depth' : [5, 10, 20, 50],
                      'max_features' : max_features_grid,
                      'min_samples_leaf' : [10, 100, 500, 1000]
                      }

    rf = RandomForestRegressor(random_state=seed)

    if cv_type == 'random':
        cv = RandomizedSearchCV(rf, params_to_tune, n_iter=n_random_test, random_state=0,
                                scoring='neg_mean_absolute_percentage_error', verbose=2)
    else:
        cv = GridSearchCV(rf, params_to_tune, scoring='neg_mean_absolute_percentage_error', verbose=2)

    result = cv.fit(x, y)
    print(result.best_params_)
    return result.best_params_

def random_forest(x_train, y_train, x_test, cross_validation = True, cv_type='random', n_random_test=15):
    """
    Fit a RandomForestRegressor on the training data and predict ``x_test``.

    :param x_train: training features
    :param y_train: training targets
    :param x_test: features to predict on
    :param cross_validation: when True, hyper-parameters come from
                             ``cross_validation_random_forest``; otherwise a
                             fixed parameter set is used
    :param cv_type: forwarded to ``cross_validation_random_forest``
    :param n_random_test: forwarded to ``cross_validation_random_forest``
    :return: predictions of the fitted forest for ``x_test``
    """
    if not cross_validation:
        # Fixed fallback: 100 trees with depth capped at 50, nothing else constrained.
        params = {'n_estimators':100, 'max_depth' : 50, 'max_features' : None, 'min_samples_leaf' : 1}
    else:
        params = cross_validation_random_forest(x_train, y_train, cv_type, n_random_test)

    forest = RandomForestRegressor(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        max_features=params['max_features'],
        min_samples_leaf=params['min_samples_leaf'],
        random_state=seed,
    )
    # fit() returns the estimator itself, so the call can be chained.
    return forest.fit(x_train, y_train).predict(x_test)

In [None]:
# Tune the forest with a 15-candidate random search on the full labelled set,
# refit with the best parameters and predict the test file.
y_predict = random_forest(x_train, y_train, x_test, cross_validation=True, cv_type='random', n_random_test=15)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] END max_depth=10, max_features=3.1622776601683795, min_samples_leaf=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, max_features=3.1622776601683795, min_samples_leaf=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, max_features=3.1622776601683795, min_samples_leaf=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, max_features=3.1622776601683795, min_samples_leaf=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, max_features=3.1622776601683795, min_samples_leaf=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, max_features=4, min_samples_leaf=1000, n_estimators=200; total time=  50.6s
[CV] END max_depth=10, max_features=4, min_samples_leaf=1000, n_estimators=200; total time=  51.2s
[CV] END max_depth=10, max_features=4, min_samples_leaf=1000, n_estimators=200; total time=  50.1s
[CV] END max_depth=10, max_features=4, min_samples_leaf=1000, n_estimato

15 fits failed out of a total of 75.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 459, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/d

{'n_estimators': 50, 'min_samples_leaf': 10, 'max_features': 10, 'max_depth': 20}


In [None]:
print(y_predict)
# Build the submission in one go: ids from the test file plus the shifted predictions.
# NOTE(review): predictions are exported as y_predict + 1 - presumably what the
# submission format expects; confirm.
dataframe = pd.DataFrame({'Id': test_ids, 'Prediction': y_predict + 1})
print(dataframe['Prediction'])

dataframe.to_csv('submission_rd.csv', index = False, header=True)

[9.87213426 2.20271821 4.46689356 ... 1.2782421  2.46053906 0.34841677]
0        10.872134
1         3.202718
2         5.466894
3         3.473030
4        11.362482
           ...    
96050     1.847725
96051     2.920088
96052     2.278242
96053     3.460539
96054     1.348417
Name: Prediction, Length: 96055, dtype: float64
