In [1]:
#%pip install lightgbm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV




X_train= pd.read_csv("../train-test/X_train_medium.csv")
X_test= pd.read_csv("../train-test/X_test_medium.csv")
y_train= pd.read_csv("../train-test/y_train.csv")
y_test= pd.read_csv("../train-test/y_test.csv")


In [2]:
# Number of trees in random forest
n_estimators = range(10,2000,50)
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = range(10,500,10)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,30,40]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


{'n_estimators': range(10, 2000, 50), 'max_features': ['auto', 'sqrt'], 'max_depth': range(10, 500, 10), 'min_samples_split': [2, 5, 10, 30, 40], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [3]:

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train.values.ravel())
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'n_estimators': 360,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 430,
 'bootstrap': False}

In [4]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.\n'.format(accuracy))
    
    return accuracy

base_model = RandomForestRegressor(n_estimators = 10, n_jobs=-1)
base_model.fit(X_train, y_train)
print(base_model.score(X_test, y_test.values.ravel()))
base_accuracy = evaluate(base_model, X_test, y_test.values.ravel())

best_random = rf_random.best_estimator_
print(best_random.score(X_test, y_test.values.ravel()))
random_accuracy = evaluate(best_random, X_test, y_test.values.ravel())

print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))


  base_model.fit(X_train, y_train)


0.874543157712729
Model Performance
Average Error: 0.1008 degrees.
Accuracy = 99.16%.

0.9048140403543766
Model Performance
Average Error: 0.0867 degrees.
Accuracy = 99.28%.

Improvement of 0.12%.


In [5]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [20,30,80, 90, 100],
    'max_features': [2, 3,15,30,50,138],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [2,5,8, 10],
    'n_estimators': [800, 900, 1000, 1300]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train.values.ravel())
print((grid_search.best_params_))

best_grid = grid_search.best_estimator_
print(best_grid.score(X_test, y_test.values.ravel()))

grid_accuracy = evaluate(best_grid, X_test, y_test.values.ravel())

print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))



Fitting 3 folds for each of 1440 candidates, totalling 4320 fits
{'bootstrap': True, 'max_depth': 30, 'max_features': 15, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 1000}
0.902046463059125
Model Performance
Average Error: 0.0885 degrees.
Accuracy = 99.26%.

Improvement of 0.10%.


Fitting 3 folds for each of 1440 candidates, totalling 4320 fits

{'bootstrap': True, 'max_depth': 20, 'max_features': 15, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 1000}

0.902342771165578

Model Performance
Average Error: 0.0880 degrees.
Accuracy = 99.26%.

Improvement of 0.11%.

In [6]:
from ipynb.fs.defs.data_analysis import save_regressor
rf= RandomForestRegressor(bootstrap = True, max_depth = 80, max_features = 15, min_samples_leaf = 3, min_samples_split = 8, n_estimators = 1300)
save_regressor(rf)

In [7]:
from ipynb.fs.defs.data_analysis import model_analysis
data = model_analysis(rf)

enter
                       Score      Variance           MSE
name                                                    
DecisionTreeRegressor  0.842  5.754077e+09  1.187411e+09
KNeighborsRegressor    0.817  5.671486e+09  1.402773e+09
LGBMRegressor          0.883  5.837807e+09  8.596156e+08
LinearRegression       0.846  5.482299e+09  1.372148e+09
RandomForestRegressor  0.891  5.651420e+09  8.100146e+08
                       Score      Variance           MSE
name                                                    
DecisionTreeRegressor  0.830  5.920922e+09  1.283516e+09
KNeighborsRegressor    0.291  4.999458e+09  4.721170e+09
LGBMRegressor          0.902  5.810056e+09  7.237563e+08
LinearRegression       0.846  5.481724e+09  1.456663e+09
RandomForestRegressor  0.902  5.527662e+09  7.257831e+08
                       Score      Variance           MSE
name                                                    
DecisionTreeRegressor  0.822  5.812806e+09  1.286819e+09
KNeighborsRegressor    0.

In [8]:
data