In [None]:
!pip install numpy scikit-learn matplotlib lightgbm pandas seaborn catboost dill streamlit

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.multioutput import MultiOutputRegressor           # https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputRegressor.html
from sklearn.ensemble import ExtraTreesRegressor               # https://scikit-learn.org/stable/modules/generated/sklearn.tree.ExtraTreeRegressor.html
from sklearn.ensemble import RandomForestRegressor   
#from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.neural_network import MLPRegressor



from matplotlib import style
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
matplotlib.rcParams.update({'font.size': 20})

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from pathlib import Path

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Hyperparameters and RMSE function for estimators

In [2]:
def rmse(y_test, y_pred):
    return np.sqrt(np.average((y_test - y_pred) ** 2, axis=0))

#def split_scale_data(X, Y, test_size=0.2, random_state=42):
    #X = features.values

    #X_train, X_test, Y_train, Y_test = train_test_split(X, test_size=test_size, random_state=random_state)

    # Scale and transform the data since models (like SVR) can be sensitive to disparate data values
    #scaler = StandardScaler()

    # Fit on the training data and transform test data based on this
    # See https://datascience.stackexchange.com/questions/38395/standardscaler-before-and-after-splitting-data
    #X_train = scaler.fit_transform(X_train)
    #X_test = scaler.transform (X_test)

    #return X_train, X_test, Y_train, Y

params = {'extra_trees': {'n_estimators': 1000,
                          'max_features': "sqrt",     # Out of 20000
                          'max_depth': 5,
                          'random_state': 42
                          },
            'xgboost': {'max_depth': 5,
                        'n_estimators': 1000,
                        'learning_rate': 0.03,
                        'random_state': 42
                        },
            'lightgbm': {'task': 'train', 
                         'boosting': 'gbdt',
                         'objective': 'regression',
                         'metric': 'rmse',
                         'num_boost_round':300,
                         'learning_rate': 0.05,
                         'max_depth': 15,
                         'num_leaves': 32,
                         'min_data_in_leaf': 200
                         },
            'catboost': {'learning_rate': 0.3, 
                      'depth': 6, 
                      'l2_leaf_reg': 3, 
                      'loss_function': 'MultiRMSE', 
                      'eval_metric': 'MultiRMSE', 
                      'task_type': 'CPU', 
                      'iterations': 150,
                      'od_type': 'Iter', 
                      'boosting_type': 'Plain', 
                      'bootstrap_type': 'Bernoulli', 
                      'allow_const_label': True,
                      },
          'mlp': {'learning_rate': 'adaptive',
                  'random_state': 42,
                  'early_stopping': True,
                  'max_iter': 500,
                  'verbose': 1,
                  'hidden_layer_sizes': (100, 50, 50)
                 },
          'gbsk':{},
          'histgbsk': {},
          'random_forest': {}
}

ESTIMATORS = {
    'random_forest': RandomForestRegressor(**params["random_forest"]),
    'extra_trees': ExtraTreesRegressor(**params["extra_trees"]),
    #'xgboost': MultiOutputRegressor(XGBRegressor(**params["xgboost"])),
    # 'lightgbm': MultiOutputRegressor(LGBMRegressor(**params["lightgbm"])),
    'catboost': CatBoostRegressor(**params["catboost"]),
    'mlp': MLPRegressor(**params["mlp"]),
    'gbsk': MultiOutputRegressor(GradientBoostingRegressor(**params["gbsk"])),
    'histgbsk': MultiOutputRegressor(HistGradientBoostingRegressor(**params["histgbsk"])),
}

Loading files

In [3]:
main_path = Path().absolute()

X = np.load(main_path / "Data" / "Clean" / "X.npy")
Y = np.load(main_path / "Data" / "Clean" / "Y.npy")

Shape check

In [4]:
print(f"The shape of the input date is: {X.shape}") # this should (400, 2)
print(f"The shape of the input date is: {Y.shape}") # this should be (400, 10201)

The shape of the input date is: (400, 2)
The shape of the input date is: (400, 10201)


Loop for testing multiple estimators at once

In [5]:
for name, estimator in ESTIMATORS.items():
   print("Running with %s"%name)
   X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=42)
   estimator.fit(X_train, Y_train)                    # fit() with instantiated object
   Y_predicted = estimator.predict(X_test)   # Make predictions and save it in dict under key: name
   Y_rmse = rmse(Y_test, Y_predicted)
   print(f"The R2 score for {name} is: {estimator.score(X_test, Y_test)}")

Running with catboost
0:	learn: 22.2219882	total: 5.86s	remaining: 14m 33s
1:	learn: 20.8891676	total: 11.3s	remaining: 13m 53s
2:	learn: 19.8811559	total: 16.4s	remaining: 13m 24s
3:	learn: 19.2080982	total: 21.6s	remaining: 13m 8s
4:	learn: 18.6959494	total: 26.6s	remaining: 12m 52s
5:	learn: 18.2931266	total: 31.9s	remaining: 12m 44s
6:	learn: 17.9654196	total: 37.1s	remaining: 12m 38s
7:	learn: 17.7268626	total: 42.2s	remaining: 12m 29s
8:	learn: 17.6087325	total: 47.4s	remaining: 12m 22s
9:	learn: 17.3884549	total: 52.8s	remaining: 12m 18s
10:	learn: 17.2463997	total: 57.9s	remaining: 12m 11s
11:	learn: 17.1250769	total: 1m 3s	remaining: 12m 4s
12:	learn: 16.9717893	total: 1m 8s	remaining: 11m 59s
13:	learn: 16.7971680	total: 1m 13s	remaining: 11m 54s
14:	learn: 16.6937506	total: 1m 18s	remaining: 11m 45s
15:	learn: 16.5397389	total: 1m 23s	remaining: 11m 37s
16:	learn: 16.4244702	total: 1m 28s	remaining: 11m 30s
17:	learn: 16.2888457	total: 1m 33s	remaining: 11m 23s
18:	learn: 16

KeyboardInterrupt: 

Testing 

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=42)
# estimator = MultiOutputRegressor(GradientBoostingRegressor(**params["gbsk"]))
estimator = MultiOutputRegressor(RandomForestRegressor(**params["random_forest"]))
estimator.fit(X_train, Y_train)                    # fit() with instantiated object
Y_predicted = estimator.predict(X_test)   # Make predictions and save it in dict under key: name
Y_rmse = rmse(Y_test, Y_predicted)
print(f"The R2 score for the GradientBoostingRegressor is: {estimator.score(X_test, Y_test)}")

The R2 score for the GradientBoostingRegressor is: 0.4293969612845138


In [ ]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=42)
# estimator = MultiOutputRegressor(GradientBoostingRegressor(**params["gbsk"]))
estimator = RandomForestRegressor(**params["random_forest"])
estimator.fit(X_train, Y_train)                    # fit() with instantiated object
Y_predicted = estimator.predict(X_test)   # Make predictions and save it in dict under key: name
Y_rmse = rmse(Y_test, Y_predicted)
print(f"The R2 score for the GradientBoostingRegressor is: {estimator.score(X_test, Y_test)}")

Finding new energy based on position

In [None]:
new_position = 1.5, 0.5
new_Energy = estimator.predict(new_position)
np.savetxt("new_E.csv", new_Energy, delimiter=",")

Writing model

In [None]:
import dill
dill.dump(estimator, open(main_path / 'model.dill', 'wb'))


Importing dilled model

In [None]:
def load_models(model_path):
    print(f"Loading model")
    mlmodel_path = model_path / "model.dill"
    with open(mlmodel_path, 'rb') as f:
        model = dill.load(f)
    return model