In [47]:
import pandas as pd
import numpy as np
from functools import reduce
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import Lasso, ElasticNet, LinearRegression, Ridge
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE, SelectKBest, f_regression, chi2
from sklearn import decomposition
from numpy.random import lognormal
import matplotlib.pyplot as plt
from scipy.stats import lognorm
import sklearn
import seaborn as sns
import csv
import shap
import random
import pickle
import os
from textwrap import wrap
from pathlib import Path

# Notebook-wide configuration.
# Working directory as a plain string; the data file is located relative to it.
cwd = os.getcwd()

# Display limits for DataFrame output.
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 100)
# Ask pandas to treat +/-inf as missing values.
# NOTE(review): 'use_inf_as_na' appears to be deprecated in recent pandas releases --
# confirm against the installed version before upgrading.
pd.set_option('use_inf_as_na', True)
# Silence NumPy's divide-by-zero and invalid-operation floating-point warnings.
np.seterr(invalid='ignore', divide='ignore')

# Reproducible pool of seeds: the integers 1..999 shuffled by a generator that is
# itself seeded with 1337, so every run sees the same ordering. A repetition picks
# its seed by indexing into this list.
seed_numbers = [*range(1, 1000)]
random.Random(1337).shuffle(seed_numbers)

# Reminder of the preprocessing assumed to have been applied to the data set.
print(
    "correlations are removed (80%+ Pearson Correlated Features Removed) \n"
    "and the instances are stratified to ensure each cross-validation fold \n"
    "has an even distribution of the predictor \n"
    "prior to the use of the model"
)

correlations are removed (80%+ Pearson Correlated Features Removed) 
and the instances are stratified to ensure each cross-validation fold 
has an even distribution of the predictor 
prior to the use of the model


In [48]:
# Load the literature data set from the CSV file in the working directory.
data_set_file_path = os.path.join(cwd, 'subset_1_all_literature_high_cardinal_removed_ml_data_set.csv')
data_set = pd.read_csv(data_set_file_path)

# Remove all experiments with anomalous Mg values, identified by paper number.
# read_csv infers a numeric dtype for an all-numeric 'Paper Number' column; in that
# case the string labels alone match nothing and the filter silently keeps every row.
# Matching on the raw labels AND on their numeric value drops the rows whether the
# column was parsed as text, int or float.
anomalous_papers = ['8', '46', '48', '59', '71', '85', '96', '98']
paper_number = data_set['Paper Number']
is_anomalous = (
    paper_number.isin(anomalous_papers)
    | pd.to_numeric(paper_number, errors='coerce').isin([int(p) for p in anomalous_papers])
)
data_set = data_set[~is_anomalous]

# Empty frame created here and read later when the feature column lists are derived.
# NOTE(review): nothing in the visible cells populates it, so those column lists
# will be empty unless it is filled elsewhere -- confirm.
new_data_set = pd.DataFrame()

In [49]:
# Run configuration: `rep` indexes into the shuffled seed list, so this repetition
# uses the entry at position 1 (the second seed) for every stochastic estimator.
rep = 1
number_of_splits = 3
seed_number = seed_numbers[rep]

experiment_name = "Extra_Trees_RFE_3CV_Stratified_Baseline"
scoring = "r2"

# Final regressor of the pipeline.
ETR = ExtraTreesRegressor(n_jobs=-1, random_state=seed_number)

# Hyper-parameter grid for the search. The '<step>__<param>' keys address the
# 'estimator' (Extra Trees) and 'rfe' (recursive feature elimination) pipeline steps.
parameters = {
    'estimator__n_estimators': [10, 100, 200, 500],
    'estimator__max_depth': [None, 1, 2, 3, 4, 5],
    "rfe__n_features_to_select": [1, 5, 10, 20, 25],
}

# Recursive feature elimination ranked by its own Extra Trees model, kept separate
# from the final regressor. A step between 0 and 1 is the fraction of features
# removed per iteration.
estimator = ExtraTreesRegressor(n_jobs=-1, random_state=seed_number)
recurse = RFE(estimator=estimator, step=0.1)

# Preprocessing sub-pipelines.
# Numeric columns: fill missing values with the column median, then standardise.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed (AttributeError);
# on older NumPy both names refer to the same value, so behaviour is unchanged.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median', missing_values=np.nan)),
    ('scaler', StandardScaler())
])
# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values=np.nan)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Stored lists of the numeric and categorical columns using the pandas dtype method.
# NOTE(review): new_data_set is created as an empty DataFrame in the visible cells;
# if it is not populated before this point both lists are empty -- confirm.
numeric_features = new_data_set.select_dtypes(include=['int64', 'float64']).columns
categorical_features = new_data_set.select_dtypes(include=['object']).columns

# Column transformer: route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Full model: preprocessing -> recursive feature elimination -> Extra Trees regressor.
# The step names 'rfe' and 'estimator' are the prefixes used in the parameter grid.
rf = Pipeline(steps=[('preprocessor', preprocessor),
                     ('rfe', recurse),
                     ('estimator', ETR)])

# Grid search over `parameters`, scored with `scoring`, on shuffled K-fold splits.
# NOTE(review): KFold is not itself stratified and uses a fixed random_state=1 rather
# than seed_number -- confirm stratification is handled when the data set is built.
inner_cv = KFold(n_splits=number_of_splits, shuffle=True, random_state=1)
est_used = GridSearchCV(estimator=rf, param_grid=parameters, cv=inner_cv, scoring=scoring)

In [50]:
# Display the configured GridSearchCV object so its rendered output can be captured.
est_used

# NOTE: take a screenshot of the rendered output, then edit the image so that the
# GridSearchCV box states estimator = ExtraTreesRegressor and cv = KFold
# (cut and paste these from the code above).