In [224]:
# Helper packages
import numpy as np
import pandas as pd
from plotnine import *
from scipy.stats import uniform
from scipy.stats import randint

# Modeling packages
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# from category_encoders.ordinal import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.inspection import partial_dependence
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder


In [225]:
df = pd.read_csv("data/stellar_eda.csv")

In [226]:
df['class'].value_counts()

GALAXY    47556
STAR      17274
QSO       15169
Name: class, dtype: int64

In [227]:
#recode response variable to 0/1/2
df = df.replace({"class": {"STAR": 0, "GALAXY": 1, "QSO": 2}})

In [228]:
# create train/test split
# train, test = train_test_split(df, train_size=0.7, random_state=123)

x = df.drop(['class'], axis = 1)
y = df.loc[:,'class'].values

In [229]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 123)

In [230]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [231]:
# create random forest estimator
rf = RandomForestRegressor(random_state = 42)

# rf_mod = RandomForestRegressor(n_estimators=1000)

In [232]:
print('Parameters currently in use:\n')
print(rf.get_params())

Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


In [233]:
rf.fit(x_train,y_train)

RandomForestRegressor(random_state=42)

In [234]:
y_predit = rf.predict(x_test)
y_predit

array([1.  , 1.05, 1.  , ..., 1.01, 1.  , 0.  ])

In [235]:
score = rf.score(x_test, y_test)
score

0.9560496201992569

In [236]:
# create random forest estimator
rf_mod = RandomForestRegressor()



In [237]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

In [238]:
# Number of features to consider at every split
max_features = ['auto', 'sqrt']

In [239]:
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

In [240]:
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

In [241]:
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

In [242]:
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [243]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [244]:
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [245]:
rf_tun = RandomForestRegressor()

In [246]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf_tun, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)


In [None]:
# Fit the random search model
rf_random.fit(x_train,y_train)

In [None]:
rf_random.best_params_