In [15]:
# import libraries
import numpy as np
import pandas as pd
from plotnine import *
from scipy.stats import uniform
from scipy.stats import randint

# Modeling packages
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.inspection import partial_dependence
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

In [16]:
#Loading the dataset
df = pd.read_csv("data/stellar_eda.csv")

In [17]:
df['class'].value_counts()

GALAXY    47556
STAR      17274
QSO       15169
Name: class, dtype: int64

In [18]:
df['fiber_ID'] = df['fiber_ID'].astype(str)
df['spec_obj_ID'] = df['spec_obj_ID'].astype(str)
df['field_ID'] = df['field_ID'].astype(str)
df['rerun_ID'] = df['rerun_ID'].astype(str)
df['run_ID'] = df['run_ID'].astype(str)
df['obj_ID'] = df['obj_ID'].astype(str)
df['plate'] = df['plate'].astype(str)

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79999 entries, 0 to 79998
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   obj_ID       79999 non-null  object 
 1   alpha        79999 non-null  float64
 2   delta        79999 non-null  float64
 3   u            79999 non-null  float64
 4   g            79999 non-null  float64
 5   r            79999 non-null  float64
 6   i            79999 non-null  float64
 7   z            79999 non-null  float64
 8   run_ID       79999 non-null  object 
 9   rerun_ID     79999 non-null  object 
 10  cam_col      79999 non-null  int64  
 11  field_ID     79999 non-null  object 
 12  spec_obj_ID  79999 non-null  object 
 13  class        79999 non-null  object 
 14  redshift     79999 non-null  float64
 15  plate        79999 non-null  object 
 16  MJD          79999 non-null  int64  
 17  fiber_ID     79999 non-null  object 
dtypes: float64(8), int64(2), object(8)
memory usag

Preparing the Random Forest Model

In [20]:
#handling categorical features
#https://stackoverflow.com/questions/24715230/can-sklearn-random-forest-directly-handle-categorical-features

le=preprocessing.LabelEncoder()
df['class']=le.fit_transform(df['class']).astype('str')

In [21]:
#List of all numarical columns that are going to be used in our modelling process
column_list = ['u','g','r','i','z','redshift', 'alpha', 'delta', 'MJD']

In [22]:
#defning the input and the output
x = df[column_list]
y = df["class"]

In [23]:
# Split dataset into training set and test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 123)

In [24]:
# Numarical Inputs Standardization
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [25]:
# Create Random Forest Estimator 
rf = RandomForestRegressor()

In [26]:
# Train The Random Forest
rf.fit(x_train,y_train)

RandomForestRegressor()

In [27]:
# Predict the response for test dataset
y_predit = rf.predict(x_test)
y_predit

array([0.  , 0.06, 0.  , ..., 0.  , 0.  , 2.  ])

In [28]:
# Model Accuracy Score Before tuning
score = rf.score(x_test, y_test)
score

0.9683261765813398

Performing Random Hyperparameter Grid

In [306]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

In [307]:
# Number of features to consider at every split
max_features = ['auto', 'sqrt']

In [308]:
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

In [309]:
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

In [310]:
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

In [311]:
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [312]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)


In [315]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)


In [316]:
# Fit the random search model
rf_random.fit(x_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time= 1.2min
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time= 1.2min
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time= 1.2min
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time= 4.7min
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time= 4.3min
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time= 4.5min
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000;

RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [317]:
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 70,
 'bootstrap': True}

In [350]:
# Create Random Forest Estimator, specifying the hyperparameters after Random Search
rf1 = RandomForestRegressor(n_estimators = 400, random_state = 42)

In [351]:
# Train Random Forest
rf1.fit(x_train,y_train)

RandomForestRegressor(n_estimators=400, random_state=42)

In [352]:
#Predict the response for test dataset
y_predit1 = rf1.predict(x_test)
y_predit1

array([0.    , 0.0575, 0.0025, ..., 0.    , 0.    , 2.    ])

In [354]:
#Accuracy score after tuning 
score1 = rf1.score(x_test, y_test)
score1

0.9684858275773417

In [5]:
#Loading the test dataset
df1 = pd.read_csv("/Users/rawanalmobarak/Desktop/Misk_DSI/Stellar/stel/data/test_set/stellar_test.csv")

In [7]:
#List of all numarical columns that are going to be used in our modelling process
column_list = ['u','g','r','i','z','redshift', 'alpha', 'delta', 'MJD']
x1 = df1[column_list]
y1 = df1["class"]

In [13]:
# Numarical Inputs Standardization
sc = StandardScaler()
x1 = sc.fit_transform(x1)




In [29]:
#Predict the test set
y_final = rf1.predict(x1)
y_final

array([0.  , 0.  , 0.1 , ..., 1.  , 0.11, 0.94])

In [30]:
score_final = rf1.score(x_test, y_test)
score_final

0.9683261765813398