## Data file:
 * https://raw.githubusercontent.com/vjavaly/Baruch-CIS-STA-3920/main/data/temps_extended.csv

In [1]:
from datetime import datetime

# Portable spelling of "%D %T" (month/day/2-digit-year, then 24-hour time).
# The short forms are not part of the directive set Python documents as
# available everywhere, so they can fail on some platforms (e.g. Windows).
RUN_TIME_FORMAT = "%m/%d/%y %H:%M:%S"
print(f'Run time: {datetime.now().strftime(RUN_TIME_FORMAT)}')

Run time: 12/01/23 02:55:30


### Import libraries

In [2]:
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

### Function to evaluate model accuracy


In [3]:
def evaluate(model, model_string, test_features, test_labels):
    """Print and return a model's accuracy, defined as 100 minus its MAPE.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict(test_features)``.
    model_string : str
        Label printed alongside the result.
    test_features
        Feature set handed to ``model.predict``.
    test_labels
        True target values, aligned element-wise with the predictions.

    Returns
    -------
    float
        ``100 - MAPE``, where MAPE is the mean absolute percentage error
        expressed in percent.

    Notes
    -----
    A label equal to zero has no defined percentage error; such entries
    produce ``inf``/``nan`` in the result, as in any MAPE computation.
    """
    predictions = model.predict(test_features)
    errors = np.abs(predictions - test_labels)
    # Divide by the magnitude of the label: with a signed denominator,
    # negative targets yield negative "percentage errors" that cancel out
    # genuine errors and inflate the reported accuracy.
    mape = 100 * np.mean(errors / np.abs(test_labels))
    accuracy = 100 - mape
    print('Model Performance: {}'.format(model_string))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

## Part 1

### Load data

In [4]:
# Read the temperature dataset directly from the course GitHub repository.
data_url = "https://raw.githubusercontent.com/vjavaly/Baruch-CIS-STA-3920/main/data/temps_extended.csv"
df = pd.read_csv(data_url)

### Examine data

In [5]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0,year,month,day,weekday,ws_1,prcp_1,snwd_1,temp_2,temp_1,average,actual,friend
0,2011,1,1,Sat,4.92,0.0,0,36,37,45.6,40,40
1,2011,1,2,Sun,5.37,0.0,0,37,40,45.7,39,50
2,2011,1,3,Mon,6.26,0.0,0,40,39,45.8,42,42
3,2011,1,4,Tues,5.59,0.0,0,39,42,45.9,38,59
4,2011,1,5,Wed,3.8,0.03,0,42,38,46.0,45,39


In [6]:
# Shape of the data as a (rows, columns) tuple
df.shape

(2191, 12)

### Clean up data

In [7]:
# Drop unnecessary columns: year, month, day, weekday.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: executing it a second time is a no-op
# rather than a KeyError for columns that are already gone.
df = df.drop(columns=['year', 'month', 'day', 'weekday'], errors='ignore')

In [8]:
# Preview the first five rows after the column clean-up.
df.head(n=5)

Unnamed: 0,ws_1,prcp_1,snwd_1,temp_2,temp_1,average,actual,friend
0,4.92,0.0,0,36,37,45.6,40,40
1,5.37,0.0,0,37,40,45.7,39,50
2,6.26,0.0,0,40,39,45.8,42,42
3,5.59,0.0,0,39,42,45.9,38,59
4,3.8,0.03,0,42,38,46.0,45,39


### Separate independent variables and dependent variable
* Independent variables: all remaining variables except 'actual'
* Dependent variable: 'actual'

In [9]:
# Target is the 'actual' column; features are every other remaining column.
y = df["actual"]
X = df.drop(columns=["actual"])

### Split into training and test sets

In [10]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Instantiate the RandomForestRegressor model

In [11]:
# Baseline forest: every hyperparameter left at its default; only the seed is
# pinned so that repeated runs build the same trees.
rf = RandomForestRegressor(random_state=42)

### Print RandomForestRegressor default hyperparameters

In [12]:
# List every hyperparameter of the baseline model with its current value.
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

### Fit RandomForestRegressor model using the default hyperparameters

In [13]:
# Train the baseline model on the training split only.
rf.fit(X_train, y_train)

### Print accuracy for RandomForestRegressor model using the default hyperparameters
#### NOTE: Use "evaluate" function defined at top of this notebook.
For example, assuming the following variable values:
* model = rf
* model_string = 'using default hyperparameters'
* test_features = X_test
* test_labels = y_test

the call is: `rfr_base_accuracy = evaluate(rf, 'using default hyperparameters', X_test, y_test)`

In [14]:
# Baseline score on the held-out test split, kept for later comparison.
default_accuracy = evaluate(
    rf,
    'Using default hyperparameters',
    X_test,
    y_test,
)

Model Performance: Using default hyperparameters
Accuracy = 93.58%.


## Part 2

### NOTE: The objective of the hyperparameter search is to improve model performance above the default hyperparameters

### Prepare variables for hyperparameter search
* Using the [sklearn.ensemble.RandomForestRegressor documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html), choose at least 3 hyperparameters for the random search
* For each hyperparameter selected, set up an array of values
  * For example: max_features = ['log2', 'sqrt']

In [15]:
# Candidate values for each hyperparameter explored by the search.
n_estimators = [100, 200, 300, 500]  # number of trees in the forest
max_depth = [10, 20, 30, None]  # None is the estimator's default (see get_params output)
max_features = ['sqrt', 'log2']  # rule for how many features each split considers

### Create the hyperparameter grid for the random search
Use the variables prepared above

In [16]:
# Build the search space from the candidate lists defined in the previous
# code cell, so each value is declared in exactly one place and the grid
# cannot drift out of sync with those variables.
param_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'max_features': max_features,
}

### Print the hyperparameter grid for the random search

In [17]:
# Display the search space as the cell's output.
param_grid

{'n_estimators': [100, 200, 300, 500],
 'max_depth': [10, 20, 30, None],
 'max_features': ['sqrt', 'log2']}

### Set up random search with k-fold cross validation using the hyperparameter grid

In [18]:
# Randomized hyperparameter search scored with 5-fold cross-validation.
# n_jobs=-1 runs the candidate fits on all available CPU cores.
rf2 = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=32,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

### Fit the random search model
Be patient, this might take a minute or longer

In [19]:
# Run the search on the training split only; the test split stays untouched
# until the final evaluation.
rf2.fit(X_train, y_train)

### Print the best hyperparameters found by the random search

In [20]:
# Hyperparameter combination selected by the search.
best_hyperparameters = rf2.best_params_
print(best_hyperparameters)

{'n_estimators': 500, 'max_features': 'sqrt', 'max_depth': 10}


### Print best random search model accuracy
#### NOTE: Use "evaluate" function defined at top of this notebook.

In [21]:
# Score the search's best estimator on the held-out test split, then show
# the unrounded accuracy value as well.
best_rf = rf2.best_estimator_
best_rf_accuracy = evaluate(
    best_rf,
    'Best Random Search Model',
    X_test,
    y_test,
)
print(best_rf_accuracy)

Model Performance: Best Random Search Model
Accuracy = 93.71%.
93.7137279027883
