## Random Forest Regression

In [2]:
# imports
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder


In [3]:
# Load the January 2019 flights + weather table.
# low_memory=False turns off pandas' chunked parsing of the file.
# Rows without an arr_delay value (the regression target) are discarded.
jan_2019 = (
    pd.read_csv('../Kyle_eda/data/01_2019_flights_weather.csv', low_memory=False)
    .dropna(subset=['arr_delay'])
)

In [4]:
# One-hot encode mkt_unique_carrier.
# The encoder takes a 2-D input, so the column is reshaped to (n_rows, 1).
carrier_values = jan_2019['mkt_unique_carrier'].values.reshape(-1, 1)
enc = OneHotEncoder()
carrier_onehot = enc.fit_transform(carrier_values)

# Densify the encoded matrix; the resulting columns are labelled 0..k-1 by position.
enc_df = pd.DataFrame(carrier_onehot.toarray())

# Give both frames a fresh 0..n-1 index so the column-wise concat lines rows up
# by position (jan_2019 had rows dropped earlier, leaving gaps in its index).
enc_df = enc_df.reset_index(drop=True)
jan_2019 = jan_2019.reset_index(drop=True)
jan_2019 = pd.concat([jan_2019, enc_df], axis=1)
# NOTE(review): the appended one-hot columns are not listed in cols_to_keep
# further down, so they do not reach the model -- confirm whether that is intended.

In [5]:
def _load_code_mapping(csv_path):
    """Read a lookup CSV (first column as index) and return the first record
    of its transpose as a plain dict, i.e. {index value: first data column value}.
    """
    lookup = pd.read_csv(csv_path, index_col=0)
    # Transposing turns each data column into one record; keep the first one.
    records = lookup.T.to_dict('records')
    return records[0]


# Mapping dictionaries used to encode the dest and origin columns
dest_dict = _load_code_mapping('../Kyle_eda/data/dest_dict.csv')
origin_dict = _load_code_mapping('../Kyle_eda/data/origin_dict.csv')

In [6]:
# Replace the dest / origin values with their entries from the lookup dicts.
# Series.map yields NaN for any value that has no key in its dict.
# NOTE(review): running this cell a second time would map the already-mapped
# values again -- presumably producing NaN; re-run from the load cell instead.
for column, mapping in (('dest', dest_dict), ('origin', origin_dict)):
    jan_2019[column] = jan_2019[column].map(mapping)


In [9]:
# Feature columns to keep for modelling.
# Flight-level fields come first ...
flight_cols = [
    'origin',
    'dest',
    'crs_elapsed_time',
    'distance',
    'crs_dep_time_hr',
    'crs_arr_time_hr',
]

# ... followed by the same ten weather readings, once for the origin airport
# and once for the destination airport (suffix _origin / _dest).
weather_metrics = [
    'precip',
    'windspeedKmph',
    'winddirDegree',
    'visibility',
    'DewPointC',
    'pressure',
    'cloudcover',
    'WindGustKmph',
    'humidity',
    'tempC',
]

cols_to_keep = flight_cols + [
    metric + '_' + airport
    for airport in ('origin', 'dest')
    for metric in weather_metrics
]

# Feature matrix (selected columns) and regression target (arr_delay)
X = jan_2019.loc[:, cols_to_keep]
y = jan_2019.loc[:, 'arr_delay']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Create the baseline model: 100 trees, each limited to depth 10.
# random_state pins the forest's internal randomness (bootstrap samples and
# per-split feature draws) so the fit, and the scores reported below, are the
# same on every run.
RFR_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

# Train the model
RFR_model.fit(X_train, y_train)

RandomForestRegressor(max_depth=10)

In [11]:
# Predict on both splits with the baseline model.
# training_preds is computed here but not used by this cell.
training_preds = RFR_model.predict(X_train)
y_pred = RFR_model.predict(X_test)

# Mean squared error on the held-out test set
test_mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean squared error: %.2f \n' % test_mse)


Mean squared error: 2415.83 



In [13]:

# Coefficient of determination (R^2) of the baseline predictions on the test set
test_r2 = r2_score(y_test, y_pred)
print('Coefficient of determination: %.2f \n' % test_r2)

Coefficient of determination: 0.14 



## Hyperparameter Tuning (Randomized Search)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each random-forest hyperparameter. The search cell
# below samples combinations from this dictionary.

# Number of trees in random forest: 100, 200, ..., 1000
estimators = list(range(100, 1001, 100))

# Number of features to consider at every split.
# 1.0 (a fraction) means "all features", which is what the string 'auto'
# selected for a regressor; 'auto' itself was deprecated in scikit-learn 1.1
# and is rejected by releases from 1.3 on, so the numeric form is used.
features = [1.0, 'sqrt']

# Maximum number of levels in tree: 10, 14, ..., 50, plus None (no limit)
depth = list(range(10, 51, 4))
depth.append(None)

# Minimum number of samples required to split a node
samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
samples_leaf = [1, 2, 4]

# Method of selecting samples: with (True) or without (False) bootstrapping
bootstrap_samples = [True, False]

# Create the random grid; keys are RandomForestRegressor parameter names
grid_search = {'n_estimators': estimators,
               'max_features': features,
               'max_depth': depth,
               'min_samples_split': samples_split,
               'min_samples_leaf': samples_leaf,
               'bootstrap': bootstrap_samples}
print(grid_search)

In [None]:
# Randomized search for the best hyperparameters

# Base model to tune; the seed makes each candidate's fit repeatable
rfr = RandomForestRegressor(random_state=42)

# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores.
# n_jobs=-1 is what actually enables the multi-core run (the default is a
# single job); random_state fixes which 100 combinations get sampled so the
# search result can be reproduced.
rf_random = RandomizedSearchCV(
    estimator=rfr,
    param_distributions=grid_search,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Test-set predictions from the tuned model.
# The search was built with the default refit=True, so best_estimator_ holds
# the winning configuration refit on all of X_train; rf_random.predict
# delegates to this same estimator.
y_pred = rf_random.best_estimator_.predict(X_test)

In [None]:
# Test-set metrics for the tuned model: RMSE (square root of the MSE) and R^2
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print('rmse: ', rmse)
print('r2_score:', r2)