# Importing libraries

In [2]:
import numpy as np
import pandas as pd

# Setting random seed to get reproducible runs
RSEED = 100

# Importing dataset

This section has been taken from Angelo's notebook demo_svr.ipynb

In [3]:
# Industry records per county: read the CSV and discard its 'Column' field.
input_df = pd.read_csv("modifiedIndustriesPerCountyGA-2012.csv").drop(columns=['Column'])

# Incidence rates: keep only the three fields used further down.
# ('trend_last_5' is left out because it is missing data in 10 counties.)
output_df = pd.read_csv("incidencerates.csv")[['locale', 'fips', 'annual_count_avg']]

In [4]:
# Preview the first rows of the industry table
input_df.head()

Unnamed: 0,GEO_TTL,COUNTY,YEAR,NAICS2012,NAICS2012_TTL,ESTAB,EMP,PAYANN,state,county
0,"Appling County, Georgia",1,2012,113,Forestry and logging,11,61,2256,13,1
1,"Appling County, Georgia",1,2012,115,Support activities for agriculture and forestry,1,0,0,13,1
2,"Appling County, Georgia",1,2012,221,Utilities,7,0,0,13,1
3,"Appling County, Georgia",1,2012,236,Construction of buildings,12,113,2572,13,1
4,"Appling County, Georgia",1,2012,237,Heavy and civil engineering construction,2,0,0,13,1


In [5]:
# Preview the first rows of the incidence table
output_df.head()

Unnamed: 0,locale,fips,annual_count_avg
0,"Pickens County(7,8)",227,249
1,"Towns County(7,8)",281,116
2,"Union County(7,8)",291,226
3,"Lee County(7,8)",177,154
4,"Turner County(7,8)",287,56


# Dataset processing

This section has been taken from Angelo's notebook demo_svr.ipynb

In [6]:
# Column template for the per-county table: one zero-valued entry per distinct
# NAICS code, in order of first appearance in input_df.
industry_keys = dict.fromkeys(input_df['NAICS2012'], 0)

# Two extra entries follow the industry codes: the list of county 'fips'
# values from output_df, and a zero placeholder for the target.
industry_keys['counties'] = list(output_df['fips'])
industry_keys['annual_count_avg'] = 0

In [9]:
#industry_keys

In [7]:
# Per-county table: one row per county code (the index), one zero-initialised
# column per industry key, plus the 'counties' and 'annual_count_avg' columns.
# The index is already the county list, so the former `df.set_index('counties')`
# call (whose return value was discarded, making it a no-op) has been dropped;
# the 'counties' column must stay in place for the later `:'counties'` slice.
df = pd.DataFrame(industry_keys, index=industry_keys['counties'])

# Write the establishment count for every (county, industry) record.
# Zipping the three columns avoids building a Series per row as iterrows() does.
for county, naics, estab in zip(input_df['COUNTY'],
                                input_df['NAICS2012'],
                                input_df['ESTAB']):
    df.loc[county, naics] = estab

# Write the target value for every county.
for fips, annual_count in zip(output_df['fips'], output_df['annual_count_avg']):
    df.loc[fips, 'annual_count_avg'] = annual_count

In [10]:
# Preview the first rows of the assembled per-county table
df.head()

Unnamed: 0,113,115,221,236,237,238,321,325,327,331,...,525,533,114,483,316,487,211,521,counties,annual_count_avg
227,4,1,3,18,10,41,2,1,4,0,...,0,1,0,0,0,0,0,0,227,249
281,0,0,1,10,4,23,0,0,1,0,...,0,0,0,0,0,0,0,0,281,116
291,1,1,3,23,9,29,2,0,1,0,...,0,0,0,0,0,0,0,0,291,226
177,1,4,1,8,3,47,3,0,0,0,...,0,0,1,0,0,0,0,0,177,154
287,1,1,1,1,1,3,2,1,2,0,...,0,0,0,0,0,0,0,0,287,56


# Splitting Dataset into Training and Testing Sets

This section has partly been taken from Angelo's notebook demo_svr.ipynb

In [13]:
from sklearn.model_selection import train_test_split

# Target: the 'annual_count_avg' column.
y = df['annual_count_avg']

# Features: every column up to and including 'counties' (label slices with
# .loc include the end label).
# NOTE(review): this keeps the county id column in X -- confirm that is intended.
X = df.loc[:, :'counties']

# Hold out 30% of the rows; the fixed seed makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=RSEED)
X_train, X_test, y_train, y_test = split_parts


In [14]:
# (rows, columns) of the training features
X_train.shape

(111, 87)

In [15]:
# (rows, columns) of the test features
X_test.shape

(48, 87)

# Creating a Random Forest Regression model

This section is partly inspired by:
https://github.com/WillKoehrsen/Machine-Learning-Projects/blob/master/Random%20Forest%20Tutorial.ipynb

In [28]:
from sklearn.ensemble import RandomForestRegressor

# Create the model with 100 trees (n_estimators) and a fixed seed (RSEED)
regressor = RandomForestRegressor(n_estimators = 100,
                                  random_state = RSEED)

# Fit on training data
regressor.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=100, verbose=0, warm_start=False)

Now, let's look at the average number of nodes and the average maximum depth

In [29]:
# Node count and maximum depth of every fitted tree in the forest.
n_nodes = [estimator.tree_.node_count for estimator in regressor.estimators_]
max_depths = [estimator.tree_.max_depth for estimator in regressor.estimators_]

# Report the means, truncated to whole numbers.
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

Average number of nodes 133
Average maximum depth 10


# Evaluating the regressor

This section is partly inspired by:
https://github.com/WillKoehrsen/Machine-Learning-Projects/blob/master/Random%20Forest%20Tutorial.ipynb

In [39]:
from sklearn import metrics


def _regression_scores(y_true, y_pred):
    """Compute the evaluation metrics for one set of predictions.

    Parameters
    ----------
    y_true : observed target values.
    y_pred : predictions, in the same order as ``y_true``.

    Returns
    -------
    dict
        Keys 'MAE', 'RMSE', 'MAPE' (in percent), 'accuracy'
        (defined here as ``100 - MAPE``) and 'r2'.

    Notes
    -----
    MAPE divides by ``abs(y_true)``, so it is not finite when any observed
    value is zero.
    """
    mape = 100 * np.mean(abs(y_pred - y_true) / abs(y_true))
    return {
        'MAE': metrics.mean_absolute_error(y_true, y_pred),
        'RMSE': np.sqrt(metrics.mean_squared_error(y_true, y_pred)),
        'MAPE': mape,
        'accuracy': 100 - mape,
        'r2': metrics.r2_score(y_true, y_pred),
    }


# Predict on both splits with the fitted forest.
train_rf_predictions = regressor.predict(X_train)
rf_predictions = regressor.predict(X_test)

# One helper call per split replaces the two copy-pasted metric sections.
train_scores = _regression_scores(y_train, train_rf_predictions)
test_scores = _regression_scores(y_test, rf_predictions)

# Keep the individual names defined, as before, for any later cell that reads them.
MAE_train = train_scores['MAE']
RMSE_train = train_scores['RMSE']
MAPE_train = train_scores['MAPE']
accuracy_train = train_scores['accuracy']
r2_train = train_scores['r2']

MAE_test = test_scores['MAE']
RMSE_test = test_scores['RMSE']
MAPE_test = test_scores['MAPE']
accuracy_test = test_scores['accuracy']
r2_test = test_scores['r2']

# Only accuracy and R^2 are printed; MAE, RMSE and MAPE are available above.
print('Accuracy Train', str(accuracy_train) + "%")
print('r2_score Train:', r2_train)
print()
print('Accuracy Test', str(accuracy_test) + "%")
print('r2_score Test:', r2_test)

Accuracy Train 92.22614986836747%
r2_score Train: 0.9918824564967289

Accuracy Test 75.95871973219374%
r2_score Test: 0.9822055487332183


# Hyper-parameter optimization through random search

This section is inspired by: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [None]:
from sklearn.model_selection import RandomizedSearchCV

