In [1]:
# Initial imports.
import pandas as pd
import numpy as np
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.multioutput import MultiOutputClassifier

In [2]:
# Read the joined dataset from CSV into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer="database_join.csv")
df.head(5)

Unnamed: 0,index,zip_code,total_population,total_male_population,total_female_popularion,eight_teen_plus,eight_teen_plus_male,eight_teen_plus_female,population_eight_teen_to_twenty_four,eight_teen_to_twenty_four_not_high_school_graduate,...,index-2,permit_number,permit_subtype,date_entered,date_issued,business_name,zip_code-2,latitude,longitude,permit_duration
0,0,37013,97939,47658,50281,59479,27984,31495,9099,1073,...,0,19-25846,ONSALES,2019-01-24,2019-06-03,EXO-TIKKA INDIAN CUSINE,37013,36.047649,-86.648119,797 days
1,18,37209,38664,18251,20413,29326,13363,15963,5522,337,...,1,16-20340,ONSALES,2016-05-18,2016-09-23,Las Palmas Mexican Restaurant,37209,36.149076,-86.863105,1780 days
2,24,37215,22379,9989,12390,17874,7986,9888,1774,222,...,2,19-25920,ONOFFSALES,2019-03-06,2019-06-13,NASHVILLE CIGAR,37215,36.104329,-86.815039,787 days
3,15,37206,26382,12473,13909,20213,9555,10658,1990,228,...,3,19-25985,ONOFFSALES,2019-04-22,2019-06-13,TAILGATE BREWERY EAST NASHVILLE,37206,36.186415,-86.74721,787 days
4,18,37209,38664,18251,20413,29326,13363,15963,5522,337,...,4,17-25076,ONSALES,2017-06-07,2018-03-02,BARE BONES BUTCHER,37209,36.158192,-86.84889,1255 days


In [3]:
# Show every column name as a plain Python list (head() elides the middle columns).
df.columns.tolist()

['index',
 'zip_code',
 'total_population',
 'total_male_population',
 'total_female_popularion',
 'eight_teen_plus',
 'eight_teen_plus_male',
 'eight_teen_plus_female',
 'population_eight_teen_to_twenty_four',
 'eight_teen_to_twenty_four_not_high_school_graduate',
 'eight_teen_to_twenty_four_high_school_graduate',
 'eight_teen_to_twenty_four_bachelors_degree_or_higher',
 'population_twenty_five_to_thirty_four',
 'twenty_five_to_thirty_four_not_high_school_graduate',
 'twenty_five_to_thirty_four_high_school_graduate',
 'twenty_five_to_thirty_four_bachelors_degree_or_higher',
 'population_thirty_five_to_fourty_four',
 'thirty_five_to_fourty_four_not_high_school_graduate',
 'thirty_five_to_fourty_four_high_school_graduate',
 'thirty_five_to_fourty_four_bachelors_degree_or_higher',
 'population_sixty_five_and_up',
 'sixty_five_and_up_not_high_school_graduate',
 'sixty_five_and_up_high_school_graduate',
 'sixty_five_and_up_bachelors_degree_or_higher',
 'average_household_income',
 'index-2

In [4]:
# Keep only the columns used to build the model inputs and the success label.
# .copy() detaches the subset from the full frame: later cells add columns to
# `df`, and assigning into a slice of another DataFrame raises
# SettingWithCopyWarning on pandas versions without Copy-on-Write.
df = df[["zip_code", "total_male_population", "total_female_popularion",
         "average_household_income", "permit_number", "permit_duration"]].copy()
df.head()

Unnamed: 0,zip_code,total_male_population,total_female_popularion,average_household_income,permit_number,permit_duration
0,37013,47658,50281,26832,19-25846,797 days
1,37209,18251,20413,32539,16-20340,1780 days
2,37215,9989,12390,88326,19-25920,787 days
3,37206,12473,13909,36851,19-25985,787 days
4,37209,18251,20413,32539,17-25076,1255 days


In [5]:
# Split strings such as "797 days" at the first space and keep the leading
# number as an integer column for calculation purposes.
days = df["permit_duration"].str.split(pat=" ", n=1, expand=True)
df["days"] = days.iloc[:, 0].astype(int)


In [6]:
# Binary success label: 1 when the permit lasted at least 730 days, 0 otherwise.
df["success"] = (df["days"] >= 730).astype("int64")

df.head()

Unnamed: 0,zip_code,total_male_population,total_female_popularion,average_household_income,permit_number,permit_duration,days,success
0,37013,47658,50281,26832,19-25846,797 days,797,1
1,37209,18251,20413,32539,16-20340,1780 days,1780,1
2,37215,9989,12390,88326,19-25920,787 days,787,1
3,37206,12473,13909,36851,19-25985,787 days,787,1
4,37209,18251,20413,32539,17-25076,1255 days,1255,1


In [7]:
# Keep the permit numbers in their own single-column frame, on the same index
# as df; they are identifiers and do not need to be scaled.
pn_df = df[["permit_number"]].copy()

In [8]:
# Remove the columns that must not be used as features: the permit identifier,
# the raw duration string, and the day count that the success label was
# derived from.
# Reassign instead of using inplace=True: drop() then returns a new frame, so
# nothing is mutated behind other references and pandas has no slice to warn
# about.
df = df.drop(columns=["permit_number", "permit_duration", "days"])

In [9]:
# Features are every remaining column except the label; the label is "success".
# drop() returns a new frame, so X is independent of df.
X = df.drop(columns="success")
y = df["success"]

In [10]:
# Hold out a test set (scikit-learn's default 25%) with a fixed seed so the
# split is reproducible.
# NOTE(review): the split is not stratified; the classification report further
# down shows an imbalanced test set (85 vs 241), so consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=78,
)

In [11]:
# Fit the standardiser on the training split only, so no statistics from the
# test split leak into preprocessing. fit() returns the scaler itself, so both
# names refer to the same fitted object.
X_scaler = scaler = StandardScaler().fit(X_train)

# Apply the training-set scaling to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [12]:
# Random forest with 128 trees; the fixed seed keeps runs reproducible.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=78,
)

In [13]:
# Train the forest on the scaled training features and their labels.
# fit() returns the estimator, so the name stays bound to the same model.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [14]:
# Predict the success label for every row of the scaled test split.
predictions = rf_model.predict(X=X_test_scaled)

In [15]:
# Fraction of test rows whose predicted label matches the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [16]:
# Calculate the confusion matrix with the class order pinned to [0, 1].
# Without `labels`, confusion_matrix infers the classes from the data; if only
# one class appeared in y_test and predictions the result would be 1x1 and the
# hard-coded 2x2 row/column labels below would no longer fit.
class_labels = [0, 1]
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Wrap the matrix in a DataFrame: rows are true classes, columns are predictions.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1,84
Actual 1,1,240


In [17]:
# Summarise the evaluation: confusion matrix, accuracy, and per-class metrics.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
# A single print with a newline separator emits the heading and the report
# exactly as two consecutive prints would.
print("Classification Report", classification_report(y_test, predictions), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1,84
Actual 1,1,240


Accuracy Score : 0.7392638036809815
Classification Report
              precision    recall  f1-score   support

           0       0.50      0.01      0.02        85
           1       0.74      1.00      0.85       241

    accuracy                           0.74       326
   macro avg       0.62      0.50      0.44       326
weighted avg       0.68      0.74      0.63       326



In [18]:
# Read the fitted forest's feature importances. The array is positional: the
# next cell pairs it with X.columns to attach the feature names.
importances = rf_model.feature_importances_
# Bare expression so the notebook displays the array.
importances

array([0.33199199, 0.19798904, 0.21500924, 0.25500972])

In [19]:
# Pair each importance with its column name and rank from most to least
# important (tuples sort by importance first, then by name).
feature_ranking = list(zip(rf_model.feature_importances_, X.columns))
feature_ranking.sort(reverse=True)
feature_ranking

[(0.3319919929818173, 'zip_code'),
 (0.2550097223266372, 'average_household_income'),
 (0.21500924146020775, 'total_female_popularion'),
 (0.1979890432313377, 'total_male_population')]