In [271]:
# importing metrics
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report


In [272]:
# Import the dataset
df = pd.read_csv("AL_NJ_Income_pct.csv")
df.head()

Unnamed: 0,STATEFIPS,STATE,zipcode,Returns_pct1,Returns_pct2,Returns_pct3,Returns_pct4,Returns_pct5,Returns_pct6,total
0,1,AL,35004,28.98,27.06,18.23,12.48,12.09,1.15,5210
1,1,AL,35005,42.26,30.97,14.52,6.45,5.81,0.0,3100
2,1,AL,35006,35.83,27.5,15.83,10.0,10.83,0.0,1200
3,1,AL,35007,34.09,22.19,15.24,11.22,15.24,2.01,11940
4,1,AL,35010,45.76,26.17,11.25,6.7,7.84,2.28,7910


In [273]:
df.isna().sum()

STATEFIPS       0
STATE           0
zipcode         0
Returns_pct1    0
Returns_pct2    0
Returns_pct3    0
Returns_pct4    0
Returns_pct5    0
Returns_pct6    0
total           0
dtype: int64

In [274]:
# Split the dataset into features and target
lb = LabelEncoder()
df['STATE'] = lb.fit_transform(df['STATE'])

In [275]:
# Split the dataset into training and testing sets
X = df.iloc[:,3:9]
Y = df['STATE']

In [276]:
# Create a Random Forest classifier
X

Unnamed: 0,Returns_pct1,Returns_pct2,Returns_pct3,Returns_pct4,Returns_pct5,Returns_pct6
0,28.98,27.06,18.23,12.48,12.09,1.15
1,42.26,30.97,14.52,6.45,5.81,0.00
2,35.83,27.50,15.83,10.00,10.83,0.00
3,34.09,22.19,15.24,11.22,15.24,2.01
4,45.76,26.17,11.25,6.70,7.84,2.28
...,...,...,...,...,...,...
1162,23.88,22.39,19.40,11.94,17.91,4.48
1163,23.25,13.63,10.22,9.02,24.85,19.04
1164,50.37,30.12,10.18,3.89,4.21,1.23
1165,29.67,22.66,14.69,10.27,17.09,5.62


In [277]:
Y

0       0
1       0
2       0
3       0
4       0
       ..
1162    1
1163    1
1164    1
1165    1
1166    1
Name: STATE, Length: 1167, dtype: int64

In [278]:
# Fit the classifier to the training data
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.3,random_state=42)

In [279]:
# Make predictions on the testing data
RF = RandomForestClassifier()

In [280]:
RF.fit(X_train,y_train)

RandomForestClassifier()

In [281]:
y_pred = RF.predict(X_test)

In [282]:
print(classification_report(y_test,y_pred, target_names=['AL','NJ']))

              precision    recall  f1-score   support

          AL       0.86      0.81      0.83       188
          NJ       0.79      0.85      0.82       163

    accuracy                           0.83       351
   macro avg       0.83      0.83      0.83       351
weighted avg       0.83      0.83      0.83       351



In [283]:
# Compute evaluation metrics
# What is the accuracy of your model?
# What is the precision of the model?
# What is the recall of the model?
# What is the F1 of the model?
print('Accuracy: ', accuracy_score(y_test,y_pred))
print('Precision: ', precision_score(y_test,y_pred))
print('Recall: ', recall_score(y_test,y_pred))
print('F1: ', f1_score(y_test,y_pred))

Accuracy:  0.8262108262108262
Precision:  0.7931034482758621
Recall:  0.8466257668711656
F1:  0.8189910979228486


In [284]:
y_test = lb.inverse_transform(y_test)
y_pred = lb.inverse_transform(y_pred)

In [285]:
# Show the cross tabulation of the classification. 
pd.crosstab(y_test,y_pred, colnames=['STATE'], rownames=['STATE'])

STATE,AL,NJ
STATE,Unnamed: 1_level_1,Unnamed: 2_level_1
AL,152,36
NJ,25,138
