In [7]:
import requests
import pandas as pd
import numpy as np
from numpy.random import default_rng
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics



# KNN - ethnic distribution on U.S. soil

We try to predict the state (`Geographic Area`) a city belongs to from the shares of its various ethnic groups. To do this we will use the K-Nearest Neighbors model.

In [4]:
# Download the dataset, store a local copy next to the notebook, then load it.
url = 'https://github.com/Giofabro/exercise-data-analysis/raw/main/Data_Repository/df_distribution_USA.csv'
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as if it were the CSV.
response.raise_for_status()
# The context manager guarantees the file is flushed and closed before pandas reads it.
with open('df_distribution_USA.csv', 'wb') as csv_file:
    csv_file.write(response.content)

df_distribution = pd.read_csv("df_distribution_USA.csv")

In [5]:
# Show the loaded table as this cell's output.
df_distribution

Unnamed: 0,Geographic Area,City,poverty_rate,percent_completed_hs,share_white,share_black,share_native_american,share_asian,share_hispanic
0,AL,Abanda CDP,78.8,21.2,67.2,30.2,0.0,0.0,1.6
1,AL,Abbeville city,29.1,69.1,54.4,41.4,0.1,1.0,3.1
2,AL,Adamsville city,25.5,78.9,52.3,44.9,0.5,0.3,2.3
3,AL,Addison town,30.7,81.4,99.1,0.1,0.0,0.1,0.4
4,AL,Akron town,42.0,68.6,13.2,86.5,0.0,0.0,0.3
...,...,...,...,...,...,...,...,...,...
28918,WY,Woods Landing-Jelm CDP,18.6,100.0,95.9,0.0,0.0,2.1,0.0
28919,WY,Worland city,15.3,85.6,89.9,0.3,1.3,0.6,16.6
28920,WY,Wright town,5.9,89.2,94.5,0.1,1.4,0.2,6.2
28921,WY,Yoder town,5.4,79.4,97.4,0.0,0.0,0.0,4.0


## Shuffle

In [8]:
# Ethnic-share columns used as the model inputs.
features = [
    "share_white",
    "share_black",
    "share_native_american",
    "share_asian",
    "share_hispanic",
]

# Shuffle inputs (a copy of the feature columns) and target (the state column)
# together so rows stay aligned; the fixed seed makes the order reproducible.
X, y = shuffle(
    df_distribution[features].copy(),
    df_distribution['Geographic Area'],
    random_state=12,
)

In [9]:
# Inspect the shuffled feature table.
X

Unnamed: 0,share_white,share_black,share_native_american,share_asian,share_hispanic
22111,93.6,2.3,0.0,1.7,3.7
28039,97.9,0.3,0.2,0.3,0.8
23499,78.9,1.3,14.5,0.0,3.9
2566,96.9,0.5,0.5,0.2,1.9
14095,92.4,3.0,0.2,2.1,2.9
...,...,...,...,...,...
7409,92.7,1.4,0.3,2.1,6.9
19709,94.0,1.2,0.4,0.6,4.2
25990,96.6,0.0,0.5,0.7,1.5
5787,91.1,0.2,0.6,0.7,12.7


In [10]:
# Inspect the shuffled target labels (state abbreviations).
y

22111    PA
28039    WI
23499    SD
2566     CA
14095    MO
         ..
7409     IN
19709    OH
25990    UT
5787     ID
14155    MO
Name: Geographic Area, Length: 28923, dtype: object

In [12]:
# Sanity check: features and target must have the same number of rows.
print(X.shape, y.shape, sep="\n")

(28923, 5)
(28923,)


## Test & Train

In [13]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Display the shape of every piece to confirm the split sizes.
tuple(part.shape for part in (X, y, X_train, y_train, X_test, y_test))

((28923, 5), (28923,), (23138, 5), (23138,), (5785, 5), (5785,))

## KNN

In [14]:
# Fit a 15-neighbour classifier on the training split (fit returns the estimator itself).
knn_model = KNeighborsClassifier(n_neighbors=15).fit(X_train, y_train)
print(knn_model)

KNeighborsClassifier(n_neighbors=15)


In [15]:
# Predict the state of every held-out row and keep the true labels alongside.
predicted_y = knn_model.predict(X_test)
expected_y = y_test
# Element-wise comparison: True where the predicted state matches the real one.
expected_y == predicted_y

17572    False
20322     True
28321    False
14564    False
961      False
         ...  
18062    False
4124     False
46       False
4736     False
15156    False
Name: Geographic Area, Length: 5785, dtype: bool

In [16]:
# Count the correct predictions once and reuse the count for both the message
# and the accuracy percentage.
n_correct = sum(expected_y == predicted_y)
n_total = len(predicted_y)
print('{} predictions on a total of {} test samples are correct'.format(n_correct, n_total))

# Rows are the true states, columns the predicted ones.
print(metrics.confusion_matrix(expected_y, predicted_y))
percent_predicted = n_correct/n_total*100

1211 predictions on a total of 5785 test samples are correct
[[47  0  0 ...  1  0  0]
 [ 0 19  5 ...  0  2  0]
 [ 1  7  2 ...  5  2  0]
 ...
 [ 2  2  2 ...  6  0  0]
 [ 0  1  2 ...  1  2  0]
 [ 0  0  0 ...  0  0  0]]


In [17]:
# Share of correctly predicted test samples, in percent.
percent_predicted

20.933448573898012

In [18]:
# Per-state precision / recall / F1 on the test split.
# Some states are never predicted, which makes their precision undefined;
# zero_division=0 reports those as 0.00 (the same value shown before) without
# emitting the undefined-metric warnings that otherwise clutter the output.
print(metrics.classification_report(expected_y, predicted_y, zero_division=0))

              precision    recall  f1-score   support

          AK       0.52      0.62      0.56        76
          AL       0.13      0.18      0.15       108
          AR       0.02      0.02      0.02       111
          AZ       0.28      0.29      0.28        97
          CA       0.44      0.67      0.53       352
          CO       0.09      0.07      0.08        98
          CT       0.00      0.00      0.00        30
          DE       0.00      0.00      0.00        22
          FL       0.21      0.25      0.23       172
          GA       0.12      0.15      0.14       123
          HI       0.97      0.83      0.89        41
          IA       0.10      0.14      0.12       204
          ID       0.00      0.00      0.00        42
          IL       0.09      0.11      0.10       277
          IN       0.05      0.03      0.04       153
          KS       0.06      0.04      0.05       137
          KY       0.08      0.08      0.08        93
          LA       0.14    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Build a separate frame carrying the predictions. .assign returns a new
# DataFrame, so df_distribution itself is not given the extra column and the
# earlier cells that display or slice it keep producing the same result.
# NOTE: this predicts every row, including the rows the model was trained on,
# so agreement with 'Geographic Area' here is not a held-out accuracy estimate.
new_df_distribution = df_distribution.assign(
    predicted_state=knn_model.predict(df_distribution[features])
)

new_df_distribution

Unnamed: 0,Geographic Area,City,poverty_rate,percent_completed_hs,share_white,share_black,share_native_american,share_asian,share_hispanic,predicted_state
0,AL,Abanda CDP,78.8,21.2,67.2,30.2,0.0,0.0,1.6,MS
1,AL,Abbeville city,29.1,69.1,54.4,41.4,0.1,1.0,3.1,GA
2,AL,Adamsville city,25.5,78.9,52.3,44.9,0.5,0.3,2.3,GA
3,AL,Addison town,30.7,81.4,99.1,0.1,0.0,0.1,0.4,IL
4,AL,Akron town,42.0,68.6,13.2,86.5,0.0,0.0,0.3,AL
...,...,...,...,...,...,...,...,...,...,...
28918,WY,Woods Landing-Jelm CDP,18.6,100.0,95.9,0.0,0.0,2.1,0.0,MN
28919,WY,Worland city,15.3,85.6,89.9,0.3,1.3,0.6,16.6,CO
28920,WY,Wright town,5.9,89.2,94.5,0.1,1.4,0.2,6.2,AZ
28921,WY,Yoder town,5.4,79.4,97.4,0.0,0.0,0.0,4.0,IN


## Considerations 

The model predicted the correct state for only about 21% of the test samples (1211 of 5785). The KNN model is not the most suitable for this case. However, it was a lot of fun to implement it.