In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
geo = pd.read_csv("/content/Updated Geolocation Data US For Risk.csv")

In [3]:
geo.isna().sum() #Checks if there's any NA's

Unnamed: 0,0
Latitude,0
Longitude,0
Neodymium (%),0
Dysprosium (%),0
Terbium (%),0
Yttrium (%),0
Samarium (%),0
Processing Cost (M$),0
Accessibility Score,0
Financial Viability,0


In [4]:
geo.info() #Checks info for any null content

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Latitude              500 non-null    float64
 1   Longitude             500 non-null    float64
 2   Neodymium (%)         500 non-null    float64
 3   Dysprosium (%)        500 non-null    float64
 4   Terbium (%)           500 non-null    float64
 5   Yttrium (%)           500 non-null    float64
 6   Samarium (%)          500 non-null    float64
 7   Processing Cost (M$)  500 non-null    float64
 8   Accessibility Score   500 non-null    int64  
 9   Financial Viability   500 non-null    int64  
 10  Region                500 non-null    object 
 11  Risk                  500 non-null    int64  
dtypes: float64(8), int64(3), object(1)
memory usage: 47.0+ KB


In [5]:
geo.head()

Unnamed: 0,Latitude,Longitude,Neodymium (%),Dysprosium (%),Terbium (%),Yttrium (%),Samarium (%),Processing Cost (M$),Accessibility Score,Financial Viability,Region,Risk
0,37.490802,-86.601106,0.914051,1.058736,0.253465,6.805594,1.970019,184.794217,10,1,Midwest,4
1,49.014286,-95.5147,2.831626,0.961823,0.248014,6.338472,2.329571,422.06521,5,1,Unknown,4
2,44.639879,-107.975981,4.465988,0.050307,0.940053,4.776826,0.78193,502.424363,5,0,West,4
3,41.97317,-80.241274,3.697245,0.703912,0.255341,7.570213,1.827672,66.408135,7,1,Atlantic Ocean,1
4,33.120373,-87.339786,3.894085,0.774438,0.280287,2.374529,1.676122,440.221735,4,1,South,4


In [6]:
geo =geo.drop(["Latitude", "Longitude"], axis = 1)           #Drop unneeded column
geo_num = geo.drop(["Region"], axis = 1)

In [7]:
geo_num.head()

Unnamed: 0,Neodymium (%),Dysprosium (%),Terbium (%),Yttrium (%),Samarium (%),Processing Cost (M$),Accessibility Score,Financial Viability,Risk
0,0.914051,1.058736,0.253465,6.805594,1.970019,184.794217,10,1,4
1,2.831626,0.961823,0.248014,6.338472,2.329571,422.06521,5,1,4
2,4.465988,0.050307,0.940053,4.776826,0.78193,502.424363,5,0,4
3,3.697245,0.703912,0.255341,7.570213,1.827672,66.408135,7,1,1
4,3.894085,0.774438,0.280287,2.374529,1.676122,440.221735,4,1,4


In [8]:
X= geo.drop(["Risk"],axis=1)           #Drop target variable
y=geo["Risk"]

In [9]:
numeric_features=X.select_dtypes(include=['number']).columns.tolist()  #Selecting numbered data types and categorical, splitting them
numeric_features
categorical_features = geo['Region']

In [10]:
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=.3, random_state=34) #Splits data randomly

In [11]:
#preprocess
preprocessor= ColumnTransformer(
    transformers=[
        ('scaler',MinMaxScaler(), numeric_features),
        ("categ", OneHotEncoder(handle_unknown="ignore"), ["Region"])
    ]
)

In [12]:
#KNN pipeline
pipeline_knn = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", KNeighborsClassifier(n_neighbors=5))])

In [13]:
preprocessor.fit(X_train)
X_train_scaled=preprocessor.transform(X_train)
pd.DataFrame(X_train_scaled)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.798739,0.271719,0.733930,0.195362,0.939038,0.915304,0.222222,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.632267,0.916315,0.868694,0.944876,0.338920,0.282194,0.666667,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.024271,0.951229,0.741460,0.814889,0.432992,0.302198,0.111111,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.206683,0.731273,0.435251,0.294315,0.726732,0.310901,0.333333,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.901387,0.767057,0.290648,0.374653,0.796112,0.948605,0.555556,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,0.120488,0.013698,0.054315,0.529062,0.463863,0.651913,1.000000,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
346,0.200606,0.634695,0.394646,0.279660,0.260860,0.580788,0.222222,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
347,0.222525,0.767229,0.266200,0.180378,0.078688,0.078613,1.000000,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
348,0.808621,0.538642,0.973040,0.424976,0.061835,0.212906,0.888889,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [14]:
#Decision Tree Pipeline
pipeline_tree = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", DecisionTreeClassifier(random_state=56))])

In [15]:
#Run KNN
pipeline_knn.fit(X_train, y_train)

In [16]:
#Run Tree
pipeline_tree.fit(X_train, y_train)

In [17]:
#predict
y_pred_knn=pipeline_knn.predict(X_test)
y_pred_tree=pipeline_tree.predict(X_test)


In [18]:
pd.DataFrame(y_pred_knn, y_test)
pd.DataFrame(y_pred_tree, y_test)

Unnamed: 0_level_0,0
Risk,Unnamed: 1_level_1
1,1
2,2
4,4
4,4
4,4
...,...
2,2
4,4
1,1
1,1


In [19]:
#KNN Confusion
print(confusion_matrix(y_test, y_pred_knn))
print(accuracy_score(y_test, y_pred_knn))

[[ 6 10  1]
 [ 7 53 11]
 [ 0 18 44]]
0.6866666666666666


In [20]:
#Tree Confusion
print(confusion_matrix(y_test, y_pred_tree))
print(accuracy_score(y_test, y_pred_tree))


[[16  0  1]
 [ 0 71  0]
 [ 0  0 62]]
0.9933333333333333


In [21]:
print("KNN", classification_report(y_test,y_pred_knn))

KNN               precision    recall  f1-score   support

           1       0.46      0.35      0.40        17
           2       0.65      0.75      0.70        71
           4       0.79      0.71      0.75        62

    accuracy                           0.69       150
   macro avg       0.63      0.60      0.61       150
weighted avg       0.69      0.69      0.68       150



In [22]:
print("Decision Tree", classification_report(y_test,y_pred_tree))

Decision Tree               precision    recall  f1-score   support

           1       1.00      0.94      0.97        17
           2       1.00      1.00      1.00        71
           4       0.98      1.00      0.99        62

    accuracy                           0.99       150
   macro avg       0.99      0.98      0.99       150
weighted avg       0.99      0.99      0.99       150

