In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.model_selection import cross_validate

from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the two lithology datasets. dropna() discards every row that has at
# least one missing value, so only fully populated rows are kept.
lb07A = pd.read_csv("datasets/LB-07A.csv").dropna()
lb08A = pd.read_csv("datasets/LB-08A.csv").dropna()

In [3]:
# Class balance of the LB-08A labels (this frame is used as the training set below).
lb08A["lithology"].value_counts()

Greywacke                      285
shale slate phyllite schist     64
polymict lithic breccia         52
suevite                         18
Name: lithology, dtype: int64

In [4]:
# Class balance of the LB-07A labels (this frame is used as the test set below).
lb07A["lithology"].value_counts()

monomict lithic breccia        104
shale slate phyllite schist     94
suevite                         90
polymict lithic breccia         63
metagreywacke                   12
Name: lithology, dtype: int64

In [5]:
# Harmonise label names: LB-08A spells the class "Greywacke" while LB-07A uses
# "metagreywacke", so map the former onto the latter before encoding.
#
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the column selection. The in-place form is
# chained assignment: pandas warns about it, and with Copy-on-Write enabled it
# modifies a temporary object and leaves lb08A unchanged.
lb08A["lithology"] = lb08A["lithology"].replace({"Greywacke": "metagreywacke"})
lb08A["lithology"].value_counts()


metagreywacke                  285
shale slate phyllite schist     64
polymict lithic breccia         52
suevite                         18
Name: lithology, dtype: int64

In [6]:
label_encoder = LabelEncoder()
# Fit on LB-07A: after the "Greywacke" rename its label set covers every label
# in LB-08A and additionally contains "monomict lithic breccia", so labels
# from either frame can be transformed. A label that exists only in LB-08A
# would make the later transform() call raise instead of being encoded silently.
#
# LabelEncoder numbers the classes in sorted order, which gives the codes seen
# in the confusion matrices further down:
#   0 = metagreywacke, 1 = monomict lithic breccia, 2 = polymict lithic breccia,
#   3 = shale slate phyllite schist, 4 = suevite
label_encoder.fit(lb07A["lithology"])


In [7]:
# Training split comes from LB-08A: features are every column except the
# target, and the target is the integer code of the lithology label.
x_train = lb08A.drop(columns=["lithology"])
y_train = label_encoder.transform(lb08A["lithology"])

# Test split comes from LB-07A, built the same way.
x_test = lb07A.drop(columns=["lithology"])
y_test = label_encoder.transform(lb07A["lithology"])


In [8]:
from sklearn.metrics import roc_curve, auc, confusion_matrix, accuracy_score
from sklearn.metrics import f1_score, recall_score
from sklearn.model_selection import cross_validate


class Percent(float):
    """Float whose ``str()`` form is a percentage with two decimal places.

    The numeric value is unchanged; only string conversion differs, e.g.
    ``str(Percent(0.5))`` is ``'50.00%'``.
    """

    def __str__(self):
        # float's own format machinery handles the '%' presentation type.
        return format(self, '.2%')


def get_predictions(estimator):
    """Fit ``estimator`` on the training split and predict the test split.

    Uses the module-level ``x_train``, ``y_train`` and ``x_test``. The
    estimator passed in is fitted in place.

    Parameters
    ----------
    estimator : object
        Anything exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    The value returned by ``estimator.predict(x_test)``.
    """
    estimator.fit(x_train, y_train)
    return estimator.predict(x_test)


def generate_scores(file_name, y_predicted):
    """Score predictions against ``y_test``, save a text report and print it.

    Accuracy, weighted F1 and weighted recall are computed against the
    module-level ``y_test``, followed by a confusion matrix. The report is
    written to ``./08A_07A metrics/<file_name>.txt`` and echoed to stdout.

    Parameters
    ----------
    file_name : str
        Used both as the report's title line and as the output file's base name.
    y_predicted : array-like
        Predicted class codes, one per entry of ``y_test``.
    """
    # Local import keeps this cell runnable without relying on another cell.
    import os

    accuracy = accuracy_score(y_test, y_predicted)
    f1 = f1_score(y_test, y_predicted, average="weighted")
    recall = recall_score(y_test, y_predicted, average="weighted")

    # Rows are predicted codes, columns are actual codes. crosstab only emits
    # rows for codes that occur, so a class that was never predicted has no row.
    confusion_matrix_df = pd.crosstab(
        pd.Series(y_predicted, name="Predicted"), pd.Series(y_test, name="Actual"))

    # Build the report once so the saved file and the printed text cannot differ.
    report = "".join([file_name,
                      "\n\nAccuracy:    \t" + str(Percent(accuracy)),
                      "\nF1:         \t" + str(Percent(f1)),
                      "\nRecall:     \t" + str(Percent(recall)),
                      "\n\nConfusion Matrix\n",
                      confusion_matrix_df.to_string(),
                      ])

    # Create the output directory on demand; without this a fresh checkout
    # fails with FileNotFoundError when the report file is opened.
    output_dir = "./08A_07A metrics/"
    os.makedirs(output_dir, exist_ok=True)
    path = output_dir + file_name + ".txt"

    with open(path, "w") as file:
        file.write(report)

    print(report)


def run(estimator, model_name):
    """Train and evaluate ``estimator``, reporting the results as ``model_name``.

    The estimator is fitted and used for prediction via ``get_predictions``;
    the predictions are then scored, saved and printed by ``generate_scores``,
    which uses ``model_name`` as report title and file name.
    """
    generate_scores(model_name, get_predictions(estimator))


In [9]:
# Random forest with a fixed seed (random_state=0); n_jobs=-1 asks
# scikit-learn to use all available CPU cores.
# The string passed to run() is both the report title and the output file name.
rf = RandomForestClassifier(n_jobs=-1, random_state=0)
run(rf, "Random Forest")



Random Forest

Accuracy:    	2.75%
F1:         	1.90%
Recall:     	2.75%

Confusion Matrix
Actual     0   1   2   3   4
Predicted                   
0          6  99  54  30  79
2          6   5   0  60   0
3          0   0   9   4  11


In [10]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with a fixed seed (random_state=0), otherwise default settings.
dt = DecisionTreeClassifier(random_state=0)
run(dt, "Decision Tree")

Decision Tree

Accuracy:    	7.44%
F1:         	6.09%
Recall:     	7.44%

Confusion Matrix
Actual      0   1   2   3   4
Predicted                    
0          12  48  47  79  76
2           0  53   0   0   4
3           0   1  14  15  10
4           0   2   2   0   0


In [11]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default settings.
# NOTE(review): no feature-scaling step is visible before this fit, and the
# recorded output predicts a single class for every sample - confirm whether
# the inputs should be standardised first.
svc = SVC()
run(svc, "SVM")

SVM

Accuracy:    	3.31%
F1:         	0.21%
Recall:     	3.31%

Confusion Matrix
Actual      0    1   2   3   4
Predicted                     
0          12  104  63  94  90


In [12]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
gnb = GaussianNB()
run(gnb, "Gaussian Naive Bayes")

Gaussian Naive Bayes

Accuracy:    	5.23%
F1:         	3.42%
Recall:     	5.23%

Confusion Matrix
Actual      0   1   2   3   4
Predicted                    
0          12  72  61  80  68
2           0  32   2   9  22
3           0   0   0   5   0


In [13]:
from sklearn.linear_model import LogisticRegression

# Logistic regression; max_iter=1000 caps the solver's iterations and
# n_jobs=-1 requests all available CPU cores.
# NOTE(review): features are passed unscaled as far as this notebook shows -
# confirm whether standardising them is intended.
lg = LogisticRegression(n_jobs=-1, max_iter=1000)
run(lg, "Logistic Regression")

Logistic Regression

Accuracy:    	3.31%
F1:         	0.21%
Recall:     	3.31%

Confusion Matrix
Actual      0    1   2   3   4
Predicted                     
0          12  101  63  94  90
2           0    3   0   0   0


In [14]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with the default neighbour count; n_jobs=-1 requests
# all available CPU cores for the neighbour search.
# NOTE(review): distances are computed on the feature columns as loaded, with
# no scaling step visible - confirm that is intended.
knn = KNeighborsClassifier(n_jobs=-1)
run(knn, "KNN")

KNN

Accuracy:    	5.79%
F1:         	2.98%
Recall:     	5.79%

Confusion Matrix
Actual      0    1   2   3   4
Predicted                     
0          12  100  36  85  52
3           0    4  27   9  38
