In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.model_selection import cross_validate

from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier

In [2]:
lb07A = pd.read_csv("datasets/LB-07A.csv").dropna()
lb08A = pd.read_csv("datasets/LB-08A.csv").dropna()

In [3]:
lb08A["lithology"].value_counts()

Greywacke                      285
shale slate phyllite schist     64
polymict lithic breccia         52
suevite                         18
Name: lithology, dtype: int64

In [4]:
lb07A["lithology"].value_counts()

monomict lithic breccia        104
shale slate phyllite schist     94
suevite                         90
polymict lithic breccia         63
metagreywacke                   12
Name: lithology, dtype: int64

In [5]:
lb08A["lithology"].replace({"Greywacke": "metagreywacke"}, inplace=True)
lb08A["lithology"].value_counts()


metagreywacke                  285
shale slate phyllite schist     64
polymict lithic breccia         52
suevite                         18
Name: lithology, dtype: int64

In [6]:
label_encoder = LabelEncoder()
label_encoder.fit(lb07A["lithology"])


In [7]:
x_train = lb07A.drop(["lithology"], axis=1)
y_train = label_encoder.transform(lb07A["lithology"])

x_test = lb08A.drop(["lithology"], axis=1)
y_test = label_encoder.transform(lb08A["lithology"])


In [8]:
from sklearn.metrics import roc_curve, auc, confusion_matrix, accuracy_score
from sklearn.metrics import f1_score, recall_score
from sklearn.model_selection import cross_validate


class Percent(float):
    def __str__(self):
        return '{:.2%}'.format(self)


def get_predictions(estimator):
    estimator.fit(x_train, y_train)
    prediction = estimator.predict(x_test)
    return prediction


def generate_scores(file_name, y_predicted):

    accuracy = accuracy_score(y_test, y_predicted)
    f1 = f1_score(y_test, y_predicted, average="weighted")
    recall = recall_score(y_test, y_predicted, average="weighted")

    confusion_matrix_df = pd.crosstab(
        pd.Series(y_predicted, name="Predicted"), pd.Series(y_test, name="Actual"))

    path = "./07A_08A metrics/" + file_name + ".txt"

    with open(path, "w") as file:
        file.writelines([file_name,
                         "\n\nAccuracy:    \t" + str(Percent(accuracy)),
                         "\nF1:         \t" + str(Percent(f1)),
                         "\nRecall:     \t" + str(Percent(recall)),
                         "\n\nConfusion Matrix\n",
                         confusion_matrix_df.to_string(),
                         ])

    with open(path, "r") as file:
        print(file.read())


def run(estimator, model_name):
    predictions = get_predictions(estimator)
    generate_scores(model_name, predictions)


In [9]:
rf = RandomForestClassifier(n_jobs=-1, random_state=0)
run(rf, "Random Forest")



Random Forest

Accuracy:    	10.74%
F1:         	3.97%
Recall:     	10.74%

Confusion Matrix
Actual       0   2   3   4
Predicted                 
1           97   5   0   0
2          146  44  51  18
3            1   1   1   0
4           41   2  12   0


  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=0)
run(dt, "Decision Tree")

Decision Tree

Accuracy:    	11.22%
F1:         	3.58%
Recall:     	11.22%

Confusion Matrix
Actual       0   2   3   4
Predicted                 
1           50   5   0   0
2          159  47  50  18
4           76   0  14   0


  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
from sklearn.svm import SVC

svc = SVC()
run(svc, "SVM")

  _warn_prf(average, modifier, msg_start, len(result))


SVM

Accuracy:    	3.10%
F1:         	0.85%
Recall:     	3.10%

Confusion Matrix
Actual       0   2   3   4
Predicted                 
1           92  29   1   6
2           50   2  12   1
3           18   4   0   0
4          125  17  51  11


In [12]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
run(gnb, "Gaussian Naive Bayes")

Gaussian Naive Bayes

Accuracy:    	11.22%
F1:         	3.36%
Recall:     	11.22%

Confusion Matrix
Actual       0   2   3   4
Predicted                 
1          112   5   0   0
2          173  47  57  18
4            0   0   7   0


  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
from sklearn.linear_model import LogisticRegression

lg = LogisticRegression(n_jobs=-1, max_iter=1000)
run(lg, "Logistic Regression")

Logistic Regression

Accuracy:    	10.50%
F1:         	3.09%
Recall:     	10.50%

Confusion Matrix
Actual       0   2   3   4
Predicted                 
1           60   4   0   0
2          175  44  64  18
4           50   4   0   0


  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_jobs=-1)
run(knn, "KNN")

KNN

Accuracy:    	4.30%
F1:         	1.47%
Recall:     	4.30%

Confusion Matrix
Actual       0   2   3   4
Predicted                 
1           84  14   0   4
2          105   8  25   4
3            1   0   0   0
4           95  30  39  10


  _warn_prf(average, modifier, msg_start, len(result))
