In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Each entry pairs a display label with the estimator it describes, so the
# two can never drift out of order. `names` and `classifiers` are derived
# from this single list and stay index-aligned.
# Every estimator that accepts a seed is given random_state=50.
named_classifiers = [
    ("Nearest Neighbors", KNeighborsClassifier(n_neighbors=5)),
    ("Linear SVM", SVC(kernel="linear", C=0.025, random_state=50)),
    ("RBF SVM", SVC(gamma=2, C=1, random_state=50)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5, random_state=50)),
    (
        "Random Forest",
        RandomForestClassifier(
            max_depth=5, n_estimators=10, max_features=1, random_state=50
        ),
    ),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000, random_state=50)),
    # NOTE(review): `algorithm` is reportedly deprecated in recent
    # scikit-learn releases — confirm against the installed version.
    ("AdaBoost", AdaBoostClassifier(algorithm="SAMME", random_state=50)),
    ("Naive Bayes", GaussianNB()),
    ("QDA", QuadraticDiscriminantAnalysis()),
]

names = [label for label, _estimator in named_classifiers]
classifiers = [estimator for _label, estimator in named_classifiers]

In [3]:
# Read the feature table (data_noise_D_01.csv) as a NumPy array and drop its
# first two columns; only the remaining columns are used as model inputs.
data = pd.read_csv("data_noise_D_01.csv").values[:, 2:]

# Read the label table (labels.csv) as a NumPy array.
labels = pd.read_csv("labels.csv").values

# Shuffle, then hold out 25% of the rows for testing. The seed is fixed so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.25,
    shuffle=True,
    random_state=50,
)

# Flatten the label arrays to 1-D before they are passed to the estimators.
y_train, y_test = y_train.ravel(), y_test.ravel()

In [4]:
# Fit every classifier on the training split and report how it scores on the
# held-out test split. Output is the classifier's name followed by its score.
for name, clf in zip(names, classifiers):
    print(name)
    # fit() returns the fitted estimator, so score() can be chained onto it.
    score = clf.fit(X_train, y_train).score(X_test, y_test)
    print(score)

Nearest Neighbors
0.58992
Linear SVM
0.49616
RBF SVM
0.49704
Decision Tree
0.53096
Random Forest
0.5656
Neural Net
0.76216
AdaBoost
0.5584
Naive Bayes
0.64872
QDA
0.77528




In [5]:
# Fit a random forest (default hyperparameters) on the training split and
# print its feature importances, one value per input column.
# random_state is pinned to the same seed used by the other estimators and
# the train/test split in this notebook; without it the forest is rebuilt
# from a different random state on each run and the printed importances
# change after every kernel restart.
clf = RandomForestClassifier(random_state=50)
clf.fit(X_train, y_train)
importances = clf.feature_importances_
print(importances)

[0.         0.01573018 0.01758408 0.01807165 0.01643604 0.01820931
 0.01781529 0.01740578 0.01818136 0.01795035 0.01749141 0.01761255
 0.01741131 0.01771029 0.01681474 0.01697504 0.01637905 0.01804993
 0.01630096 0.01685625 0.01606304 0.01683696 0.01693159 0.01687195
 0.01673623 0.0170238  0.0163656  0.01617579 0.01594678 0.01669432
 0.01647885 0.01714602 0.01611879 0.01648981 0.01702921 0.01679599
 0.01711683 0.0174165  0.01696025 0.0174078  0.01794646 0.01679017
 0.01728277 0.01753026 0.01704701 0.01718456 0.016921   0.01675463
 0.01702841 0.01675899 0.01652156 0.01694603 0.01701844 0.01617211
 0.01660719 0.01686091 0.01600313 0.01549033 0.01724411 0.01630024]


In [6]:
from sklearn.inspection import permutation_importance

# Fit QDA on the training split; fit() returns the fitted estimator.
clf = QuadraticDiscriminantAnalysis().fit(X_train, y_train)

# Permutation importance of each feature, evaluated on the training data:
# every column is shuffled 10 times (seeded for repeatability).
result = permutation_importance(
    clf,
    X_train,
    y_train,
    n_repeats=10,
    random_state=0,
)

# Average over the repeats to get one importance value per feature.
importance = result.importances_mean

print(importance)



[0.         0.00658684 0.00655751 0.00687485 0.00673085 0.00660018
 0.00904557 0.0087709  0.00812288 0.00701619 0.00820555 0.0087149
 0.00754953 0.00674151 0.00739486 0.00692818 0.00630683 0.00692285
 0.0061895  0.00664018 0.00636017 0.0064135  0.00703485 0.0060695
 0.00658684 0.00720019 0.00703752 0.00721886 0.00595483 0.00715486
 0.00704285 0.00760554 0.00588816 0.00708286 0.00856023 0.00630417
 0.00658684 0.00592549 0.00648817 0.00680551 0.0060775  0.00672018
 0.00719753 0.00664284 0.00629083 0.0063015  0.00502947 0.00652551
 0.00533614 0.00542148 0.00528547 0.00387744 0.00490413 0.00434145
 0.00466946 0.00460546 0.00421345 0.00433612 0.00293875 0.00533348]
