In [10]:
from matplotlib import pyplot as plt
from scipy.io.arff import loadarff
from sklearn import metrics, tree
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import pandas as pd
import seaborn as sns


# Reading the ARFF file
data = loadarff('pd_speech.arff')
df = pd.DataFrame(data[0])
df['class'] = df['class'].str.decode('utf-8')

# Split in data and targets
X = df.drop('class', axis=1)
y = df['class']

labels = list(set(y))

# Split data in test and training set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=1, stratify=y)

# Feature selection
selector = SelectKBest(mutual_info_classif, k=20)
selector.fit_transform(X_train, y_train)
cols_indexes = selector.get_support(indices=True)
selected_columns = X.iloc[:,cols_indexes].columns.tolist()

# Reduce training and testing sets
X_train_reduced = X_train[selected_columns]
X_test_reduced = X_test[selected_columns]

# Create and train the classifier
classifier = tree.DecisionTreeClassifier()
print("Training classifier...")
classifier = classifier.fit(X_train_reduced, y_train)

# Test classifier and compute scores
print("Testing classifier...")
y_predicted = classifier.predict(X_test_reduced)

accuracy = classifier.score(X_test_reduced, y_test)
train_accuracy = classifier.score(X_train_reduced, y_train)

print(train_accuracy)
print(accuracy)


# BRINCADEIRAS DO GONCALO
# Plot everything
confusion_matrix = metrics.confusion_matrix(y_test, y_predicted)
matrix_df = pd.DataFrame(confusion_matrix)
ax = plt.axes()
sns.set(font_scale=1.3)
plt.figure(figsize=(10,7))
sns.heatmap(matrix_df, annot=True, fmt="g", ax=ax, cmap="magma")
ax.set_title('Confusion Matrix - Decision Tree')
ax.set_xlabel("Predicted label", fontsize =15)
ax.set_ylabel("True Label", fontsize=15)
try:
    ax.set_xticklabels(list(y))
    ax.set_yticklabels(list(y), rotation = 0)
except:
    pass
plt.show()


Training classifier...
Testing classifier...
1.0
0.8061674008810573


'\n# BRINCADEIRAS DO GONCALO\n# Plot everything\nconfusion_matrix = metrics.confusion_matrix(y_test, y_predicted)\nmatrix_df = pd.DataFrame(confusion_matrix)\nax = plt.axes()\nsns.set(font_scale=1.3)\nplt.figure(figsize=(10,7))\nsns.heatmap(matrix_df, annot=True, fmt="g", ax=ax, cmap="magma")\nax.set_title(\'Confusion Matrix - Decision Tree\')\nax.set_xlabel("Predicted label", fontsize =15)\nax.set_ylabel("True Label", fontsize=15)\ntry:\n    ax.set_xticklabels(list(y))\n    ax.set_yticklabels(list(y), rotation = 0)\nexcept:\n    pass\nplt.show()\n'