In [71]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
import pickle
from sklearn.decomposition import PCA
from itertools import cycle
import matplotlib.pylab as plt

### Find missing values

In [None]:
# Read every line as data. With the default header handling the first record
# of the file is consumed as column names (which is why the label column ended
# up being called 'p') and one sample is silently lost.
# NOTE(review): assumes the file is the unmodified, header-less UCI download - confirm.
data = pd.read_csv('./data/agaricus-lepiota.data', header=None)

# The class label is the first field. It keeps the name 'p' because later
# cells select the target with data['p']; the other columns get neutral names.
data.columns = ['p'] + [f"feature_{i}" for i in range(1, data.shape[1])]

# "?" marks a missing value in this file; turn it into a real NA so pandas can
# detect it. Plain assignment (no inplace=True) keeps the cell safe to re-run.
data = data.replace("?", pd.NA)

# Per-column flag: True where at least one value is missing.
data.isna().any()

### Preprocessing data by removing all rows that have missing values 

In [None]:
# Discard every row that holds at least one missing value. axis=0 / how="any"
# are the dropna defaults, written out so the rule is visible to the reader.
data = data.dropna(axis=0, how="any")
data

### Encoding the data to numbers

In [None]:
# Encode every categorical column as integers.
# A separate encoder is fitted per column and kept in label_encoders, so the
# integer codes can be mapped back to the original symbols later with
# label_encoders[column].inverse_transform(...). The original refitted one
# shared encoder, which only remembered the classes of the last column.
label_encoders = {}
encoded_columns = {}

for column in data.columns:
    lben = LabelEncoder()
    encoded_columns[column] = lben.fit_transform(data[column])
    label_encoders[column] = lben

# Build a new frame (same row index, same column order) instead of writing
# column-by-column into the frame returned by dropna().
data = pd.DataFrame(encoded_columns, index=data.index)

data

### Split the data into two parts: a training part and a testing part

In [75]:
# Column 'p' is the label; every remaining column is an input feature.
target = data['p']
features = data.drop('p', axis=1)

In [None]:
# Hold out one third of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=1/3.0,
    random_state=42,
)

# Report the size of each of the four partitions, one per line.
split_report = [
    f"Length of X train: {len(X_train)}",
    f"Length of y train: {len(y_train)}",
    f"Length of X test: {len(X_test)}",
    f"Length of y test: {len(y_test)}",
]
print(*split_report, sep="\n")

### Training a Perceptron model on the training data

In [77]:
# Fit the perceptron on the training split (fit() returns the estimator, so
# construction and fitting can be chained), then label the held-out rows.
perceptron_model = Perceptron(shuffle=True, random_state=0).fit(X_train, y_train)
y_predict = perceptron_model.predict(X_test)

In [None]:
# Show the weights, intercept and iteration count stored on the fitted model.
fitted_report = (
    f"Weight: {perceptron_model.coef_}",
    f"Intercept: {perceptron_model.intercept_}",
    f"Iteration: {perceptron_model.n_iter_}",
)
print(*fitted_report, sep="\n")

### Save the model

In [79]:
# Persist the fitted model to disk. The context manager closes the file handle
# as soon as the dump finishes (or fails); the original opened the file inline
# and never closed it explicitly.
with open("Perceptron_Model.sav", "wb") as model_file:
    pickle.dump(perceptron_model, model_file)

### Calculate the accuracy, recall, precision and F1

In [None]:
# Count how many test rows carry each true label and draw the counts as bars.
y_test_values, y_test_counts = np.unique(y_test, return_counts=True)

# Bar positions are given as strings so the labels are treated as categories.
y_test_tick_labels = list(map(str, y_test_values))
plt.bar(y_test_tick_labels, y_test_counts, width=0.4)

plt.xlabel("Type of label")
plt.ylabel("Number label of each type")
plt.title("Number label of each type of Y Test")

In [None]:
# Count how many test rows received each predicted label and draw the counts.
y_pred_values, y_pred_counts = np.unique(y_predict, return_counts=True)

# Bar positions are given as strings so the labels are treated as categories.
y_pred_tick_labels = list(map(str, y_pred_values))
plt.bar(y_pred_tick_labels, y_pred_counts, width=0.4)

plt.xlabel("Type of label")
plt.ylabel("Number label of each type")
plt.title("Number label of each type of Y Predict")

In [None]:
# Build the class list from the labels that actually occur in either vector.
# The original hard-coded two groups and reused counts computed in other
# cells, so it failed with a shape mismatch whenever the model predicted only
# one class.
class_labels = np.union1d(y_test, y_predict)
groups = [str(label) for label in class_labels]
X_axis = np.arange(len(groups))

# Per-class counts for both vectors, aligned to class_labels (0 when a class
# is absent from a vector).
true_labels = np.asarray(y_test)
predicted_labels = np.asarray(y_predict)
test_counts = [int(np.sum(true_labels == label)) for label in class_labels]
pred_counts = [int(np.sum(predicted_labels == label)) for label in class_labels]

plt.figure(figsize=(9,8))

# Two bars per class, side by side, each annotated with its count.
for offset, counts, series_name in (
    (-0.2, test_counts, "Y test"),
    (0.2, pred_counts, "Y Predict"),
):
    bars = plt.bar(X_axis + offset, counts, width=0.35, label=series_name)
    for bar, count in zip(bars, counts):
        plt.text(bar.get_x() + bar.get_width()/2, count, str(count), ha='center', va='bottom')

plt.xticks(X_axis, groups)
plt.xlabel("Groups")
plt.ylabel("Number label of each type in each Y")
plt.title("Compare Y test and Y predict")
plt.legend()

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
accuracy

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted classes. The original passed the predictions first,
# which displayed the transposed matrix.
confusion_matrix(y_test, y_predict)

In [None]:
# confusion_matrix takes (y_true, y_pred); for binary labels its flattened
# form is (tn, fp, fn, tp). The original passed the predictions first, which
# exchanged fp and fn and therefore reported precision as recall and vice
# versa (F1 is symmetric in the two, so it was unaffected).
tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()

precision = tp / (tp + fp)  # share of predicted positives that are truly positive
recall = tp / (tp + fn)     # share of true positives that were found
f1 = 2*(precision*recall)/(precision+recall)  # harmonic mean of precision and recall

print(f"Precision = {precision}")
print(f"Recall = {recall}")
print(f"F1 = {f1}")

### Transform 22-D data to 2-D data and display it 

In [None]:
# Fit a two-component PCA (with whitening enabled) on the full feature matrix,
# then project the same matrix onto those two components.
pca_model = PCA(n_components=2, whiten=True)
pca_model.fit(features)
features_pca = pca_model.transform(features)

# Show the projected coordinates as a table.
pd.DataFrame(features_pca)

In [None]:
colors = cycle("rgb")
target_names = [0, 1]
target_list = target.to_numpy().flatten()

plt.figure()

# One scatter series per class, so each class gets its own colour and its own
# legend entry.
for class_label, colour in zip(target_names, colors):
    in_class = target_list == class_label
    plt.scatter(features_pca[in_class, 0], features_pca[in_class, 1], c=colour, label=class_label)

plt.legend()
plt.show()

In [None]:
# Plot the reduced data, coloured by class label
plt.figure(figsize=(10, 6))
plt.scatter(features_pca[:, 0], features_pca[:, 1], c=target, cmap='viridis', edgecolor='k', s=50)

# Create a mesh grid over the 2-D PCA plane, padded by 1 unit on every side
x_min, x_max = features_pca[:, 0].min() - 1, features_pca[:, 0].max() + 1
y_min, y_max = features_pca[:, 1].min() - 1, features_pca[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))

# Predict on the mesh grid. The model was trained in the original feature
# space, so every grid point is mapped back through the PCA first.
# The fitted estimator is named perceptron_model; the original referenced an
# undefined name `perceptron`, which raised NameError.
grid_points = np.c_[xx.ravel(), yy.ravel()]
grid_features = pd.DataFrame(
    pca_model.inverse_transform(grid_points),
    columns=features.columns,  # same column names the model was fitted with
)
Z = perceptron_model.predict(grid_features)
Z = Z.reshape(xx.shape)

# Plot decision boundary as a translucent overlay on top of the points
plt.contourf(xx, yy, Z, alpha=0.2, cmap='viridis')
plt.title("Perceptron Model Decision Boundary with PCA Reduction")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.colorbar(label='Class Label')
plt.show()
