In [None]:
#import all the necessary libraries
import pandas as pd
import numpy as np
from numpy import vstack
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
from pickle import dump, load
import torch
from torch import nn
from torch.utils.data import Dataset, random_split, DataLoader

In [None]:
class MushroomsData(Dataset):
  """Torch ``Dataset`` built from a mushrooms CSV file.

  The first CSV column is treated as the class label; the remaining columns,
  minus two dropped features, are treated as categorical inputs. Inputs are
  ordinal-encoded and labels label-encoded, and both are stored as float32
  NumPy arrays (``self.X`` is 2-D, ``self.y`` is a column vector).
  """

  def __init__(self, path, encoder_path="/content/drive/MyDrive/dataset/encoder.pkl"):
    """Read, encode and store the data, then pickle the fitted feature encoder.

    Args:
      path: location of the CSV file to read.
      encoder_path: where the fitted ``OrdinalEncoder`` is pickled. Defaults
        to the location the notebook has always used.
    """
    data = pd.read_csv(path)
    # Drop the stalk-root (position 11) and veil-type (position 16) features.
    # Both positions refer to the frame as read; dropping them in one call
    # avoids the index shift that a second positional lookup would have to
    # account for after the first drop.
    data = data.drop(columns=data.columns[[11, 16]])
    encoder = OrdinalEncoder()
    label_encoder = LabelEncoder()
    values = data.values
    self.X = encoder.fit_transform(values[:, 1:]).astype("float32")
    self.y = label_encoder.fit_transform(values[:, 0]).astype("float32")
    # Column vector so targets line up with the (batch, 1) model output.
    self.y = self.y.reshape(len(self.y), 1)
    # Only the feature encoder is persisted (the label encoder is not).
    # The context manager guarantees the pickle is flushed and the handle
    # is closed even if dump() raises.
    with open(encoder_path, "wb") as encoder_file:
      dump(encoder, encoder_file)

  def __len__(self):
    """Return the number of samples."""
    return len(self.y)

  def __getitem__(self, idx):
    """Return ``[features, label]`` for the sample(s) selected by ``idx``."""
    return [self.X[idx], self.y[idx]]

  def split_data(self):
    """Randomly split into train and test subsets (80% / 20%).

    Returns:
      The list of two ``Subset`` objects produced by ``random_split``:
      training subset first, test subset second.
    """
    test_size = round(0.2 * len(self.X))
    train_size = len(self.X) - test_size
    return random_split(self, [train_size, test_size])

In [None]:
#pre-process the data
def prepare_data(path):
  """Build the train and test ``DataLoader`` objects for the CSV at ``path``.

  The dataset is loaded through ``MushroomsData`` and split with its
  ``split_data`` method. Both loaders use a batch size of 32; only the
  training loader shuffles.

  Returns:
    A ``(train_dl, test_dl)`` tuple.
  """
  train_subset, test_subset = MushroomsData(path).split_data()
  shuffled_train_loader = DataLoader(train_subset, shuffle=True, batch_size=32)
  ordered_test_loader = DataLoader(test_subset, batch_size=32)
  return shuffled_train_loader, ordered_test_loader
 
#train the model on training data
def train_model(train_dl, model, epochs=300, lr=0.001, log_every=20):
  """Train ``model`` in place with binary cross-entropy loss and plain SGD.

  Args:
    train_dl: iterable yielding ``(inputs, targets)`` batches; targets must
      have the same shape as the model output, as ``nn.BCELoss`` requires.
    model: ``nn.Module`` whose output is a probability in [0, 1].
    epochs: number of full passes over ``train_dl`` (default 300).
    lr: SGD learning rate (default 0.001).
    log_every: print the epoch number every this many epochs (default 20);
      a falsy value disables the progress output.

  Returns:
    None. The model's parameters are updated in place.
  """
  criterion = nn.BCELoss()
  optimizer = torch.optim.SGD(model.parameters(), lr=lr)
  # Make sure mode-dependent layers (dropout, batch norm) behave as in training,
  # even if the caller left the model in eval mode.
  model.train()
  for epoch in range(epochs):
    for inputs, targets in train_dl:
      optimizer.zero_grad()  # gradients accumulate by default; reset per batch
      loss = criterion(model(inputs), targets)
      loss.backward()
      optimizer.step()
    if log_every and epoch % log_every == 0:
      print("epoch: ", epoch)
 
#test the model on test data
def evaluate_model(test_dl, model):
  """Compute classification accuracy of ``model`` over ``test_dl``.

  Model outputs are rounded to 0/1 and compared with the targets using
  ``accuracy_score``. Inference runs with gradient tracking disabled and
  with the model in eval mode; the model's previous train/eval mode is
  restored before returning.

  Args:
    test_dl: iterable yielding ``(inputs, targets)`` batches.
    model: ``nn.Module`` producing one probability per sample.

  Returns:
    The accuracy as returned by ``accuracy_score``.

  Raises:
    ValueError: if ``test_dl`` yields no batches.
  """
  batch_predictions, batch_actuals = [], []
  was_training = model.training
  model.eval()  # dropout / batch-norm layers must use inference behaviour
  try:
    with torch.no_grad():  # no autograd graph is needed for evaluation
      for inputs, targets in test_dl:
        yhat = model(inputs).detach().numpy().round()
        actual = targets.numpy()
        batch_predictions.append(yhat)
        batch_actuals.append(actual.reshape(len(actual), 1))
  finally:
    # Leave the model in the mode the caller had it in.
    model.train(was_training)
  if not batch_predictions:
    raise ValueError("test_dl yielded no batches; cannot compute accuracy")
  predictions, actuals = vstack(batch_predictions), vstack(batch_actuals)
  return accuracy_score(actuals, predictions)

In [None]:
#define a model
# Fully connected binary classifier: 20 input features, three hidden layers
# (30, 20 and 10 units, each followed by ReLU) and one sigmoid output unit.
model = nn.Sequential(
  # hidden layer 1
  nn.Linear(in_features=20, out_features=30),
  nn.ReLU(),
  # hidden layer 2
  nn.Linear(in_features=30, out_features=20),
  nn.ReLU(),
  # hidden layer 3
  nn.Linear(in_features=20, out_features=10),
  nn.ReLU(),
  # output layer: a single probability per sample
  nn.Linear(in_features=10, out_features=1),
  nn.Sigmoid(),
)

In [None]:
# End-to-end run: load the data, train the module-level `model`, report accuracy.
# NOTE(review): the path looks like a mounted Google Drive folder — confirm it
# exists in the environment where this notebook is executed.
path = "/content/drive/MyDrive/dataset/mushrooms.csv" #path for the dataset
# Loads and encodes the CSV, then returns the train and test DataLoaders.
train_dl, test_dl = prepare_data(path) #pre-process and split the data to train and test set
# Updates `model` in place; prints the epoch number periodically while training.
train_model(train_dl, model) #train the model
# Accuracy on the test loader only, using the model as trained above.
accuracy = evaluate_model(test_dl, model) #test the model
# No random seed is set in this cell, so the split, the weight initialisation
# and therefore the printed accuracy can differ between runs.
print(accuracy)

epoch:  0
epoch:  20
epoch:  40
epoch:  60
epoch:  80
epoch:  100
epoch:  120
epoch:  140
epoch:  160
epoch:  180
epoch:  200
epoch:  220
epoch:  240
epoch:  260
epoch:  280
0.9963076923076923


In [None]:
# Persist only the learned parameters (the state_dict), not the module object.
# To reuse them, rebuild an nn.Sequential with the same layers and call
# load_state_dict() on it with the loaded file.
torch.save(model.state_dict(), "/content/drive/MyDrive/dataset/model.pth") #save the model