In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [2]:
# Load dataset
df = pd.read_csv('Training Dataset/training datalist.csv')

# data cleaning
df['PPD'].fillna(0, inplace=True)
df.dropna(subset=['Voice handicap index - 10'], inplace=True)
df.reset_index(drop=True, inplace=True)

columns_to_drop = ['ID', 'Disease category', 'Diabetes', 'Hypertension', 'CAD', 'Head and Neck Cancer', 'Head injury', 'CVA', 'Onset of dysphonia ', 'Diurnal pattern']
# get certain columns as features
features = df.columns.drop(columns_to_drop).to_list()
# print(features)

In [3]:
# split data into train and test
from sklearn.model_selection import train_test_split

X = df[features]
y = df['Disease category']

# split the data with label evenly distributed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,stratify=y, random_state=101)

# Print the class distribution in the training set
unique_labels, counts = np.unique(y_train, return_counts=True)
for label, count in zip(unique_labels, counts):
    print(f"Label {label}: {count} samples")

Label 1: 372 samples
Label 2: 153 samples
Label 3: 117 samples
Label 4: 31 samples
Label 5: 22 samples


In [4]:
# Define your custom dataset class
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y - 1
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        try:
            tup = self.X[index].astype(np.float32), self.y[index].astype(np.int64)
        except:
            import ipdb
            ipdb.set_trace()
        return tup

# Define your model class
class Classifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(hidden_size, hidden_size*4)
        self.fc3 = nn.Linear(hidden_size*4, num_classes)
        
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out


In [5]:
# Define hyperparameters
input_size = 18
hidden_size = 20
num_classes = 5
learning_rate = 0.001
batch_size = 32
num_epochs = 50

In [6]:
# Create your dataset and dataloader
train_dataset = CustomDataset(X_train.values, y_train.values)  # Replace X and y with your own data
test_dataset = CustomDataset(X_test.values, y_test.values)  # Replace X and y with your own data
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [7]:
# Create your model
model = Classifier(input_size, hidden_size, num_classes)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [8]:
# Training loop
for epoch in range(num_epochs):
    for batch_idx, (inputs, labels) in enumerate(train_dataloader):
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Print training progress
        if (batch_idx+1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_dataloader)}], Loss: {loss.item():.4f}")

Epoch [1/50], Step [10/22], Loss: 1.3876
Epoch [1/50], Step [20/22], Loss: 1.2799
Epoch [2/50], Step [10/22], Loss: 1.2382
Epoch [2/50], Step [20/22], Loss: 1.1173
Epoch [3/50], Step [10/22], Loss: 1.4101
Epoch [3/50], Step [20/22], Loss: 1.2449
Epoch [4/50], Step [10/22], Loss: 1.2664
Epoch [4/50], Step [20/22], Loss: 1.1704
Epoch [5/50], Step [10/22], Loss: 1.2380
Epoch [5/50], Step [20/22], Loss: 1.0985
Epoch [6/50], Step [10/22], Loss: 1.0712
Epoch [6/50], Step [20/22], Loss: 0.9054
Epoch [7/50], Step [10/22], Loss: 1.1080
Epoch [7/50], Step [20/22], Loss: 1.4039
Epoch [8/50], Step [10/22], Loss: 1.3415
Epoch [8/50], Step [20/22], Loss: 1.1690
Epoch [9/50], Step [10/22], Loss: 1.1338
Epoch [9/50], Step [20/22], Loss: 1.1422
Epoch [10/50], Step [10/22], Loss: 0.9198
Epoch [10/50], Step [20/22], Loss: 0.9277
Epoch [11/50], Step [10/22], Loss: 0.7818
Epoch [11/50], Step [20/22], Loss: 0.9087
Epoch [12/50], Step [10/22], Loss: 1.2322
Epoch [12/50], Step [20/22], Loss: 0.9670
Epoch [13/

In [9]:
# Evaluation
model.eval()
with torch.no_grad():
    total_correct = 0
    total_samples = 0
    for inputs, labels in test_dataloader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total_samples += labels.size(0)
        total_correct += (predicted == labels).sum().item()
    
    accuracy = total_correct / total_samples
    print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.6644


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

model.eval()
with torch.no_grad():
    total_correct = 0
    total_samples = 0
    predicted_labels = []
    targets = []

    for inputs, labels in test_dataloader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total_samples += labels.size(0)
        total_correct += (predicted == labels).sum().item()

        predicted_labels.extend(predicted.tolist())
        targets.extend(labels.tolist())

    accuracy = accuracy_score(targets, predicted_labels)
    precision = precision_score(targets, predicted_labels, average='weighted', zero_division=1)
    recall = recall_score(targets, predicted_labels, average='weighted', zero_division=1)
    f1 = f1_score(targets, predicted_labels, average='weighted', zero_division=1)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 score: {f1:.4f}")

    print(classification_report(targets, predicted_labels, zero_division=1))
    print(confusion_matrix(targets, predicted_labels))

Accuracy: 0.6644
Precision: 0.6479
Recall: 0.6644
F1 score: 0.6221
              precision    recall  f1-score   support

           0       0.71      0.90      0.79       160
           1       0.65      0.34      0.44        65
           2       0.56      0.64      0.60        50
           3       0.00      0.00      0.00        13
           4       1.00      0.00      0.00        10

    accuracy                           0.66       298
   macro avg       0.58      0.38      0.37       298
weighted avg       0.65      0.66      0.62       298

[[144   6   8   2   0]
 [ 30  22  13   0   0]
 [ 13   5  32   0   0]
 [  8   1   4   0   0]
 [  9   0   0   1   0]]
