# COMP3330 - Individual Assignment - Nathan Vilmen

## Question 2: BUPA Liver Disorders Dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_csv('bupa.data', header=None, names=['var1', 'var2', 'var3', 'var4', 'var5', 'alcohol_consumption', 'train_test_split'])

# Remove the seventh variable
data = data.drop('train_test_split', axis=1)
print(data)

# Create a new binary target variable
data['target'] = (data['alcohol_consumption'] >= 3).astype(int)

# Check the distribution of the target variable
print(data['target'].value_counts())

# we assume here that the 2 classes are relatively balanced, as stated in the assignment sheet.
# we can verify this by computing the data['target'].value_counts(), where we see 176 '1' and 169 '0'.


     var1  var2  var3  var4  var5  alcohol_consumption
0      85    92    45    27    31                  0.0
1      85    64    59    32    23                  0.0
2      86    54    33    16    54                  0.0
3      91    78    34    24    36                  0.0
4      87    70    12    28    10                  0.0
..    ...   ...   ...   ...   ...                  ...
340    99    75    26    24    41                 12.0
341    96    69    53    43   203                 12.0
342    98    77    55    35    89                 15.0
343    91    68    27    26    14                 16.0
344    98    99    57    45    65                 20.0

[345 rows x 6 columns]
1    176
0    169
Name: target, dtype: int64


### Split the dataset into training and testing datasets

In [2]:
# with X as not-normalized values

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import numpy as np

X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Initialize the scaler and SVM model
scaler = StandardScaler()
clf = SVC(kernel='linear')

# Initialize KFold with 10 folds
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Initialize lists to store accuracy and predictions for each fold
acc_scores = []
y_preds = []

# Loop over each fold
for i, (train_idx, test_idx) in enumerate(kfold.split(X, y)):
    print(f'Fold {i+1}:')
    
    # Split the data into training and test sets for this fold
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # Scale the input features for the training and test sets
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Train the model on the training set
    clf.fit(X_train_scaled, y_train)
    
    # Predict the target variable for the test set
    y_pred = clf.predict(X_test_scaled)
    
    # Calculate the accuracy score for this fold
    acc_score = accuracy_score(y_test, y_pred)
    acc_scores.append(acc_score)
    
    # Store the predicted values for this fold
    y_preds.append(y_pred)
    
    print(f'Accuracy score: {acc_score:.4f}\n')

# Compute the average and standard deviation of the accuracy scores across all folds
mean_acc = np.mean(acc_scores)
std_acc = np.std(acc_scores)

print(f'Average accuracy score: {mean_acc:.4f}')
print(f'Standard deviation of accuracy score: {std_acc:.4f}')


Fold 1:
Accuracy score: 1.0000

Fold 2:
Accuracy score: 1.0000

Fold 3:
Accuracy score: 1.0000

Fold 4:
Accuracy score: 1.0000

Fold 5:
Accuracy score: 1.0000

Fold 6:
Accuracy score: 1.0000

Fold 7:
Accuracy score: 1.0000

Fold 8:
Accuracy score: 1.0000

Fold 9:
Accuracy score: 1.0000

Fold 10:
Accuracy score: 1.0000

Average accuracy score: 1.0000
Standard deviation of accuracy score: 0.0000


In [4]:
# with X as normalized values


from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix


# Normalize the explanatory variables
scaler = StandardScaler()
X = scaler.fit_transform(data.iloc[:, :-1])
y = data.iloc[:, -1]

# Set up a dictionary of parameters to define our grid to search over
#parameters = {
#    'kernel': ['linear', 'rbf'],
#    'C': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10],
#    'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10]
#}
# We will create an instance of GridSearchCV, thereby passing our estimator and our parameter grid
#clf = GridSearchCV(estimator=SVC(), param_grid=parameters)

# Initialize the SVM model
clf = svm.SVC(C=5, gamma=0.0001, kernel='linear')

# Initialize KFold with 10 folds
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Initialize lists to store accuracy and predictions for each fold
acc_scores = []
y_preds = []

# Loop over each fold
for i, (train_idx, test_idx) in enumerate(kfold.split(X, y)):
    print(f'Fold {i+1}:')
    
    # Split the data into training and test sets for this fold
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    # Train the model on the training set
    clf.fit(X_train, y_train)
    #print("Best parameters: {}".format(clf.best_params_))
    
    # Predict the target variable for the test set
    y_pred = clf.predict(X_test)
    
    # Calculate the accuracy score for this fold
    acc_score = accuracy_score(y_test, y_pred)
    acc_scores.append(acc_score)
    
    # Store the predicted values for this fold
    y_preds.append(y_pred)
    
    # Print the accuracy score, along with a confusion matrix to describe the performance of the classifier
    print(f'Accuracy score: {acc_score:.4f}\nConfusion matrix:\n', confusion_matrix(y_test, y_pred))

# Compute the average and standard deviation of the accuracy scores across all folds
mean_acc = np.mean(acc_scores)
std_acc = np.std(acc_scores)

print(f'Average accuracy score: {mean_acc:.4f}')
print(f'Standard deviation of accuracy score: {std_acc:.4f}')


Fold 1:
Accuracy score: 1.0000
Confusion matrix:
 [[16  0]
 [ 0 19]]
Fold 2:
Accuracy score: 1.0000
Confusion matrix:
 [[21  0]
 [ 0 14]]
Fold 3:
Accuracy score: 1.0000
Confusion matrix:
 [[18  0]
 [ 0 17]]
Fold 4:
Accuracy score: 1.0000
Confusion matrix:
 [[18  0]
 [ 0 17]]
Fold 5:
Accuracy score: 1.0000
Confusion matrix:
 [[12  0]
 [ 0 23]]
Fold 6:
Accuracy score: 1.0000
Confusion matrix:
 [[17  0]
 [ 0 17]]
Fold 7:
Accuracy score: 1.0000
Confusion matrix:
 [[14  0]
 [ 0 20]]
Fold 8:
Accuracy score: 1.0000
Confusion matrix:
 [[18  0]
 [ 0 16]]
Fold 9:
Accuracy score: 1.0000
Confusion matrix:
 [[21  0]
 [ 0 13]]
Fold 10:
Accuracy score: 1.0000
Confusion matrix:
 [[14  0]
 [ 0 20]]
Average accuracy score: 1.0000
Standard deviation of accuracy score: 0.0000
