## Supervised Classification

Four popular supervised machine-learning techniques for classification are compared: decision trees, k-nearest neighbors, naïve Bayes, and support vector machines. Several datasets from the UCI Machine Learning Repository (https://archive.ics.uci.edu/ml/) are used.

In [1]:
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Identifiers of the datasets to evaluate, in the order they are processed.
datasets = [
    'breast_cancer',
    'hyperthyroidism',
    'cervical_cancer',
    'liver_cancer',
]

In [3]:
# Estimators under comparison, keyed by the display name that is printed next
# to each result. Seeds are fixed where the estimator accepts one.
# NOTE(review): K-NN and the RBF SVM receive unscaled features; the SVM result
# in the captured output equals the majority-class rate - consider scaling.
classifiers = {
    'Decision-Tree': DecisionTreeClassifier(random_state=1, max_depth=5),
    'K-NN': KNeighborsClassifier(n_neighbors=3),
    'Naive-Bayes': GaussianNB(),
    'SVM': SVC(C=1, gamma=2, kernel='rbf', random_state=1),
}

### Load Data & Preprocessing

In [4]:
def read_data(dataset_id):
    """Load one of the study datasets from the working directory.

    Args:
        dataset_id: One of 'breast_cancer', 'hyperthyroidism' or
            'cervical_cancer'. Any other value (including 'liver_cancer')
            loads the Indian Liver Patient dataset.

    Returns:
        A pandas DataFrame in which '?' placeholders are NaN, values that
        parse as numbers are numeric, and all other values keep their
        original (string) form.
    """
    if dataset_id == 'breast_cancer':
        data = pd.read_csv('wdbc.data', header=None)

    elif dataset_id == 'hyperthyroidism':
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to stack the two files into one frame.
        data = pd.concat(
            [
                pd.read_csv('allhyper.data', header=None),
                pd.read_csv('allhyper.test', header=None),
            ],
            ignore_index=True,
        )
        # Keep only the part of the label that precedes the first '.'.
        label_column = data.columns[-1]
        data[label_column] = data[label_column].str.split('.').str[0]

    elif dataset_id == 'cervical_cancer':
        data = pd.read_csv('risk_factors_cervical_cancer.csv')
        # Drop the diagnosis columns and the alternative screening targets so
        # that the last remaining column is the outcome.
        data = data.drop(
            ['Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx',
             'Hinselmann', 'Schiller', 'Citology'],
            axis=1,
        )

    else:
        data = pd.read_csv('Indian Liver Patient Dataset (ILPD).csv', header=None)

    # Mark '?' as missing *before* the numeric conversion so that columns
    # holding only numbers and '?' end up with a numeric dtype.
    data = data.replace('?', float('nan'))
    # Convert what parses as a number; restore the original value wherever
    # the conversion failed (genuinely textual cells).
    data = data.apply(pd.to_numeric, errors='coerce').fillna(data)

    return data

### Return Outcome Values

Returns a dictionary that maps each possible outcome value (class label) to the number of times it occurs in the dataset, i.e. the class distribution.

In [5]:
def outcome_values(dataset_id, dataset):
    """Return the class distribution of a dataset.

    The outcome lives in column 1 for 'breast_cancer' and in the last column
    for every other dataset.

    Returns:
        dict mapping each outcome value to its number of occurrences.
    """
    label_column = 1 if dataset_id == 'breast_cancer' else dataset.columns[-1]
    return dataset[label_column].value_counts().to_dict()

### Model Training

In [6]:
def classify(dataset_id, dataset, estimator=None):
    """Train a classifier on one dataset and predict its held-out split.

    Args:
        dataset_id: Dataset identifier; selects the dataset-specific
            preprocessing and where the label column is located.
        dataset: DataFrame holding features and the outcome column.
        estimator: Optional object exposing ``fit`` and ``predict``. When
            omitted, the module-level variable ``model`` is used, which keeps
            the original calling convention working.

    Returns:
        A ``(predictions, actual)`` tuple of flat lists for the 20% test
        split (split with ``random_state=0``).
    """
    # Resolved lazily: the global is only read when no estimator is passed.
    clf = model if estimator is None else estimator

    if dataset_id == 'breast_cancer':
        # The label is column 1; every other column is a feature.
        features = dataset.drop(1, axis=1)
        target = dataset[1]
    else:
        if dataset_id == 'hyperthyroidism':
            # One-hot encode the second-to-last column, then map the
            # remaining sex / boolean flags to 0/1.
            prepared = pd.concat(
                [
                    dataset.iloc[:, :-2],
                    pd.get_dummies(dataset.iloc[:, -2]),
                    dataset.iloc[:, -1],
                ],
                axis=1,
            )
            prepared = prepared.replace({'F': 0, 'M': 1, 'f': 0, 't': 1})
        elif dataset_id == 'liver_cancer':
            prepared = dataset.replace({'Female': 0, 'Male': 1})
        else:
            prepared = dataset
        features = prepared.iloc[:, :-1]
        # Select the label as a Series (not a one-column frame) so that y is
        # a flat list, consistent with the breast_cancer branch.
        target = prepared.iloc[:, -1]

    X = features.values.tolist()
    y = target.values.tolist()

    # Split dataset to training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    # Impute missing values with the column means of the training split only,
    # so no statistics from the test split are used.
    imputer = SimpleImputer(missing_values=float('nan'), strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Training & testing
    clf.fit(X_train, y_train)

    # Predictions
    predictions = clf.predict(X_test).tolist()
    actual = y_test

    return predictions, actual

### Evaluation

In [7]:
def evaluate(labels, gold, predictions):
    """Score predictions overall and separately for every label.

    Args:
        labels: Iterable of outcome values to report on (iterating a dict
            yields its keys).
        gold: True outcome values.
        predictions: Predicted outcome values, aligned with ``gold``.

    Returns:
        dict with key 'accuracy' plus one entry per label holding that
        label's 'precision', 'recall' and 'f1', rounded to two decimals.
    """
    evaluation = {'accuracy': accuracy_score(gold, predictions)}

    for label in labels:
        # Restricting the metric to a single label yields that label's own
        # score for binary and multi-class targets alike. The previous
        # fallback stored one identical macro average under every label,
        # because pos_label is ignored when average='macro'.
        only_this_label = {'labels': [label], 'average': 'macro'}
        evaluation[label] = {
            'precision': round(float(precision_score(gold, predictions, **only_this_label)), 2),
            'recall': round(float(recall_score(gold, predictions, **only_this_label)), 2),
            'f1': round(float(f1_score(gold, predictions, **only_this_label)), 2),
        }

    return evaluation

In [8]:
# Evaluate every classifier on every dataset and print the scores.
for dataset_id in datasets:
    data = read_data(dataset_id)

    separator = '* ' * 64
    print(separator)
    print(f'DATASET: {dataset_id}')
    labels = outcome_values(dataset_id, data)
    print('LABELS: ', labels)

    # The loop variable must stay named `model`: classify() is called without
    # an estimator argument and picks the classifier up from this global.
    for name, model in classifiers.items():
        predictions, gold = classify(dataset_id, data)
        report = evaluate(labels, gold, predictions)
        print(f'{name} {report}', '\n')

* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 
DATASET: breast_cancer
LABELS:  {'B': 357, 'M': 212}
Decision-Tree {'accuracy': 0.9473684210526315, 'B': {'precision': 0.96, 'recall': 0.96, 'f1': 0.96}, 'M': {'precision': 0.94, 'recall': 0.94, 'f1': 0.94}} 

K-NN {'accuracy': 0.7894736842105263, 'B': {'precision': 0.76, 'recall': 0.94, 'f1': 0.84}, 'M': {'precision': 0.87, 'recall': 0.57, 'f1': 0.69}} 

Naive-Bayes {'accuracy': 0.5964912280701754, 'B': {'precision': 0.59, 'recall': 0.99, 'f1': 0.74}, 'M': {'precision': 0.67, 'recall': 0.04, 'f1': 0.08}} 

SVM {'accuracy': 0.5877192982456141, 'B': {'precision': 0.59, 'recall': 1.0, 'f1': 0.74}, 'M': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}} 

* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 
DATASET: hyperthyroidism
LABELS:  {'negative': 3670, 'hyperthyroid': 79, 'goitre': 12,