Project by:
- Jack Chen 4427737
- Joost Litjes 4540700
- Felicia Hung 7568479

In [125]:
import numpy as np
import pandas as pd

import os

import sklearn

from scipy import stats

import plotly.express as px 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio

In [126]:
# Every plotly express figure defaults to a square 600 x 600 px canvas.
px.defaults.width = px.defaults.height = 600

Task 1

In [127]:
# Load the blood transfusion dataset from the notebook's working directory.
db = pd.read_csv("blood_transfusion.csv")
# Last expression of the cell: renders summary statistics (count, mean, std,
# quartiles, min/max) for the numeric columns.
db.describe()

Unnamed: 0,months_since_last_donation,total_number_of_donations,total_blood_donated,months_since_first_donation,class
count,748.0,748.0,748.0,748.0,748.0
mean,9.506684,5.514706,1378.676471,34.282086,0.237968
std,8.095396,5.839307,1459.826781,24.376714,0.426124
min,0.0,1.0,250.0,2.0,0.0
25%,2.75,2.0,500.0,16.0,0.0
50%,7.0,4.0,1000.0,28.0,0.0
75%,14.0,7.0,1750.0,50.0,0.0
max,74.0,50.0,12500.0,98.0,1.0


Task 2

In [128]:
# Columns holding continuous measurements; these are the inputs handed to
# the classifiers and the variables shown in the plots.
numeric_features = [
    "months_since_last_donation", "total_number_of_donations",
    "total_blood_donated", "months_since_first_donation",
]

# Columns holding discrete labels; "class" is the label the classifiers predict.
categorical_features = ["class"]

In [129]:
def exportImage(plot, name):
    """Write a plotly figure to ``plots/<name>.html``.

    Parameters
    ----------
    plot
        Figure object accepted by ``plotly.io.write_html``.
    name : str
        File name without extension.
    """
    # write_html does not create missing parent directories, so make sure the
    # output folder exists; otherwise the first export on a fresh checkout fails.
    os.makedirs("plots", exist_ok=True)
    pio.write_html(plot, os.path.join("plots", name + '.html'))

    # Uncomment to also render the figure inline in the notebook.
    # plot.show()

# Manual normalization function
def normalize_column(column):
    """Min-max scale ``column`` so that its values lie in ``[0, 1]``.

    Parameters
    ----------
    column : pandas.Series or numpy.ndarray
        Numeric values to rescale.

    Returns
    -------
    Same container type as ``column``, with the minimum mapped to 0 and the
    maximum mapped to 1. A constant column (max == min) is mapped to all
    zeros instead of the NaNs a 0/0 division would produce.
    """
    min_val = column.min()
    max_val = column.max()
    value_range = max_val - min_val

    if value_range == 0:
        # Every value equals the minimum: dividing by the zero range would
        # fill the column with NaN and poison every downstream distance.
        return (column - min_val).astype(float)

    return (column - min_val) / value_range

# Rescale every numeric feature column of db with normalize_column,
# one column at a time, and store the result back under the same names.
db[numeric_features] = db[numeric_features].apply(normalize_column)


In [130]:
# Store the label column(s) as strings so the class values can be compared
# against "0" / "1" below.
db = db.astype({col: str for col in db.columns if col in categorical_features})

# The numeric features were already min-max scaled in the previous cell.
# Scaling them again here was an exact no-op on [0, 1] data, so the repeated
# pass has been removed.

# Per-class views of the data, used for the per-class scatter grids.
class_0_df = db[db['class'] == "0"]
class_1_df = db[db['class'] == "1"]

In [131]:
# One box trace per numeric feature; boxmode='group' places the traces side
# by side for each class value on the x axis.
plot_title = "Comparing trends between class 0 and class 1 for Numeric Features"

fig = go.Figure()
for column in numeric_features:
    box_trace = go.Box(x=db['class'], y=db[column], name=column)
    fig.add_trace(box_trace)

fig.update_layout(
    boxmode='group',
    width=len(numeric_features)*200,
    height=400,
    title_text=plot_title,
)
exportImage(fig, plot_title)

In [132]:
 for df_name, data in zip(["class 0", "class 1"], [class_0_df, class_1_df]):
    # Square grid of pairwise scatter plots: one row and one column per
    # numeric feature, drawn separately for each class.
    fig = make_subplots(rows=len(numeric_features), cols=len(numeric_features))

    for i, feature_to_plot_y in enumerate(numeric_features):
        for j, feature_to_plot_x in enumerate(numeric_features):
            trace = go.Scatter(x=data[feature_to_plot_x], y=data[feature_to_plot_y], text="", mode='markers', showlegend=False)
            # The y feature selects the row and the x feature selects the
            # column, so each cell agrees with the axis titles set below.
            # (Previously row and column were swapped, which transposed the
            # data relative to its labels.)
            fig.add_trace(trace, row=i+1, col=j+1)

    # Add x and y labels to the subplots: x titles along the bottom row,
    # y titles down the first column.
    for i, feature in enumerate(numeric_features):
        fig.update_xaxes(title_text=feature, row=len(numeric_features), col=i+1)
        fig.update_yaxes(title_text=feature, row=i+1, col=1)

    fig.update_layout(height=len(numeric_features)*250, width=len(numeric_features)*250, title_text=f"Comparing feature relations with {df_name}")
    exportImage(fig, f"Comparing feature relations with {df_name}")

In [133]:
# Convert categorical features to numerical using one-hot encoding
data_encoded = pd.get_dummies(db, columns=['class'], drop_first=False)

# Names of the dummy columns: everything in the encoded frame that is not a
# numeric feature.
correlation_matrix_categorical = list(data_encoded.columns.difference(numeric_features))

# Pairwise correlations over the numeric features and the class dummies.
correlation_matrix = data_encoded[numeric_features + correlation_matrix_categorical].corr()

# Numeric vs numeric block of the correlation matrix.
numeric_corr = correlation_matrix.loc[numeric_features, numeric_features]
fig = px.imshow(
    numeric_corr,
    labels=dict(x="Numeric Features", y="Numeric Features", color="Correlation"),
    title="Correlation Heatmap of Numerical Features",
)
exportImage(fig, "Correlation Heatmap of Numerical Features")

# Numeric (rows) vs class dummies (columns). px.imshow puts DataFrame columns
# on the x axis and the index on the y axis, so x is the categorical axis and
# y the numeric one (the labels were previously swapped).
numeric_vs_categorical_corr = correlation_matrix.loc[numeric_features, correlation_matrix_categorical]
fig = px.imshow(
    numeric_vs_categorical_corr,
    labels=dict(x="Categorical Features", y="Numeric Features", color="Correlation"),
    title="Correlation Heatmap of Numerical vs Categorical Features",
)
exportImage(fig, "Correlation Heatmap of Numerical vs Categorical Features")


Task 3

In [134]:
from sklearn.model_selection import train_test_split

def split_data(db, train_features, label_feature, test_size, seed = 101):
    """Split ``db`` into train and test arrays.

    Parameters
    ----------
    db : pandas.DataFrame
        Source table.
    train_features : list of str
        Columns used as model inputs.
    label_feature : str
        Column holding the labels.
    test_size
        Forwarded to ``train_test_split`` as ``test_size``.
    seed : int, default 101
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    The ``train_test_split`` result for (features, labels):
    ``X_train, X_test, y_train, y_test``.
    """
    feature_matrix = db[train_features].to_numpy()
    label_vector = db[label_feature].to_numpy()
    return train_test_split(
        feature_matrix,
        label_vector,
        test_size=test_size,
        random_state=seed,
    )

In [135]:
# k-nearest-neighbours classifier using Euclidean distance and majority vote.
# Exposes the fit / predict pair that classifierTests calls on every classifier.
#
# NOTE: documented with comments instead of a class docstring on purpose.
# classifierTests collects every str/int/float member of a classifier via
# inspect.getmembers, so a class docstring would show up there as a
# "__doc__" parameter.
class KNNClassifier:
    def __init__(self, k=3):
        """Create a classifier that votes among the ``k`` nearest neighbours.

        Raises
        ------
        ValueError
            If ``k`` is not a positive integer. ``k = 0`` would leave the vote
            empty and a negative ``k`` would silently slice the wrong end of
            the neighbour list.
        """
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = int(k)

    def fit(self, X_train, y_train):
        """Memorise the training samples and their labels.

        Inputs are converted with ``np.asarray`` so that lists and pandas
        objects are indexed by row position in ``predict``.

        Raises
        ------
        ValueError
            If the training set is empty or the number of samples and labels
            differ.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)

        if len(X_train) == 0:
            raise ValueError("cannot fit on an empty training set")
        if len(X_train) != len(y_train):
            raise ValueError(
                f"X_train has {len(X_train)} samples but y_train has {len(y_train)} labels"
            )

        self.X_train = X_train
        self.y_train = y_train

    def euclidean_distance(self, x1, x2):
        """Return the Euclidean (L2) distance between two points."""
        return np.linalg.norm(x1 - x2)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Returns
        -------
        list
            One predicted label per row. Ties in the vote go to the label of
            the closest neighbour among the tied classes. If ``k`` exceeds the
            number of training samples, all samples vote.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        """
        if not hasattr(self, "X_train"):
            raise AttributeError("KNNClassifier.predict called before fit")

        y_pred = []

        for x in np.asarray(X):
            # Distances go through euclidean_distance so the metric stays in
            # one place.
            distances = [
                self.euclidean_distance(x, train_point)
                for train_point in self.X_train
            ]

            # sorted() is stable: equally distant samples keep training order.
            nearest = sorted(range(len(distances)), key=distances.__getitem__)[:self.k]

            # Count the votes from the k-nearest neighbors. Insertion order
            # follows increasing distance, which is what breaks ties in max().
            class_votes = {}
            for index in nearest:
                label = self.y_train[index]
                class_votes[label] = class_votes.get(label, 0) + 1

            y_pred.append(max(class_votes, key=class_votes.get))

        return y_pred

In [136]:
def confusion_matrix(y_true, y_pred):
    """Tally binary confusion-matrix cells for paired labels.

    Each label is converted with ``int()``; the value 1 is the positive
    class and every other value counts as negative. Pairs are formed with
    ``zip``, so iteration stops at the shorter of the two inputs.

    Returns
    -------
    dict
        Counts under the keys ``"TP"``, ``"TN"``, ``"FP"`` and ``"FN"``.
    """
    counts = {"TP": 0, "TN": 0, "FP": 0, "FN": 0}

    for actual, predicted in zip(y_true, y_pred):
        actual_is_positive = int(actual) == 1
        predicted_is_positive = int(predicted) == 1

        if actual_is_positive:
            cell = "TP" if predicted_is_positive else "FN"
        else:
            cell = "FP" if predicted_is_positive else "TN"
        counts[cell] += 1

    return counts

from sklearn.metrics import classification_report
from sklearn.metrics import fbeta_score

In [137]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
# from sklearn.metrics import confusion_matrix

import random

def classifierTests(db, numeric_features, classifiers, label_feature, test_sizes):
    """Evaluate each classifier on one random train/test split per test size.

    Parameters
    ----------
    db : pandas.DataFrame
        Dataset containing the feature and label columns.
    numeric_features : list of str
        Columns used as model inputs.
    classifiers : list
        Objects exposing ``fit(X, y)`` and ``predict(X)``.
    label_feature : str
        Column holding the labels to predict.
    test_sizes : iterable
        Test-set sizes; each is passed to ``split_data`` in turn.

    Returns
    -------
    dict
        Maps classifier class name to a list of result dicts, one per test
        size, with the keys 'Classifier', 'Test Size', 'Confusion Matrix',
        'Classification Report' and 'F-Beta Score'.

    Notes
    -----
    The split seed is drawn from the global ``random`` module, so results
    differ between runs unless ``random.seed`` is set beforehand.
    """
    results = {}
    for n in test_sizes:
        # A fresh seed per test size; every classifier sees the same split.
        seed = random.randint(0,101)
        # Use the caller's label column (this was hard-coded to "class",
        # silently ignoring the label_feature argument).
        X_train, X_test, y_train, y_test = split_data(db, numeric_features, label_feature, n, seed)

        for classifier in classifiers:
            classifier.fit(X_train, y_train)
            y_pred = classifier.predict(X_test)

            # Compare the predicted labels with the actual labels
            conf_matrix = confusion_matrix(y_test, y_pred)
            class_report = classification_report(y_test, y_pred, output_dict=True)
            fbeta = fbeta_score(y_test, y_pred, average='macro', beta=0.5)

            classifier_name = classifier.__class__.__name__
            results.setdefault(classifier_name, []).append({
                'Classifier': classifier_name,
                'Test Size': n,
                'Confusion Matrix': conf_matrix,
                'Classification Report': class_report,
                'F-Beta Score': fbeta,
            })

    return results


In [138]:
import json

# Define the list of classifiers
# MLPClassifier is now seeded like SVC, so its weight initialisation no
# longer varies between runs.
classifiers = [
    KNNClassifier(),
    GaussianNB(),
    SVC(kernel='linear', random_state=101),
    MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=1000, random_state=101),
]

# Fractions of the dataset held out for testing; one evaluation round each.
test_sizes = [
    0.15,
    0.35,
    0.50,
    0.65,
    0.85,
]

results = classifierTests(db, numeric_features, classifiers, "class", test_sizes)


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with n

In [159]:
import inspect
from sklearn.model_selection import KFold

def classifierTests(db, numeric_features, classifiers, label_feature, n_splits,
                    shuffle=False, random_state=None):
    """Evaluate each classifier with K-fold cross-validation.

    Parameters
    ----------
    db : pandas.DataFrame
        Dataset containing the feature and label columns.
    numeric_features : list of str
        Columns used as model inputs.
    classifiers : list
        Objects exposing ``fit(X, y)`` and ``predict(X)``.
    label_feature : str
        Column holding the labels to predict.
    n_splits : int
        Number of folds.
    shuffle : bool, default False
        Forwarded to ``KFold``. The default keeps the original behaviour of
        contiguous, unshuffled folds; pass True when the rows of ``db`` are
        ordered, so every fold sees a comparable class mix.
    random_state : int or None, default None
        Forwarded to ``KFold``; only meaningful together with ``shuffle=True``.

    Returns
    -------
    dict
        Maps classifier class name to a list of result dicts (one per fold
        and classifier instance) with the keys 'classifier', 'fold',
        'parameters', 'confusion_matrix', 'classification_report' and
        'f_beta_score'.
    """
    results = {}

    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle)

    # Convert once; every fold just indexes into these arrays.
    features = db[numeric_features].to_numpy()
    labels = db[label_feature].to_numpy()

    for i, (train_index, test_index) in enumerate(kf.split(features)):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        for classifier in classifiers:
            classifier.fit(X_train, y_train)
            y_pred = classifier.predict(X_test)

            conf_matrix = confusion_matrix(y_test, y_pred)
            class_report = classification_report(y_test, y_pred, output_dict=True)
            fbeta = fbeta_score(y_test, y_pred, average='macro', beta=0.5)

            # Record the classifier's scalar settings (e.g. k for
            # KNNClassifier). Underscore-prefixed members are skipped so
            # interpreter metadata such as __doc__ or __module__ is not
            # reported as a parameter.
            attributes = {
                attr_name: attr_value
                for attr_name, attr_value in inspect.getmembers(classifier)
                if not attr_name.startswith("_")
                and isinstance(attr_value, (int, float, str))
            }

            classifier_name = classifier.__class__.__name__
            results.setdefault(classifier_name, []).append({
                'classifier': classifier_name,
                'fold': i,
                'parameters': attributes,
                'confusion_matrix': conf_matrix,
                'classification_report': class_report,
                'f_beta_score': fbeta,
            })

    return results

In [163]:
# One KNN classifier per neighbourhood size k = 2..6.
classifiers = [KNNClassifier(k) for k in range(2, 7)]

results = classifierTests(db, numeric_features, classifiers, "class", 4)

# Flatten the nested result dicts into one row per (classifier, fold).
# pd.concat replaces DataFrame.append, which is deprecated and removed in
# pandas 2.0.
result_df = pd.concat(
    [pd.json_normalize(runs, sep='_') for runs in results.values()],
    ignore_index=True,
)
result_df.head()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Unnamed: 0,classifier,fold,f_beta_score,parameters_k,confusion_matrix_TP,confusion_matrix_TN,confusion_matrix_FP,confusion_matrix_FN,classification_report_0_precision,classification_report_0_recall,...,classification_report_1_support,classification_report_accuracy,classification_report_macro avg_precision,classification_report_macro avg_recall,classification_report_macro avg_f1-score,classification_report_macro avg_support,classification_report_weighted avg_precision,classification_report_weighted avg_recall,classification_report_weighted avg_f1-score,classification_report_weighted avg_support
0,KNNClassifier,0,0.55039,2,41,63,48,35,0.642857,0.567568,...,76,0.55615,0.551766,0.553521,0.54992,187,0.568815,0.55615,0.559831,187
1,KNNClassifier,0,0.562986,3,40,67,44,36,0.650485,0.603604,...,76,0.572193,0.563338,0.56496,0.563084,187,0.579649,0.572193,0.574891,187
2,KNNClassifier,0,0.548961,4,40,64,47,36,0.64,0.576577,...,76,0.55615,0.549885,0.551446,0.548716,187,0.566751,0.55615,0.559557,187
3,KNNClassifier,0,0.591727,5,37,77,34,39,0.663793,0.693694,...,76,0.609626,0.59246,0.590268,0.590908,187,0.605811,0.609626,0.607286,187
4,KNNClassifier,0,0.603019,6,38,78,33,38,0.672414,0.702703,...,76,0.620321,0.603813,0.601351,0.602116,187,0.616652,0.620321,0.618045,187


In [164]:
# Column groups of the flattened results table; each group is drawn
# together in a single line chart.
score_groups = [
    # Raw confusion-matrix counts.
    [
        "confusion_matrix_TP",
        "confusion_matrix_TN",
        "confusion_matrix_FP",
        "confusion_matrix_FN",
    ],
    # Per-class precision / recall / F1, plus the F-beta score.
    [
        "classification_report_0_precision",
        "classification_report_0_recall",
        "classification_report_0_f1-score",
        "classification_report_1_precision",
        "classification_report_1_recall",
        "classification_report_1_f1-score",
        "f_beta_score",
    ],
    # Accuracy and the macro / weighted averages, plus the F-beta score.
    [
        "classification_report_accuracy",
        "classification_report_macro avg_precision",
        "classification_report_macro avg_recall",
        "classification_report_macro avg_f1-score",
        "classification_report_weighted avg_precision",
        "classification_report_weighted avg_recall",
        "classification_report_weighted avg_f1-score",
        "f_beta_score",
    ],
]

In [165]:
# Average every numeric score over the folds for each k. numeric_only=True
# makes the exclusion of the string 'classifier' column explicit instead of
# relying on pandas silently dropping it (an error in pandas >= 2.0).
result_df = result_df.groupby('parameters_k').mean(numeric_only=True).reset_index()

# One line chart per score group, with k on the x axis.
for yList in score_groups:
    fig = px.line(result_df, x='parameters_k', y=yList,
                markers=True, title='Scores by Parameters K',
                labels={'value': 'Score'})
    exportImage(fig, f'Scores by Parameters K {yList[0]}')

In [167]:
# Per-fold scores for a single KNN classifier with k = 3.
results = classifierTests(db, numeric_features, [KNNClassifier(k=3)], "class", 4)

# Flatten the nested result dicts into one row per fold. pd.concat replaces
# DataFrame.append, which is deprecated and removed in pandas 2.0.
result_df = pd.concat(
    [pd.json_normalize(runs, sep='_') for runs in results.values()],
    ignore_index=True,
)

# numeric_only=True makes the exclusion of the string 'classifier' column
# explicit instead of relying on pandas silently dropping it.
result_df = result_df.groupby('fold').mean(numeric_only=True).reset_index()

# One line chart per score group, with the fold index on the x axis.
for yList in score_groups:
    fig = px.line(result_df, x='fold', y=yList,
                markers=True, title='Scores by Fold',
                labels={'value': 'Score'})
    exportImage(fig, f'Scores by Fold {yList[0]}')


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

