In [None]:
#from numpy.core.numeric import infty
!pip install numpy
!pip install pandas
!pip install plotly

!pip install scikit-learn # the maintained package name (the "sklearn" pip package is deprecated)

TASK 1

In [None]:
import pandas as pd
# Load the blood transfusion dataset. read_csv already returns a
# pandas.DataFrame, so no extra DataFrame(...) wrapper is needed.
frame = pd.read_csv("./blood_transfusion.csv")
frame.describe(include='all') # 5 columns, 4 numerical, class is nominal (1 or 0)

In [None]:
# TODO: visualise, understand, story

TASK 2

In [None]:
# Min-max normalization: rescale every column with (x - min) / (max - min).
# The class column goes through the same formula, but because it only holds
# 0 and 1 its values are left unchanged.
column_min = frame.min()
column_span = frame.max() - column_min
preprocessed = (frame - column_min) / column_span
preprocessed.describe()

TASK 3

In [None]:
from sklearn.model_selection import train_test_split
# Separate the feature columns from the target once; both splits reuse them.
features_all = preprocessed.drop('class', axis=1)
target_all = preprocessed['class']

# "_s" split: small test set (test_size=0.33).
X_train_s, X_test_s, Y_train_s, Y_test_s = train_test_split(
    features_all, target_all, test_size=0.33, random_state=1)

# "_l" split: large test set (test_size=0.66).
X_train_l, X_test_l, Y_train_l, Y_test_l = train_test_split(
    features_all, target_all, test_size=0.66, random_state=1)
# random_state is fixed in both calls so the splits are reproducible.

TASK 4 & 5

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import fbeta_score

#5.1 Confusion matrix
def confusionmatrix(predicted_y, real_y):
    """
    Counts true/false positives and negatives for 0/1 labels and prints them
    as a small table (columns P and N, rows T and F).
    :param predicted_y: list: predicted classes for the test data
    :param real_y: list: real classes of the test data, in the same order
    :return: prints the confusion matrix, returns nothing
    """
    true_pos = false_pos = true_neg = false_neg = 0
    for position, _ in enumerate(predicted_y):
        guess = predicted_y[position]
        if guess == 1:
            is_positive = True
        elif guess == 0:
            is_positive = False
        else:
            # Predictions other than 0 or 1 are not counted anywhere.
            continue

        if guess == real_y[position]:
            if is_positive:
                true_pos += 1
            else:
                true_neg += 1
        elif is_positive:
            false_pos += 1
        else:
            false_neg += 1
    print(f"\t P \t\t N \n T \t {true_pos} \t\t {true_neg} \n F \t {false_pos} \t\t {false_neg} \n")

def accuracy_checker(X_test, y_test, pred, mode="basic"):
    """
    Evaluates a fitted classifier on held-out data and prints the chosen metric.
    :param X_test: pandas.DataFrame: the test data to be classified
    :param y_test: pandas.Series: the real classes of the test data, sharing X_test's index
    :param pred: fitted predictor: any object whose predict(X) returns one label per row of X
    :param mode: string: way to evaluate the results; one of "basic",
                 "classification_report", "confusion_matrix" or "fbeta"
    :return: prints the results, returns nothing
    :raises ValueError: if mode is not one of the supported values or X_test has no rows
    """
    supported_modes = ("basic", "classification_report", "confusion_matrix", "fbeta")
    if mode not in supported_modes:
        # Fail before doing any prediction work instead of silently printing nothing.
        raise ValueError(f"Unknown mode {mode!r}; expected one of {supported_modes}")

    total = X_test.shape[0]
    if total == 0:
        raise ValueError("X_test is empty; there is nothing to evaluate")

    # Predict the whole frame in one call and keep flat lists of scalar labels.
    # Storing one-element containers per row (as a per-row predict would yield)
    # breaks scalar comparisons such as `label == 1` in confusionmatrix.
    predicted = list(pred.predict(X_test))
    # Align the real labels with the rows of X_test by index label.
    true = y_test.loc[X_test.index].tolist()

    if mode == "basic":
        num = sum(1 for guess, actual in zip(predicted, true) if guess == actual)
        print(f"Accuracy: {num/total*100:.2f}%")
    elif mode == "classification_report":
        print(classification_report(true, predicted))
    elif mode == "confusion_matrix":
        # confusionmatrix expects (predicted_y, real_y) in that order.
        confusionmatrix(predicted, true)
    elif mode == "fbeta":
        print("average=macro:", fbeta_score(true, predicted, average='macro', beta=0.5))
        print("average=weighted:", fbeta_score(true, predicted, average='weighted', beta=0.5))
        print("average=None:", fbeta_score(true, predicted, average=None, beta=0.5))

# 4.1 KNN
class KNN:
    """
    Minimal k-nearest-neighbours classifier: Euclidean distance to every
    training row, then a majority vote among the k closest rows.
    """

    def __init__(self, k=5):
        """
        initializes an instance of the KNN class
        :param k: int: number of neighbors, must be at least 1
        :raises ValueError: if k is smaller than 1
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """
        fits the KNN class by storing the training data (no model is computed)
        :param X: pandas.DataFrame: the train data
        :param y: pandas.Series: the classes of the train data
        :return: a fitted instance of the KNN class
        :raises ValueError: if X is empty or X and y differ in length
        """
        if len(X) != len(y):
            raise ValueError(f"X has {len(X)} rows but y has {len(y)} labels")
        if len(X) == 0:
            raise ValueError("cannot fit KNN on an empty training set")
        self.X_train = X
        self.y_train = y
        return self

    def euclidean_distance(self, a, b):
        """
        This function calculates the euclidean distance between two datapoints.
        :param a: list: the first datapoint
        :param b: list: the second datapoint
        :return: float: the euclidean distance between the datapoints
        :raises ValueError: if the datapoints do not have the same length
        """
        if len(a) != len(b):
            raise ValueError(f"datapoints differ in length: {len(a)} vs {len(b)}")
        squared_sum = 0
        # Plain left-to-right accumulation keeps the floating-point result
        # independent of how the built-in sum() treats floats.
        for left, right in zip(a, b):
            squared_sum += (left - right) ** 2
        return squared_sum ** 0.5

    def predict(self, X_test):
        """
        predicts the class of the test data
        :param X_test: pandas.DataFrame: the test data
        :return: list: the predicted classes, one per row of X_test
        :raises RuntimeError: if called before fit
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNN.predict was called before fit")

        # Convert each frame to plain row lists once, instead of rebuilding
        # them inside the nested loop for every (test row, train row) pair.
        train_rows = self.X_train.to_numpy().tolist()
        test_y = []
        for test_values in X_test.to_numpy().tolist():
            distances = [self.euclidean_distance(test_values, train_values)
                         for train_values in train_rows]
            # Stable sort: equally distant rows keep their training order.
            nearest_neighbours = sorted(range(len(distances)), key=distances.__getitem__)[:self.k]
            neighbor_labels = self.y_train.iloc[nearest_neighbours]
            # Majority vote; ties go to the smallest label. For 0/1 labels this
            # equals rounding the mean, but it also works for other label sets,
            # where the rounded mean can be a value that is not a class at all.
            test_y.append(neighbor_labels.mode().min())
        return test_y

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPClassifier


def _fit_and_report(heading, model):
    """
    Fits a model on the small-test-size training split, prints the heading and
    then the model's accuracy on the matching test split.
    :param heading: string: line printed before the accuracy
    :param model: classifier exposing fit(X, y) and predict(X)
    :return: the same model instance, now fitted
    """
    model.fit(X_train_s, Y_train_s)
    print(heading)
    accuracy_checker(X_test_s, Y_test_s, model)
    return model


# 4.1 KNN
# Judging by accuracy for 1<=k<=9: k=5 is best on the small set, while
# k=2, k=4 and k=9 are all equally best on the large set.
knn_s = _fit_and_report("For class knn:", KNN(5))

# 4.2 nbc
nbc = _fit_and_report("For nbc:", GaussianNB())

# 4.3 svc
svc = _fit_and_report("For svc:", make_pipeline(StandardScaler(), SVC(gamma='auto')))

# 4.4 mlp
mlp = _fit_and_report("For mlp:", MLPClassifier(random_state=1, max_iter=250))

In [None]:
# Models for the small-test-size split (nbc, svc and mlp were fitted on
# X_train_s in the previous cell).
knn_s = KNN(5)
knn_s.fit(X_train_s, Y_train_s)

# Models for the large-test-size split are trained on X_train_l only.
# Scoring the X_train_s models on X_test_l would be misleading: X_train_s holds
# about 67% of the rows and X_test_l about 66%, so the two necessarily overlap
# and the models would be tested on rows they were trained on.
knn_l = KNN(5)
knn_l.fit(X_train_l, Y_train_l)
nbc_l = GaussianNB().fit(X_train_l, Y_train_l)
svc_l = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(X_train_l, Y_train_l)
mlp_l = MLPClassifier(random_state=1, max_iter=250).fit(X_train_l, Y_train_l)

# Parallel lists: the name, the small-split model and the large-split model
# for each classifier, in the order they are reported.
pred_strings = ["nbc", "svc", "mlp", "knn"]
preds = [nbc, svc, mlp, knn_s]
preds_l = [nbc_l, svc_l, mlp_l, knn_l]
modes = ["confusion_matrix", "classification_report", "fbeta"]
for mode in modes:
    for pred_string, pred_s, pred_l in zip(pred_strings, preds, preds_l):
        print(f"{mode} for {pred_string} with small test data size:")
        accuracy_checker(X_test_s, Y_test_s, pred_s, mode=mode)
        print(f"\n{mode} for {pred_string} with large test data size:")
        accuracy_checker(X_test_l, Y_test_l, pred_l, mode=mode)
        print("-----------------------------------------")