In [None]:
 import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the training and test CSV files from the mounted Google Drive folder
# into pandas DataFrames.
train_csv_path = '/content/drive/MyDrive/colab work ML/Data/multi_classification_train.csv'
test_csv_path = '/content/drive/MyDrive/colab work ML/Data/multi_classification_test.csv'
df_train, df_test = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

In [None]:
def f1_score(y_true, y_pred):
    """Binary F1 score for labels encoded as 0 (negative) and 1 (positive).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same shape as ``y_true``.

    Returns
    -------
    float
        Harmonic mean of precision and recall. Returns 0.0 whenever
        precision, recall or their sum is undefined (no positive predictions
        and/or no positive ground-truth labels).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    # Coerce to arrays: with plain Python lists `y_true == 1` is a single
    # boolean instead of an element-wise mask, which silently yields F1 = 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        # Without this check numpy may broadcast e.g. (n,) against (n, 1)
        # and count over an n-by-n grid.
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))

    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0

    if precision + recall > 0:
        return (2 * precision * recall) / (precision + recall)
    return 0.0

def f1_score_multiclass(y_true, y_pred, average='macro'):
    """Multi-class F1 score computed one-vs-rest and then averaged.

    For every class that occurs in ``y_true`` the labels are binarised
    (this class -> 1, everything else -> 0) and scored with ``f1_score``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y_true``.
    average : {'macro', 'weighted'}, default 'macro'
        'macro' is the unweighted mean of the per-class scores; 'weighted'
        weights each class by its number of occurrences in ``y_true``.

    Returns
    -------
    float
        The averaged F1 score, or 0.0 when ``y_true`` is empty.

    Raises
    ------
    ValueError
        If ``average`` is not one of the supported modes.
    """
    # Validate up front: the original fell off the end of the if/elif chain
    # and silently returned None for an unknown mode.
    if average not in ('macro', 'weighted'):
        raise ValueError(
            f"average must be 'macro' or 'weighted', got {average!r}"
        )

    # Coerce so that `y_true == cls` is an element-wise comparison even when
    # the caller passes plain lists.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    unique_classes, class_counts = np.unique(y_true, return_counts=True)
    if unique_classes.size == 0:
        # Nothing to score; avoids a NaN mean / zero-weight average.
        return 0.0

    f1_scores = [
        f1_score((y_true == cls).astype(int), (y_pred == cls).astype(int))
        for cls in unique_classes
    ]

    if average == 'macro':
        return np.mean(f1_scores)
    # average == 'weighted': weight each class by its support in y_true.
    return np.average(f1_scores, weights=class_counts)

In [None]:
# Distinct values of the 'Class' column in the training data and how many
# rows carry each one (displayed as the cell output).
np.unique(df_train['Class'].to_numpy(), return_counts=True)

(array([0, 1, 2, 3, 4]), array([ 4040, 11404, 16618, 10064,  5874]))

In [None]:
# Training features: positional columns 1..20 of df_train, standardised
# column-wise (subtract the mean, then divide by the standard deviation).
train_feature_values = df_train.iloc[:, 1:21].to_numpy()
train_centered = train_feature_values - np.mean(train_feature_values, axis=0)
X_train_pre = train_centered / np.std(train_centered, axis=0)

# Training targets: positional column 21 (presumably the 'Class' column;
# verify against the CSV header).
Y_train_pre = df_train.iloc[:, 21].to_numpy()

KNN

In [None]:

def dist(input, arr):
    """Euclidean distance from one point to every row of ``arr``.

    ``input`` is broadcast against the rows of ``arr``; the squared
    differences are summed along axis 1 and square-rooted, giving one
    distance per row.
    """
    offsets = arr - input
    squared_norms = np.sum(offsets ** 2, axis=1)
    return np.sqrt(squared_norms)

def knn_classifier(X_train, y_train, X_test, k=3):
    """Classify each row of ``X_test`` by majority vote of its k nearest
    training rows (Euclidean distance, computed with ``dist``).

    Parameters
    ----------
    X_train : array-like, shape (n_train, n_features)
        Reference feature rows.
    y_train : array-like, shape (n_train,)
        Label of each reference row.
    X_test : array-like, shape (n_test, n_features)
        Rows to classify.
    k : int, default 3
        Number of neighbours that vote. If ``k`` exceeds ``n_train`` all
        training rows vote.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The predicted label for every test row, and the fraction of the
        voting neighbours that carried that label.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, ``X_train`` is empty, or ``X_train`` and
        ``y_train`` have different lengths.
    """
    # Coerce to arrays so positional indexing and row iteration behave the
    # same for lists, DataFrames and Series (iterating a DataFrame yields
    # column names; indexing a Series with integers is label-based).
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test)

    n_train = X_train.shape[0]
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    if n_train == 0:
        raise ValueError("X_train must contain at least one sample")
    if y_train.shape[0] != n_train:
        raise ValueError(
            f"X_train has {n_train} rows but y_train has {y_train.shape[0]} labels"
        )

    # Only n_train neighbours exist; dividing the vote count by a larger k
    # would understate the reported vote fraction.
    n_neighbors = min(k, n_train)

    predicted_labels = []
    probabilities = []
    for test_sample in X_test:
        distances = dist(test_sample, X_train)
        k_indices = np.argsort(distances)[:n_neighbors]
        k_nearest_labels = y_train[k_indices]
        unique_labels, counts = np.unique(k_nearest_labels, return_counts=True)
        # np.argmax returns the first maximum, so ties go to the smallest
        # label (np.unique returns labels sorted).
        winner = np.argmax(counts)
        predicted_labels.append(unique_labels[winner])
        probabilities.append(counts[winner] / n_neighbors)
    return np.array(predicted_labels), np.array(probabilities)


In [None]:
# Draw a 1000-row reference sample for KNN.
# One shared index array is used for BOTH features and labels: drawing two
# independent sets of random indices pairs every feature row with the label
# of an unrelated row, which leaves the classifier with meaningless labels.
# The generator is seeded so that Restart & Run All reproduces the same
# sample, and rows are drawn without replacement so no row is duplicated.
knn_rng = np.random.default_rng(42)
n_train_rows = X_train_pre.shape[0]
knn_sample_idx = knn_rng.choice(n_train_rows, size=min(1000, n_train_rows), replace=False)
X_train = X_train_pre[knn_sample_idx, :]
Y_train = Y_train_pre[knn_sample_idx]


In [None]:
# Predict a label for every preprocessed training row, using the sampled
# X_train/Y_train rows as the reference set and 250 voting neighbours.
# The second return value (vote fractions) is discarded.
prediction, _ = knn_classifier(
    X_train=X_train,
    y_train=Y_train,
    X_test=X_train_pre,
    k=250,
)

In [None]:
# Support-weighted F1 of the KNN predictions against the training labels.
f1_score_multiclass(y_true=Y_train_pre, y_pred=prediction, average='weighted')

0.2025054087522541

In [None]:
def calculate_gaussian_probability(x, mean, std, epsilon=1e-6):
    """Gaussian probability density of ``x`` for the given mean and spread.

    ``epsilon`` is added to ``std`` before it is used, so a zero standard
    deviation does not cause a division by zero.
    """
    sigma = std + epsilon
    normaliser = 1 / (np.sqrt(2 * np.pi) * sigma)
    decay = np.exp(-((x - mean) ** 2 / (2 * sigma ** 2)))
    return normaliser * decay

def probability_NB(x, y):
    """Per-class, per-feature mean and standard deviation.

    Returns a dict mapping every distinct value of ``y`` to a list holding
    one ``(mean, std)`` tuple per column of ``x``, computed over the rows of
    ``x`` whose label equals that value.
    """
    feature_count = x.shape[1]
    stats_by_class = {}

    for label in np.unique(y):
        members = y == label  # boolean mask of the rows belonging to this class
        stats_by_class[label] = [
            (np.mean(x[members, col]), np.std(x[members, col]))
            for col in range(feature_count)
        ]
    return stats_by_class

def class_probabilities_NB(x, y):
    """Fit the naive Bayes parameters.

    Returns a pair ``(p_classes, class_stats)``: ``p_classes`` maps each
    distinct label in ``y`` to its relative frequency, and ``class_stats``
    is whatever ``probability_NB(x, y)`` returns.
    """
    feature_stats = probability_NB(x, y)
    sample_count = len(y)
    priors = {
        label: np.sum(y == label) / sample_count
        for label in np.unique(y)
    }
    return priors, feature_stats

def predict_NB(x_test, p_classes, class_stats, epsilon=1e-6):
    """Predict the most likely class of each row with Gaussian naive Bayes.

    Scores are accumulated in log space: for every class the score is
    ``log(prior) + sum_i log N(x_i | mean_i, std_i + epsilon)``. Multiplying
    the raw densities instead underflows to 0.0 for every class when there
    are many features or a sample lies far from the class means, after which
    the first class always wins.

    Parameters
    ----------
    x_test : array-like, shape (n_samples, n_features)
        Rows to classify.
    p_classes : dict
        Maps class label -> prior probability.
    class_stats : dict
        Maps class label -> sequence of ``(mean, std)`` pairs, one per feature.
    epsilon : float, default 1e-6
        Added to every standard deviation to avoid division by zero; the
        default equals the default of ``calculate_gaussian_probability``.

    Returns
    -------
    numpy.ndarray of int, shape (n_samples,)
        Predicted class label per row. On ties the class that comes first in
        ``p_classes`` wins.
    """
    classes = list(p_classes.keys())
    samples = np.asarray(x_test, dtype=float)
    if samples.shape[0] == 0:
        return np.zeros(0, dtype=int)

    log_norm_const = -0.5 * np.log(2 * np.pi)
    log_scores = np.empty((samples.shape[0], len(classes)))

    for col, c in enumerate(classes):
        stats = np.asarray(class_stats[c], dtype=float)  # rows of (mean, std)
        means = stats[:, 0]
        sigmas = stats[:, 1] + epsilon
        # Log of the Gaussian density for every (sample, feature) pair.
        log_density = (
            log_norm_const
            - np.log(sigmas)
            - (samples - means) ** 2 / (2 * sigmas ** 2)
        )
        log_scores[:, col] = np.log(p_classes[c]) + log_density.sum(axis=1)

    # argmax returns the first maximum, i.e. the earliest class on ties.
    best = np.argmax(log_scores, axis=1)
    return np.asarray(classes)[best].astype(int)


In [None]:
# Fit naive Bayes on the full preprocessed training set: class priors plus
# the per-class, per-feature (mean, std) statistics.
p_classes, class_stats = class_probabilities_NB(x=X_train_pre, y=Y_train_pre)

In [None]:
# Support-weighted F1 of naive Bayes on the data it was fitted on
# (a training score, not a held-out estimate).
nb_train_predictions = predict_NB(X_train_pre, p_classes, class_stats)
f1_score_multiclass(Y_train_pre, nb_train_predictions, average='weighted')

0.807611556507535

In [None]:
# Standardise the test features with the TRAINING mean and standard
# deviation. The naive Bayes statistics were estimated on training data
# scaled this way; scaling the test set with its own statistics would put it
# on a different scale from the one the model was fitted on.
# The statistics are recomputed from df_train because the preprocessing cell
# above does not keep them.
train_feature_values = df_train.iloc[:, 1:21].to_numpy()
train_feature_mean = np.mean(train_feature_values, axis=0)
train_feature_std = np.std(train_feature_values, axis=0)

X_test = df_test.iloc[:, 1:21].to_numpy()
X_test = (X_test - train_feature_mean) / train_feature_std
X_test.shape

(12000, 20)

In [None]:
# Naive Bayes class predictions for the test set (shown as the cell output).
predict_NB(x_test=X_test, p_classes=p_classes, class_stats=class_stats)

array([3, 1, 1, ..., 3, 0, 1])