In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Directory that holds the train/test CSV files. Both reads below derive their
# paths from this single constant, so pointing the notebook at another
# location only requires editing this line.
DATA_DIR = '/content/drive/MyDrive/colab work ML/Data'

df_train = pd.read_csv(f'{DATA_DIR}/binary_classification_train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/binary_classification_test.csv')

In [None]:
# Columns 1..20 are used as the model inputs and column 21 as the target;
# column 0 is left out.
X_train_pre, Y_train_pre = df_train.iloc[:, 1:21], df_train.iloc[:, 21]
(X_train_pre.shape, Y_train_pre.shape)

((48000, 20), (48000,))

In [None]:
# Every column of the test frame except the first, as a plain NumPy array.
X_test = df_test.iloc[:, 1:].to_numpy()
X_test.shape

(12000, 20)

In [None]:
def f1_score(y_true, y_pred):
    """Binary F1 score with label 1 as the positive class and 0 as the negative.

    Args:
        y_true: Ground-truth labels (0/1). Any array-like; flattened to 1-D.
        y_pred: Predicted labels (0/1). Any array-like; flattened to 1-D.

    Returns:
        The harmonic mean of precision and recall, or 0 when precision and
        recall are both 0 (including the case of no positive predictions
        and no positive labels).

    Raises:
        ValueError: If the two inputs do not contain the same number of labels.
    """
    # Flatten both inputs: a (n, 1) prediction compared with (n,) labels would
    # otherwise broadcast to an (n, n) grid and produce meaningless counts.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of labels, "
            f"got {y_true.size} and {y_pred.size}"
        )

    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0

    f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0
    return f1

def f1_score_multiclass(y_true, y_pred, average='macro'):
    """One-vs-rest F1 score averaged over the classes present in ``y_true``.

    Args:
        y_true: Array of ground-truth class labels.
        y_pred: Array of predicted class labels.
        average: ``'macro'`` for the unweighted mean of the per-class scores,
            or ``'weighted'`` to weight each class by its count in ``y_true``.

    Returns:
        The averaged F1 score.

    Raises:
        ValueError: If ``average`` is neither ``'macro'`` nor ``'weighted'``.
    """
    # Reject unsupported modes up front instead of silently returning None.
    if average not in ('macro', 'weighted'):
        raise ValueError(
            f"average must be 'macro' or 'weighted', got {average!r}"
        )

    unique_classes = np.unique(y_true)
    f1_scores = []

    for cls in unique_classes:
        # Recast the problem as "cls vs. everything else" for the binary scorer.
        y_true_binary = (y_true == cls).astype(int)
        y_pred_binary = (y_pred == cls).astype(int)
        f1_scores.append(f1_score(y_true_binary, y_pred_binary))

    if average == 'macro':
        return np.mean(f1_scores)

    weights = [np.sum(y_true == cls) for cls in unique_classes]
    return np.average(f1_scores, weights=weights)

In [None]:
# Pairwise scatter plots of the input columns, coloured by the target.
# Each (i, j) pair is drawn on its own figure: showing only once per outer
# iteration would stack every j < i scatter on the same axes and leave just
# the last pair's axis labels visible.
n_features = X_train_pre.shape[1]
for i in range(n_features):
  for j in range(i):
    fig, ax = plt.subplots()
    ax.scatter(X_train_pre.iloc[:,i], X_train_pre.iloc[:,j], c = Y_train_pre, cmap = 'Accent')
    ax.set_xlabel(f'Feature{i}')
    ax.set_ylabel(f'Feature{j}')
    fig.tight_layout()
    plt.show()
    # Release the figure so the many plots do not accumulate in memory.
    plt.close(fig)

Output hidden; open in https://colab.research.google.com to view.

Gaussian Naive Bayes. A discrete (categorical) Naive Bayes model is not considered because the feature values are continuous rather than discrete categories.

In [None]:

def calculate_gaussian_probability(x, mean, std, epsilon=1e-6):
    """Evaluate the normal density at ``x`` for the given ``mean`` and ``std``.

    ``epsilon`` is added to ``std`` before it is used, which keeps the
    expression finite when ``std`` is zero.
    """
    sigma = std + epsilon
    scale = 1 / (np.sqrt(2 * np.pi) * sigma)
    return scale * np.exp(-((x - mean) ** 2 / (2 * sigma ** 2)))

def probability(x, y):
    """Collect per-class, per-feature (mean, std) pairs.

    Args:
        x: 2-D array indexed as ``x[row, feature]``.
        y: 1-D array of class labels, one per row of ``x``.

    Returns:
        Dict mapping each distinct label in ``y`` to a list with one
        ``(mean, std)`` tuple per feature column, computed over the rows
        carrying that label.
    """
    feature_count = x.shape[1]
    stats_by_class = {}
    for label in np.unique(y):
        rows = y == label
        stats_by_class[label] = [
            (np.mean(x[rows, col]), np.std(x[rows, col]))
            for col in range(feature_count)
        ]
    return stats_by_class

def class_probabilities(x, y):
    """Return the class priors together with the per-class feature statistics.

    Returns:
        A pair ``(p_classes, class_stats)``: ``p_classes`` maps each distinct
        label in ``y`` to its relative frequency, and ``class_stats`` is the
        result of ``probability(x, y)``.
    """
    total = len(y)
    priors = {label: np.sum(y == label) / total for label in np.unique(y)}
    return priors, probability(x, y)

def predict(x_test, p_classes, class_stats):
    """Predict the most likely class for every row with Gaussian naive Bayes.

    Scores are accumulated as log-probabilities. Multiplying many small
    densities underflows to exactly 0 for every class on rows far from the
    class means, which would make the first class win by default.

    Args:
        x_test: 2-D array-like indexed as ``x_test[row, feature]``.
        p_classes: Dict mapping class label to prior probability.
        class_stats: Dict mapping class label to a list of ``(mean, std)``
            pairs, one per feature.

    Returns:
        1-D integer array with one predicted label per row. Ties go to the
        class that appears first in ``p_classes``.
    """
    classes = list(p_classes.keys())
    x = np.asarray(x_test, dtype=float)
    if x.shape[0] == 0:
        return np.zeros(0, dtype=int)

    # Same smoothing term that calculate_gaussian_probability adds to std by default.
    epsilon = 1e-6
    log_scores = np.empty((x.shape[0], len(classes)))

    # A zero prior legitimately yields log(0) = -inf; silence that warning.
    with np.errstate(divide='ignore'):
        for col, c in enumerate(classes):
            stats = np.asarray(class_stats[c], dtype=float)
            mean = stats[:, 0]
            sigma = stats[:, 1] + epsilon
            # log N(x | mean, sigma) for every (row, feature) at once.
            log_density = (
                -0.5 * np.log(2 * np.pi)
                - np.log(sigma)
                - (x - mean) ** 2 / (2 * sigma ** 2)
            )
            log_scores[:, col] = np.log(p_classes[c]) + log_density.sum(axis=1)

    best = np.argmax(log_scores, axis=1)
    return np.asarray(classes)[best].astype(int)


In [None]:
# Reproducible training subsample of up to 10,000 rows for naive Bayes.
# Drawing without replacement avoids duplicated rows, and the row count is
# read from the data instead of being hard-coded.
rng = np.random.default_rng(42)
indices = rng.choice(len(X_train_pre), size=min(10000, len(X_train_pre)), replace=False)
X_train = X_train_pre.iloc[indices,:].values
Y_train = Y_train_pre.iloc[indices].values

In [None]:
# Fit naive Bayes on the current subsample: class priors plus per-class,
# per-feature (mean, std) statistics.
p_classes, class_stats = class_probabilities(X_train, Y_train)


In [None]:
# Sanity check: predicted labels for the rows the statistics were fitted on.
predict(X_train, p_classes, class_stats)

array([0, 0, 1, ..., 0, 0, 0])

In [None]:
# NOTE: this is a training-set F1 score -- X_train/Y_train are the same rows
# that class_probabilities was fitted on, so it is not a held-out estimate.
f1_score(Y_train, predict(X_train, p_classes, class_stats))

0.8630066700872242

KNN

In [None]:
def dist(input, arr):
    """Euclidean distance from the point ``input`` to every row of ``arr``.

    Returns a 1-D array with one distance per row of ``arr``.
    """
    offsets = arr - input
    squared_lengths = np.square(offsets).sum(axis=1)
    return np.sqrt(squared_lengths)

def knn_classifier(X_train, y_train, X_test, k=3):
    """Classify each row of ``X_test`` by majority vote of its nearest neighbours.

    Args:
        X_train: 2-D array-like of reference points.
        y_train: Labels of the reference points, one per row of ``X_train``.
        X_test: 2-D array-like of query points.
        k: Number of neighbours to consult. Capped at the number of
            reference points.

    Returns:
        A pair ``(labels, probabilities)`` of arrays with one entry per query
        row: the winning label and the fraction of consulted neighbours that
        voted for it.

    Raises:
        ValueError: If ``k`` is smaller than 1, the reference set is empty,
            or ``X_train`` and ``y_train`` differ in length.
    """
    # Work on plain arrays so positional indexing below is well-defined even
    # when pandas objects (with arbitrary index labels) are passed in.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    n_train = len(X_train)

    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    if n_train == 0:
        raise ValueError("X_train must contain at least one sample")
    if len(y_train) != n_train:
        raise ValueError(
            f"X_train and y_train must have the same length, got {n_train} and {len(y_train)}"
        )

    # With fewer than k reference points the vote share must be taken over the
    # neighbours that actually exist, not over the requested k.
    k = min(k, n_train)

    predicted_labels = []
    probabilities = []
    for test_sample in np.asarray(X_test):
        distances = dist(test_sample, X_train)
        k_indices = np.argsort(distances)[:k]
        k_nearest_labels = y_train[k_indices]
        unique_labels, counts = np.unique(k_nearest_labels, return_counts=True)
        winner = np.argmax(counts)
        predicted_labels.append(unique_labels[winner])
        probabilities.append(counts[winner] / k)
    return np.array(predicted_labels), np.array(probabilities)

In [None]:
# Reproducible training subsample of up to 1,000 rows for KNN.
# Drawing without replacement avoids duplicated rows, and the row count is
# read from the data instead of being hard-coded.
rng = np.random.default_rng(42)
indices = rng.choice(len(X_train_pre), size=min(1000, len(X_train_pre)), replace=False)
X_train = X_train_pre.iloc[indices,:].values
Y_train = Y_train_pre.iloc[indices].values

In [None]:
# NOTE: X_train is used as both the reference set and the query set, so every
# query row is also one of its own neighbours (distance 0). The F1 below is
# therefore a training-set score, not a held-out estimate.
predicted_labels, _ = knn_classifier(X_train, Y_train, X_train, 10)
f1_score(Y_train,predicted_labels)

0.9023255813953489

SVM (linear)

In [None]:

class SVM:
    """Linear soft-margin SVM trained with full-batch subgradient descent.

    The objective minimised by ``fit`` (and reported by ``hinge_loss``) is

        lambda_param / 2 * ||w||^2 + mean(max(0, 1 - y * (x.w + b)))

    with labels internally encoded as -1/+1. Any two distinct label values
    are accepted (for example 0/1 or -1/+1); ``predict`` answers in the same
    label values that were passed to ``fit``.
    """

    def __init__(self, learning_rate=0.001, lambda_param=1, num_iterations=1000, print_gap=100):
        self.learning_rate = learning_rate
        self.lambda_param = lambda_param
        self.num_iterations = num_iterations
        self.w = None
        self.b = None
        self.print_gap = print_gap
        # Sorted pair of label values seen by fit; None until fit is called.
        self.classes_ = None

    def _signed_labels(self, y):
        """Map labels to -1/+1 (the larger fitted class becomes +1)."""
        y = np.asarray(y).ravel()
        if self.classes_ is None:
            # Not fitted yet: assume the caller already supplies -1/+1 labels.
            return y.astype(float)
        return np.where(y == self.classes_[1], 1.0, -1.0)

    def hinge_loss(self, x, y):
        """Return the regularised mean hinge loss for the current ``w``/``b``.

        This is the same objective whose subgradient ``fit`` follows.
        """
        x = np.asarray(x, dtype=float)
        y = self._signed_labels(y)
        margins = np.maximum(0, 1 - y * (np.dot(x, self.w) + self.b))
        return 0.5 * self.lambda_param * np.dot(self.w, self.w) + np.mean(margins)

    def fit(self, x, y):
        """Learn ``w`` and ``b`` from samples ``x`` (n_samples, n_features) and labels ``y``.

        Raises:
            ValueError: If ``y`` does not contain exactly two distinct labels.
        """
        x = np.asarray(x, dtype=float)
        y_raw = np.asarray(y).ravel()
        classes = np.unique(y_raw)
        if len(classes) != 2:
            raise ValueError(
                f"SVM.fit needs exactly two distinct labels, got {len(classes)}"
            )
        self.classes_ = classes
        # The hinge subgradient multiplies by the label, so a 0 label would
        # contribute nothing; always train on the -1/+1 encoding.
        y = self._signed_labels(y_raw)

        n_samples, n_features = x.shape
        self.w = np.zeros(n_features)
        self.b = 0.0

        for i in range(self.num_iterations):
            # Samples inside the margin or misclassified drive the update.
            margin_violations = y * (np.dot(x, self.w) + self.b) < 1
            active = margin_violations * y
            dw = -np.dot(x.T, active) / n_samples + self.lambda_param * self.w
            db = -np.sum(active) / n_samples
            self.w -= self.learning_rate * dw
            self.b -= self.learning_rate * db
            # A falsy print_gap (0 / None) disables progress output.
            if self.print_gap and i % self.print_gap == 0:
                current_loss = self.hinge_loss(x, y_raw)
                print(f"Iteration {i}: Loss = {current_loss}")

    def predict(self, x):
        """Return the predicted label for each row of ``x``.

        Labels are the values seen by ``fit``; a decision value of exactly 0
        is assigned to the positive (larger) class.

        Raises:
            RuntimeError: If called before ``w`` has been set by ``fit``.
        """
        if self.w is None:
            raise RuntimeError("SVM.predict called before fit")
        scores = np.dot(np.asarray(x, dtype=float), self.w) + self.b
        classes = self.classes_ if self.classes_ is not None else np.array([-1, 1])
        return classes[(scores >= 0).astype(int)]


In [None]:
# The hinge-loss update multiplies each sample by its label, so the targets
# are mapped to -1/+1 here (a label of 0 would contribute no gradient at all).
# Plain NumPy arrays are passed so no pandas index alignment is involved.
svm = SVM(learning_rate=0.1, lambda_param=0.001, num_iterations=80000, print_gap=10000)
svm.fit(X_train_pre.to_numpy(), np.where(Y_train_pre.to_numpy() == 1, 1, -1))

Iteration 0: Loss = 107.36939920488159
Iteration 10000: Loss = 42.196500664384416
Iteration 20000: Loss = 33.998920912528725
Iteration 30000: Loss = 32.8896089992287
Iteration 40000: Loss = 32.73949664148206
Iteration 50000: Loss = 32.71918188037066
Iteration 60000: Loss = 32.71643221622751
Iteration 70000: Loss = 32.71606157081276


In [None]:
# f1_score counts labels equal to 0 and 1, while a sign-based SVM prediction
# may come back as -1/+1. Collapse the prediction to 0/1 first; without this
# no false negative can ever be counted.
# NOTE: this is a training-set score (same rows the SVM was fitted on).
svm_train_pred = (np.asarray(svm.predict(X_train_pre)) > 0).astype(int)
f1_score(Y_train_pre.to_numpy(), svm_train_pred)

0.4830288856583022

Logistic regression

In [None]:
import numpy as np

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    Computed as ``exp(-log(1 + exp(-z)))`` via ``np.logaddexp`` so that large
    negative ``z`` yields 0 instead of an overflow warning from ``exp(-z)``.
    Accepts scalars and arrays; the result has the same shape as ``z``.
    """
    return np.exp(-np.logaddexp(0, -z))

class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Weights are stored as an ``(n_features, 1)`` column and ``predict``
    returns an ``(n_samples, 1)`` column of 0/1 integers.
    """

    def __init__(self, learning_rate=0.01, iterations=1000, verbose=True, print_rate=50):
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.verbose = verbose
        self.print_rate = print_rate
        self.weights = None
        self.bias = 0

    def initialize_weights(self, n_features):
        """Reset the parameters to zeros for ``n_features`` inputs."""
        self.weights = np.zeros((n_features, 1))
        self.bias = 0

    def compute_gradients(self, X, y, y_pred):
        """Return the cross-entropy gradients ``(dw, db)`` averaged over the samples."""
        m = y.shape[0]
        dw = (1 / m) * np.dot(X.T, (y_pred - y))
        db = (1 / m) * np.sum(y_pred - y)
        return dw, db

    def fit(self, X, y):
        """Fit the model to ``X`` (n_samples, n_features) and 0/1 targets ``y``.

        ``y`` may be 1-D or an ``(n_samples, 1)`` column.

        Raises:
            ValueError: If ``y`` does not hold one target per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        # Force a column vector: a 1-D y would broadcast against the (m, 1)
        # predictions into an (m, m) matrix and corrupt the gradients.
        y = np.asarray(y).reshape(-1, 1)
        m, n_features = X.shape
        if y.shape[0] != m:
            raise ValueError(
                f"X and y must have the same number of samples, got {m} and {y.shape[0]}"
            )

        self.initialize_weights(n_features)
        epsilon = 1e-15
        for i in range(self.iterations):
            z = np.dot(X, self.weights) + self.bias
            # Keep predictions strictly inside (0, 1) so the logs stay finite.
            y_pred = np.clip(sigmoid(z), epsilon, 1 - epsilon)
            dw, db = self.compute_gradients(X, y, y_pred)
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
            # The cost is only needed for reporting, so compute it on demand.
            # A falsy print_rate (0 / None) disables progress output.
            if self.verbose and self.print_rate and i % self.print_rate == 0:
                cost = -(1 / m) * np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))
                print(f"Iteration {i}: Cost = {cost:.6f}")

    def predict(self, X):
        """Return 0/1 predictions of shape ``(n_samples, 1)`` using a 0.5 threshold.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.weights is None:
            raise RuntimeError("LogisticRegression.predict called before fit")
        z = np.dot(X, self.weights) + self.bias
        y_pred = sigmoid(z)
        return (y_pred > 0.5).astype(int)

def normalize_features(X):
    """Standardise each column of ``X`` to zero mean and unit variance.

    Columns with zero standard deviation (constant features) are only
    centred, so they become all zeros instead of NaN from a 0/0 division.
    """
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    safe_std = np.where(std == 0, 1.0, std)
    return (X - mean) / safe_std


In [None]:
# Reproducible training subsample of up to 5,000 rows for logistic regression.
# Drawing without replacement avoids duplicated rows, and the row count is
# read from the data instead of being hard-coded.
rng = np.random.default_rng(42)
indices = rng.choice(len(X_train_pre), size=min(5000, len(X_train_pre)), replace=False)
X_train = X_train_pre.iloc[indices,:].values
Y_train = Y_train_pre.iloc[indices].values
Y_train = Y_train.reshape(-1, 1)
# Standardise the features and keep the statistics: any data passed to
# LGR.predict later has to be scaled with these same values.
train_mean = np.mean(X_train, axis=0)
train_std = np.std(X_train, axis=0)
X_train = (X_train - train_mean) / train_std
# NOTE(review): with this learning rate the recorded cost only falls from
# 0.693 to 0.666 over 10,000 iterations, i.e. the model is far from
# converged -- consider a larger learning rate.
LGR = LogisticRegression(learning_rate=0.00001, iterations=10000, verbose=True, print_rate=1000)
LGR.fit(X_train,Y_train)

Iteration 0: Cost = 0.693147
Iteration 1000: Cost = 0.689948
Iteration 2000: Cost = 0.686785
Iteration 3000: Cost = 0.683660
Iteration 4000: Cost = 0.680571
Iteration 5000: Cost = 0.677518
Iteration 6000: Cost = 0.674500
Iteration 7000: Cost = 0.671517
Iteration 8000: Cost = 0.668568
Iteration 9000: Cost = 0.665654


In [None]:
# The model was fitted on standardised features, so the data scored here must
# be scaled with the statistics of the fitting subsample. `indices` still
# refers to that subsample when the notebook is run top to bottom.
lgr_fit_rows = X_train_pre.iloc[indices, :].to_numpy()
lgr_mean = lgr_fit_rows.mean(axis=0)
lgr_std = lgr_fit_rows.std(axis=0)
X_all_scaled = (X_train_pre.to_numpy() - lgr_mean) / lgr_std
# LGR.predict returns an (n, 1) column; flatten it so it lines up
# element-wise with the 1-D label vector.
f1_score(Y_train_pre.to_numpy(), LGR.predict(X_all_scaled).ravel())

0.38647037665800693

Decision Trees

In [None]:

class DecisionTreeClassifier:
    """Binary decision tree grown greedily on weighted Gini impurity.

    A fitted tree is stored in ``self.tree`` as nested dicts with the keys
    ``'feature_index'``, ``'threshold'``, ``'left'`` and ``'right'``; leaves
    are bare class labels. Rows whose feature value is strictly below the
    threshold go left, all others go right.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def gini_index(self, y):
        """Gini impurity of the label array ``y``."""
        _, freq = np.unique(y, return_counts=True)
        share = freq / len(y)
        return 1 - np.sum(share ** 2)

    def gini_index_split(self, x, y, feature_index, threshold):
        """Size-weighted Gini impurity of splitting on ``feature < threshold``.

        Returns infinity when either side of the split would be empty.
        """
        goes_left = x[:, feature_index] < threshold
        y_left = y[goes_left]
        y_right = y[~goes_left]

        if len(y_left) == 0 or len(y_right) == 0:
            return float('inf')

        impurity_sum = (
            len(y_left) * self.gini_index(y_left)
            + len(y_right) * self.gini_index(y_right)
        )
        return impurity_sum / len(y)

    def best_split(self, x, y):
        """Search every feature and every observed value for the best split.

        Returns ``(feature_index, threshold)``, or ``(None, None)`` when no
        candidate separates the rows into two non-empty groups. The first
        candidate reaching the lowest impurity wins.
        """
        winner = (float('inf'), None, None)

        for col in range(x.shape[1]):
            for cut in np.unique(x[:, col]):
                score = self.gini_index_split(x, y, col, cut)
                if score < winner[0]:
                    winner = (score, col, float(cut))

        return winner[1], winner[2]

    def build_tree(self, x, y, depth=0):
        """Recursively grow the tree for rows ``x`` with labels ``y``."""
        # Pure node: every row already carries the same label.
        if len(np.unique(y)) == 1:
            return y[0]

        too_small = len(y) < self.min_samples_split
        too_deep = self.max_depth is not None and depth >= self.max_depth
        if too_small or too_deep:
            return np.bincount(y).argmax()

        feature, threshold = self.best_split(x, y)

        # No usable split exists: fall back to the majority label.
        if feature is None:
            return np.bincount(y).argmax()

        goes_left = x[:, feature] < threshold
        stays_right = ~goes_left

        left_branch = self.build_tree(x[goes_left], y[goes_left], depth + 1)
        right_branch = self.build_tree(x[stays_right], y[stays_right], depth + 1)

        node = {}
        node['feature_index'] = int(feature)
        node['threshold'] = float(threshold)
        node['left'] = left_branch
        node['right'] = right_branch
        return node

    def fit(self, X, y):
        """Grow the tree from ``X`` and ``y`` and store it in ``self.tree``."""
        self.tree = self.build_tree(X, y)

    def predict_tree(self, x, tree):
        """Walk ``tree`` for the single sample ``x`` and return the leaf reached."""
        node = tree
        while isinstance(node, dict):
            value = float(x[node['feature_index']])
            branch = 'left' if value < float(node['threshold']) else 'right'
            node = node[branch]
        return node

    def predict(self, X):
        """Return an array with the predicted leaf value for each row of ``X``."""
        return np.array([self.predict_tree(row, self.tree) for row in X])


In [None]:
# Reproducible training subsample of up to 1,000 rows for the decision tree.
# Drawing without replacement avoids duplicated rows, and the row count is
# read from the data instead of being hard-coded.
rng = np.random.default_rng(42)
indices = rng.choice(len(X_train_pre), size=min(1000, len(X_train_pre)), replace=False)
X_train = X_train_pre.iloc[indices,:].values
Y_train = Y_train_pre.iloc[indices].values
dtc = DecisionTreeClassifier()
dtc.fit(X_train,Y_train)

In [None]:
# Inspect the fitted tree: nested dicts with 'feature_index', 'threshold',
# 'left' and 'right' keys, with bare class labels at the leaves.
tree = dtc.tree
tree

{'feature_index': 5,
 'threshold': 100.82768034776626,
 'left': {'feature_index': 17,
  'threshold': -10.905793960206992,
  'left': {'feature_index': 10,
   'threshold': 35.9524127282833,
   'left': {'feature_index': 11,
    'threshold': -5.996420467892605,
    'left': 1,
    'right': 0},
   'right': 0},
  'right': {'feature_index': 9,
   'threshold': 169.7695419363346,
   'left': {'feature_index': 5,
    'threshold': 37.25122158477816,
    'left': {'feature_index': 6,
     'threshold': -70.00102049160685,
     'left': 1,
     'right': {'feature_index': 17,
      'threshold': 261.6778987846478,
      'left': {'feature_index': 7,
       'threshold': -17.10404732226862,
       'left': {'feature_index': 1,
        'threshold': -206.5637991004482,
        'left': {'feature_index': 0,
         'threshold': -24.829580715648405,
         'left': 1,
         'right': 0},
        'right': {'feature_index': 19,
         'threshold': -70.73796221696509,
         'left': {'feature_index': 11,
    

In [None]:
# F1 of the tree over all rows of X_train_pre. The tree was fitted on a
# subsample drawn from these same rows, so this is not a fully held-out score.
f1_score(Y_train_pre.to_numpy(),(dtc.predict(X_train_pre.to_numpy())))

0.7823874755381606

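KNN works best: it achieved the highest F1 score of the models above (about 0.90, measured on its own training sample), so it is used for the final test-set predictions.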

In [None]:
# Final test-set predictions with KNN (k=10).
# NOTE(review): X_train / Y_train are whatever the most recently executed
# sampling cell assigned -- when run top to bottom that is the 1,000-row
# sample drawn in the decision-tree section, not the one from the KNN section.
final_prediction, _ = knn_classifier(X_train, Y_train, X_test, 10)