In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
# Logistic Regression with Polynomial Kernel
class LogisticRegressionPolyKernel:
    def __init__(self, learning_rate=0.01, n_iterations=1000, degree=2):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.degree = degree

    def fit(self, X, y):
        self.X = self._map_features(X)
        self.y = y
        self.weights = np.zeros(self.X.shape[1])

        for _ in range(self.n_iterations):
            h = self._sigmoid(np.dot(self.X, self.weights))
            gradient = np.dot(self.X.T, (h - self.y)) / len(self.y)
            self.weights -= self.learning_rate * gradient

        return self

    def _map_features(self, X):
        X_poly = np.ones((len(X), 1))
        for i in range(1, self.degree + 1):
            for j in range(i + 1):
                new_feature = (X[:, 0] ** (i - j)) * (X[:, 1] ** j)
                X_poly = np.column_stack((X_poly, new_feature))

        return X_poly

    def _sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def predict(self, X):
        X = self._map_features(X)
        predictions = self._sigmoid(np.dot(X, self.weights))
        return np.where(predictions >= 0.5, 1, 0)


# Kernelized KNN
class KernelizedKNN:
    def __init__(self, k=3, kernel='linear'):
        self.k = k
        self.kernel = kernel

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = []
        for x in X:
            distances = []
            for i, x_train in enumerate(self.X_train):
                if self.kernel == 'linear':
                    distance = np.dot(x_train - x, x_train - x)
                elif self.kernel == 'polynomial':
                    distance = (np.dot(x_train, x) + 1) ** 2  # Polynomial Kernel
                distances.append((i, distance))
            distances.sort(key=lambda x: x[1])
            k_nearest = distances[:self.k]
            k_nearest_labels = [self.y_train[i] for i, _ in k_nearest]
            predictions.append(Counter(k_nearest_labels).most_common(1)[0][0])
        return np.array(predictions)


# Ensemble of Logistic Regression and KNN
class Ensemble:
    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        for model in self.models:
            model.fit(X, y)

    def predict(self, X):
        predictions = []
        for model in self.models:
            predictions.append(model.predict(X))
        predictions = np.array(predictions)
        final_predictions = np.apply_along_axis(lambda x: Counter(x).most_common(1)[0][0], axis=0, arr=predictions)
        return final_predictions


In [6]:
import random
import data_processing as df

def split_data(X, y, train_ratio=0.8, random_seed=42):
    random.seed(random_seed)

    data_size = len(X)
    index = list(range(data_size))
    random.shuffle(index)

    train_size = int(data_size*train_ratio)
    test_size = data_size - train_size

    X_train = X.iloc[index[:train_size]]
    y_train = y.iloc[index[:train_size]]
    X_test = X.iloc[index[train_size:]]
    y_test = y.iloc[index[train_size:]]

    return X_train, X_test, y_train, y_test

X = df.df.drop("is_claim", axis=1)
y = df.df.loc[:, "is_claim"]

X_train, X_test, y_train, y_test = split_data(X, y)
X_tra, X_val, y_tra, y_val = split_data(X_train, y_train)

In [7]:
# Ensemble of Logistic Regression with Polynomial Kernel and KNN models
log_reg_models = [LogisticRegressionPolyKernel(degree=degree) for degree in range(1, 4)]
knn_models = [KernelizedKNN(k=k, kernel=kernel) for k in [3, 5] for kernel in ['linear', 'polynomial']]

ensemble_models = log_reg_models + knn_models
ensemble = Ensemble(ensemble_models)

ensemble.fit(X_tra.values, y_tra.values)
predictions = ensemble.predict(X_val.values)

accuracy = np.mean(predictions == y_val.values)
print(f"Accuracy of validation set: {accuracy}")

Accuracy of validation set: 0.8146438746438747


In [8]:
# Ensemble of Logistic Regression with Polynomial Kernel and KNN models
log_reg_models = [LogisticRegressionPolyKernel(degree=degree) for degree in range(1, 4)]
knn_models = [KernelizedKNN(k=k, kernel=kernel) for k in [3, 5] for kernel in ['linear', 'polynomial']]

ensemble_models = log_reg_models + knn_models
ensemble = Ensemble(ensemble_models)

ensemble.fit(X_train.values, y_train.values)
predictions = ensemble.predict(X_test.values)

accuracy = np.mean(predictions == y_test.values)
print(f"Accuracy of test set: {accuracy}")

Accuracy of test set: 0.8142948308870454
