In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from tqdm import tqdm
import seaborn as sns
from tabulate import tabulate

In [2]:
# Seed NumPy's global RNG. Code later in this notebook draws from it directly
# (np.random.rand for weight initialisation, np.random.choice for resampling),
# so seeding here makes those draws repeatable on a fresh Restart & Run All.
# Note: the train_test_split calls below pass their own literal random_state values.
seed = 42
np.random.seed(seed)

## Logistic Regression, AdaBoost, and Metrics

In [3]:
class LogisticRegression:
    """Binary logistic-regression classifier trained with batch gradient descent.

    Parameters
    ----------
    learning_rate : float
        Step size applied to each gradient update.
    n_iterations : int
        Number of full-batch gradient steps performed by ``fit``.

    Attributes
    ----------
    weights : np.ndarray or None
        Per-feature coefficients; ``None`` until ``fit`` is called.
    bias : float or None
        Intercept term; ``None`` until ``fit`` is called.
    """

    def __init__(self, learning_rate=0.02, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        """Return the logistic function 1 / (1 + exp(-z)) element-wise.

        Evaluated through exp(-|z|) so that large-magnitude inputs never
        overflow: for z >= 0 this is 1 / (1 + e), for z < 0 it is e / (1 + e),
        with e = exp(-|z|) always in (0, 1].
        """
        z = np.asarray(z, dtype=float)
        e = np.exp(-np.abs(z))
        return np.where(z >= 0, 1 / (1 + e), e / (1 + e))

    def fit(self, X, y):
        """Fit weights and bias on ``X`` (n_samples, n_features) and 0/1 labels ``y``.

        Weights are initialised from ``np.random.rand`` (NumPy's global RNG),
        the bias from 0.
        """
        # Cast once up front: object-dtype input (e.g. a mixed bool/int/float
        # DataFrame converted with to_numpy) would otherwise make every dot
        # product slow and turn ``weights`` into an object array.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        n_samples, n_features = X.shape
        self.weights = np.random.rand(n_features)
        self.bias = 0

        for _ in range(self.n_iterations):
            y_hat = self.sigmoid(np.dot(X, self.weights) + self.bias)
            residual = y_hat - y

            # Gradient of the mean log-loss w.r.t. weights and bias.
            dw = (1 / n_samples) * np.dot(X.T, residual)
            db = (1 / n_samples) * np.sum(residual)

            self.weights = self.weights - self.learning_rate * dw
            self.bias = self.bias - self.learning_rate * db

    def predict(self, X):
        """Return predicted labels for ``X`` as floats (0.0 or 1.0).

        The predicted probability is rounded with ``np.round`` to obtain the label.
        """
        X = np.asarray(X, dtype=float)
        probabilities = self.sigmoid(np.dot(X, self.weights) + self.bias)
        return np.round(probabilities)

In [4]:
class AdaBoost:
    """AdaBoost ensemble of ``LogisticRegression`` weak learners for 0/1 labels.

    Each round draws a bootstrap sample according to the current example
    weights, trains a weak learner on it, and (if its weighted error is at most
    0.5) down-weights the examples it classified correctly.

    Parameters
    ----------
    n_hypotheses : int
        Number of boosting rounds attempted by ``fit``. Rounds whose learner
        has weighted error above 0.5 are discarded, so the final ensemble may
        hold fewer models.

    Attributes
    ----------
    models : list
        Accepted weak learners, in training order.
    model_weights : list
        Vote weight log((1 - error) / error) for the learner at the same index
        in ``models``.
    """

    def __init__(self, n_hypotheses=100):
        self.n_hypotheses = n_hypotheses
        self.models = []
        self.model_weights = []

    def resample(self, X, y, weights):
        """Draw ``len(X)`` rows with replacement, each row picked with probability ``weights[i]``.

        Returns the sampled ``X``, ``y`` and ``weights`` rows (same order).
        ``weights`` must be non-negative and sum to 1.
        """
        n_samples = X.shape[0]
        # p=weights is what makes later rounds concentrate on hard examples;
        # without it every round would see a plain uniform bootstrap.
        sample_indices = np.random.choice(n_samples, n_samples, replace=True, p=weights)
        return X[sample_indices], y[sample_indices], weights[sample_indices]

    def fit(self, X, y):
        """Train the ensemble on array ``X`` (n_samples, n_features) and 0/1 labels ``y``."""
        n_samples = X.shape[0]
        weights = np.ones(n_samples) / n_samples

        # Start from an empty ensemble so calling fit twice does not stack models.
        self.models = []
        self.model_weights = []

        for _round in tqdm(range(self.n_hypotheses)):
            X_sampled, y_sampled, _unused = self.resample(X, y, weights)
            weak_model = LogisticRegression()
            weak_model.fit(X_sampled, y_sampled)

            is_correct = weak_model.predict(X) == y
            error = np.sum(weights[~is_correct])

            # Keep the error strictly inside (0, 1) so neither the weight
            # update nor the log below divides by zero.
            error = min(max(error, 1e-10), 1 - 1e-10)

            # Reject learners worse than chance. The model is only stored once
            # accepted, so ``models`` and ``model_weights`` stay index-aligned.
            if error > 0.5:
                continue

            weights[is_correct] *= error / (1 - error)
            weights /= np.sum(weights)  # renormalise to a probability distribution

            self.models.append(weak_model)
            self.model_weights.append(np.log((1 - error) / error))

    def predict(self, X):
        """Return the weighted-majority label (0.0 or 1.0) for every row of array ``X``.

        Each learner's 0/1 prediction is mapped to -1/+1 before weighting, so a
        learner voting for class 0 counts against class 1. A row is labelled 1
        only when the weighted vote is strictly positive; ties and an empty
        ensemble yield 0.
        """
        votes = np.zeros(X.shape[0])
        for alpha, model in zip(self.model_weights, self.models):
            votes += alpha * (2 * model.predict(X) - 1)
        return (votes > 0).astype(float)


In [5]:
def calculate_accuracy(y_true, y_pred):
    """Return the percentage of entries where ``y_pred`` equals ``y_true``, rounded to 2 decimals."""
    n_matching = np.sum(y_true == y_pred)
    return ((n_matching / len(y_true)) * 100).round(2)

def calculate_true_positive_rate(y_true, y_pred):
    """Return sensitivity in percent: true positives over all samples labelled 1, rounded to 2 decimals."""
    is_positive = (y_true == 1)
    n_hits = np.sum(is_positive & (y_pred == 1))
    n_positive = np.sum(is_positive)
    return ((n_hits / n_positive) * 100).round(2)

def calculate_true_negative_rate(y_true, y_pred):
    """Return specificity in percent: true negatives over all samples labelled 0, rounded to 2 decimals."""
    is_negative = (y_true == 0)
    n_hits = np.sum(is_negative & (y_pred == 0))
    n_negative = np.sum(is_negative)
    return ((n_hits / n_negative) * 100).round(2)

def calculate_positive_predictive_value(y_true, y_pred):
    """Return precision in percent: true positives over all samples predicted 1, rounded to 2 decimals."""
    flagged = (y_pred == 1)
    n_hits = np.sum((y_true == 1) & flagged)
    n_flagged = np.sum(flagged)
    return ((n_hits / n_flagged) * 100).round(2)

def calculate_false_discovery_rate(y_true, y_pred):
    """Return the false discovery rate in percent: false positives over all samples predicted 1, rounded to 2 decimals."""
    flagged = (y_pred == 1)
    n_false_alarms = np.sum((y_true == 0) & flagged)
    n_flagged = np.sum(flagged)
    return ((n_false_alarms / n_flagged) * 100).round(2)

def calculate_f1_score(y_true, y_pred):
    """Return the F1 score of the positive class (label 1) in percent, rounded to 2 decimals.

    Computed as 2*TP / (2*TP + FP + FN), which is algebraically the harmonic
    mean of precision and recall but stays defined when there are no true
    positives (the score is then 0 instead of NaN from a 0/0 division).
    If there are no positives in either ``y_true`` or ``y_pred`` the score is
    undefined; 0.0 is returned in that case.
    """
    true_positives = np.sum((y_true == 1) & (y_pred == 1))
    false_positives = np.sum((y_true == 0) & (y_pred == 1))
    false_negatives = np.sum((y_true == 1) & (y_pred == 0))

    denominator = 2 * true_positives + false_positives + false_negatives
    if denominator == 0:
        return np.float64(0.0)

    return np.round((2 * true_positives / denominator) * 100, 2)

In [6]:
def print_performance_measures(y_true, y_pred):
    """Print a one-row grid table with six percentage metrics for ``y_pred`` against ``y_true``.

    Columns, in order: accuracy, sensitivity, specificity, precision,
    false discovery rate and F1 score.
    """
    metric_functions = (
        calculate_accuracy,
        calculate_true_positive_rate,
        calculate_true_negative_rate,
        calculate_positive_predictive_value,
        calculate_false_discovery_rate,
        calculate_f1_score,
    )

    # Single row of scores, shape (1, 6), evaluated in column order.
    scores = np.array([[metric(y_true, y_pred) for metric in metric_functions]], dtype=float)

    # Render each score as text with a trailing percent sign.
    score_cells = np.char.add(scores.astype(str), '%')

    headers = ['Accuracy', 'Sensitivity (TPR)', 'Specificity (TNR)', 'Precision', 'FDR', 'F1_score']
    print(tabulate(score_cells, headers=headers, tablefmt='grid'))

In [7]:
def RunLogisticRegression(df_train, df_train_y, df_test, df_test_y):
  """Fit a LogisticRegression on the training frame and print its metrics on the test frame."""
  train_X, train_y = df_train.to_numpy(), df_train_y.to_numpy()

  classifier = LogisticRegression()
  classifier.fit(train_X, train_y)
  test_predictions = classifier.predict(df_test.to_numpy())

  print('\n')
  print_performance_measures(df_test_y.to_numpy(), test_predictions)

In [8]:
def RunAdaBoostAlgo(df_train, df_train_y, df_test, df_test_y):
  """Train an AdaBoost ensemble for each K in 5, 10, 15, 20 and print its test-set accuracy."""
  train_X, train_y = df_train.to_numpy(), df_train_y.to_numpy()
  test_X, test_y = df_test.to_numpy(), df_test_y.to_numpy()

  for K in (5, 10, 15, 20):
    booster = AdaBoost(K)
    booster.fit(train_X, train_y)
    test_predictions = booster.predict(test_X)
    print(K, ": ", calculate_accuracy(test_y, test_predictions), "%")

## Telco Dataset

In [9]:
def load_and_split_churn_dataset(filename):
  """Read the CSV at ``filename`` and return a (train, test) pair of DataFrames.

  The split holds out 20% of the rows for testing and uses a fixed
  ``random_state`` of 42.
  """
  churn_frame = pd.read_csv(filename)
  train_part, test_part = train_test_split(churn_frame, test_size=0.2, random_state=42)
  return train_part, test_part

In [10]:
def churn_dataset_preprocessing(df):
  """Turn a raw Telco churn frame into (features, target) without modifying ``df``.

  Steps: drop 'customerID'; coerce 'TotalCharges' to numeric and mean-impute
  the resulting NaNs; standardise 'tenure', 'MonthlyCharges' and
  'TotalCharges'; label-encode two-valued non-numeric columns (this includes
  'Churn'); one-hot encode the remaining non-numeric columns.

  Returns
  -------
  (DataFrame, Series)
      The feature frame without 'Churn', and the encoded 'Churn' column.

  NOTE(review): every transformer is fitted on the frame passed in, so a test
  split is scaled/imputed with its own statistics, and the one-hot columns of
  two splits only match if both contain the same categories - confirm this is
  acceptable for the data being used.
  """
  # drop() returns a new frame, so the caller's DataFrame (typically a
  # train_test_split slice) is never mutated by the assignments below.
  df = df.drop(columns='customerID')

  # Change TotalCharges to numeric; values that fail to parse become NaN and are mean-filled
  df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
  numeric_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
  df['TotalCharges'] = numeric_imputer.fit_transform(df[['TotalCharges']])

  # Standardise numeric columns (the scaler is re-fitted for each column)
  standard_scaler = StandardScaler()
  for key in ['tenure', 'MonthlyCharges', 'TotalCharges']:
      df[key] = standard_scaler.fit_transform(df[[key]])

  # Columns that describe() does not summarise are treated as categorical.
  # describe() is computed once here rather than once per column.
  described_columns = set(df.describe().columns)
  categorical_features = [col for col in df.columns if col not in described_columns]

  binary_categorical_features = [col for col in categorical_features if df[col].nunique() == 2]
  non_binary_categorical_features = [col for col in categorical_features if col not in binary_categorical_features]

  # Label-encode two-valued categoricals
  label_encoder = LabelEncoder()
  for feature in binary_categorical_features:
    df[feature] = label_encoder.fit_transform(df[feature])

  # One-hot encode the remaining categoricals
  df = pd.get_dummies(df, columns=non_binary_categorical_features)

  # Separate the target column
  df_y = df['Churn']
  df = df.drop(columns='Churn')

  return df, df_y

In [16]:
# Load the Telco churn CSV, split it into train/test, then preprocess each split.
# NOTE(review): churn_dataset_preprocessing fits its imputer, scaler and encoders on
# whatever frame it receives, so the test split is transformed with statistics
# computed from the test split itself - confirm this is intended.
df_train, df_test = load_and_split_churn_dataset('./datasets/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df_train, df_train_y = churn_dataset_preprocessing(df_train)
df_test, df_test_y = churn_dataset_preprocessing(df_test)

In [None]:
# Evaluate on the Telco split: first a single logistic regression (prints a metrics
# table), then AdaBoost for each ensemble size tried by RunAdaBoostAlgo (prints accuracy).
RunLogisticRegression(df_train, df_train_y, df_test, df_test_y)
RunAdaBoostAlgo(df_train, df_train_y, df_test, df_test_y)

## Adult Dataset

In [22]:
def load_adult_dataset(train_path='./datasets/adult.data', test_path='./datasets/adult.test'):
  """Load the Adult train and test files as DataFrames with named columns.

  Parameters
  ----------
  train_path : str
      Header-less CSV with the training records.
  test_path : str
      Header-less CSV with the test records, preceded by one non-data line
      that is skipped.

  Returns
  -------
  (DataFrame, DataFrame)
      ``(df_train, df_test)``, both with the same 15 column names.
  """
  columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital_status', 'occupation', 'relationship', 'race', 'sex',
            'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

  df_train = pd.read_csv(train_path, header=None, names=columns)

  # header=None is required here as well: without it the first data record
  # after the skipped line is consumed as the header row and silently lost.
  df_test = pd.read_csv(test_path, header=None, names=columns, skiprows=[0])

  return df_train, df_test

In [23]:
def adult_dataset_common_preprocessing(df):
  """Turn a raw Adult frame into (features, target) without modifying ``df``.

  Steps: replace ' ?' in 'workclass', 'occupation' and 'native-country' with
  NaN and impute the most frequent value; standardise the six numeric columns;
  label-encode two-valued non-numeric columns (this must include 'income',
  which is returned as the target); one-hot encode the remaining non-numeric
  columns.

  Returns
  -------
  (DataFrame, Series)
      The feature frame without 'income', and the encoded 'income' column.

  NOTE(review): every transformer is fitted on the frame passed in, so train
  and test frames are processed with their own statistics and may end up with
  different one-hot columns.
  """
  # Work on a copy so the caller's frame is left untouched.
  df = df.copy()

  categorical_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

  for key in ['workclass', 'occupation', 'native-country']:
      # Assign the result back instead of calling replace(inplace=True) on
      # df[key]: the chained in-place form acts on a temporary and does not
      # update df when pandas Copy-on-Write is active.
      df[key] = df[key].replace(' ?', np.nan)
      df[[key]] = categorical_imputer.fit_transform(df[[key]])

  # Standardise numeric columns (the scaler is re-fitted for each column)
  std_scaler = StandardScaler()
  for key in ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']:
      df[key] = std_scaler.fit_transform(df[[key]])

  # Columns that describe() does not summarise are treated as categorical.
  # describe() is computed once here rather than once per column.
  described_columns = set(df.describe().columns)
  categorical_features = [col for col in df.columns if col not in described_columns]

  binary_categorical_features = [col for col in categorical_features if df[col].nunique() == 2]
  non_binary_categorical_features = [col for col in categorical_features if col not in binary_categorical_features]

  # Label-encode two-valued categoricals
  label_encoder = LabelEncoder()
  for feature in binary_categorical_features:
    df[feature] = label_encoder.fit_transform(df[feature])

  # One-hot encode the remaining categoricals
  df = pd.get_dummies(df, columns=non_binary_categorical_features)

  # Separate the target column
  df_y = df['income']
  df = df.drop(columns='income')

  return df, df_y


In [24]:
def adult_dataset_preprocessing(df_train, df_test):
  """Preprocess the Adult train and test frames and give them identical feature columns.

  Each frame goes through ``adult_dataset_common_preprocessing`` on its own.
  Because one-hot encoding is done per frame, a category present in only one
  of them yields a column the other lacks; only the columns present in both
  are kept, in the training frame's order.

  Returns
  -------
  (DataFrame, Series, DataFrame, Series)
      ``df_train, df_train_y, df_test, df_test_y``.
  """
  # First do the common preprocessing for both training and test data
  df_train, df_train_y = adult_dataset_common_preprocessing(df_train)
  df_test, df_test_y = adult_dataset_common_preprocessing(df_test)

  # Keep the shared columns instead of dropping one hard-coded dummy column:
  # this cannot raise KeyError when that column is absent, and it guarantees
  # both frames expose the same features in the same order.
  test_columns = set(df_test.columns)
  shared_columns = [col for col in df_train.columns if col in test_columns]
  df_train = df_train[shared_columns]
  df_test = df_test[shared_columns]

  return df_train, df_train_y, df_test, df_test_y


In [25]:
# Load the Adult train/test files, then preprocess them into feature frames and
# target series for both splits.
adult_df_train, adult_df_test = load_adult_dataset()
adult_df_train, adult_df_train_y, adult_df_test, adult_df_test_y = adult_dataset_preprocessing(adult_df_train, adult_df_test)


In [None]:
# Evaluate on the Adult split: first a single logistic regression (prints a metrics
# table), then AdaBoost for each ensemble size tried by RunAdaBoostAlgo (prints accuracy).
RunLogisticRegression(adult_df_train, adult_df_train_y, adult_df_test, adult_df_test_y)
RunAdaBoostAlgo(adult_df_train, adult_df_train_y, adult_df_test, adult_df_test_y)

## Credit Card Dataset

Before running the cells below, download the dataset from [Kaggle](https://www.kaggle.com/mlg-ulb/creditcardfraud) and save it as `creditcard.csv` in the *datasets* folder.

In [26]:
def cc_dataset_load_and_preprocessing():
  """Load './datasets/creditcard.csv' and return train/test features and targets.

  Steps: split 80/20 with ``random_state=32``; separate the 'Class' target;
  robust-scale 'Amount' and 'Time' using statistics fitted on the training
  split only; keep the 22 features with the highest mutual information with
  the training target.

  Returns
  -------
  (DataFrame, Series, DataFrame, Series)
      ``train_features, train_target, test_features, test_target``; both
      feature frames contain the same selected columns.
  """
  df = pd.read_csv('./datasets/creditcard.csv')
  cc_df_train, cc_df_test = train_test_split(df, test_size=0.2, random_state=32)

  # drop() returns new frames, so nothing below writes into a slice of ``df``.
  cc_df_train_y = cc_df_train['Class']
  cc_df_train = cc_df_train.drop(columns='Class')
  cc_df_test_y = cc_df_test['Class']
  cc_df_test = cc_df_test.drop(columns='Class')

  # Only 'Amount' and 'Time' are rescaled here; the other columns are used as loaded.
  # The scaler is fitted on the training split and merely applied to the test
  # split, so no test-set statistics leak into preprocessing.
  for key in ['Amount', 'Time']:
    robust_scaler = RobustScaler()
    cc_df_train[key] = robust_scaler.fit_transform(cc_df_train[[key]])
    cc_df_test[key] = robust_scaler.transform(cc_df_test[[key]])

  # Score features by mutual information with the training target and keep the top 22
  select_k_best = SelectKBest(score_func=mutual_info_classif, k=22)
  select_k_best.fit(cc_df_train, cc_df_train_y)

  # Apply the same column positions to both splits
  selected_features_indices = select_k_best.get_support(indices=True)
  cc_df_train_new = cc_df_train.iloc[:, selected_features_indices]
  cc_df_test_new = cc_df_test.iloc[:, selected_features_indices]

  return cc_df_train_new, cc_df_train_y, cc_df_test_new, cc_df_test_y


In [27]:
# Load, split, scale and feature-select the credit-card data.
# Note: this rebinds df_train / df_train_y / df_test / df_test_y, which were bound
# to the Telco splits earlier in the notebook.
df_train, df_train_y, df_test, df_test_y = cc_dataset_load_and_preprocessing()

In [None]:
# Evaluate on the credit-card split: first a single logistic regression (prints a metrics
# table), then AdaBoost for each ensemble size tried by RunAdaBoostAlgo (prints accuracy).
RunLogisticRegression(df_train, df_train_y, df_test, df_test_y)
RunAdaBoostAlgo(df_train, df_train_y, df_test, df_test_y)