In [None]:
!pip install tabulate



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from tqdm import tqdm
import seaborn as sns
from tabulate import tabulate

In [None]:
class LogisticRegression:
    """
    Binary logistic regression trained with full-batch gradient descent.

    Parameters:
    - learning_rate: float, step size applied to every gradient update
    - n_iterations: int, number of gradient updates performed by fit

    Attributes:
    - weights: 1D numpy array with one coefficient per feature (None before fit)
    - bias: float intercept (None before fit)
    """

    def __init__(self, learning_rate=0.05, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        """
        Element-wise logistic function 1 / (1 + exp(-z)).

        The input is clipped to [-500, 500] before exponentiation so that
        np.exp never overflows; inside that range the result is unchanged,
        outside it the output is saturated at (almost) 0 or 1.
        """
        z = np.clip(np.asarray(z, dtype=float), -500.0, 500.0)
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        """
        Learn weights and bias by minimising the log-loss with gradient descent.

        Parameters:
        - X: 2D array-like of shape (n_samples, n_features)
        - y: 1D array-like of 0/1 labels, one per row of X

        Raises:
        - ValueError: if X and y do not contain the same number of samples
        """
        # DataFrame.to_numpy() can return an object array for mixed dtypes;
        # force a float matrix so the NumPy ufuncs below work.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()

        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} labels"
            )

        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for _ in range(self.n_iterations):
            z = np.dot(X, self.weights) + self.bias
            residual = self.sigmoid(z) - y

            # Gradient of the mean log-loss w.r.t. weights and bias
            dw = (1 / n_samples) * np.dot(X.T, residual)
            db = (1 / n_samples) * np.sum(residual)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        """
        Predict class labels for X.

        Returns:
        - 1D float array of 0.0 / 1.0 labels: the predicted probability rounded
          to the nearest integer.
        """
        X = np.asarray(X, dtype=float)
        z = np.dot(X, self.weights) + self.bias
        return np.round(self.sigmoid(z))

In [None]:
class AdaBoost:
    """
    AdaBoost by weighted resampling with LogisticRegression weak learners.

    Labels are expected to be encoded as 0 / 1.

    Parameters:
    - n_hypotheses: int, number of boosting rounds (upper bound on the number
      of weak learners kept)
    - random_state: int or None, seed for the resampling. When None the global
      NumPy random state is used, so np.random.seed(...) still controls the run.

    Attributes:
    - models: list of accepted weak learners
    - model_weights: list of vote weights, model_weights[i] belongs to models[i]
    """

    def __init__(self, n_hypotheses=50, random_state=None):
        self.n_hypotheses = n_hypotheses
        self.random_state = random_state
        self.models = []
        self.model_weights = []
        # np.random (module) and RandomState both expose .choice
        self._rng = np.random if random_state is None else np.random.RandomState(random_state)

    def resample(self, X, y, weights):
        """
        Draw a bootstrap sample in which row i is picked with probability weights[i].

        Parameters:
        - X: 2D numpy array of features
        - y: 1D numpy array of labels
        - weights: 1D array of non-negative sample weights (need not sum to 1)

        Returns:
        - (X_sampled, y_sampled, weights_sampled), each with X.shape[0] rows
        """
        n_samples = X.shape[0]
        weights = np.asarray(weights, dtype=float)
        probabilities = weights / np.sum(weights)

        # Sampling proportionally to the boosting weights is what makes later
        # learners focus on the examples earlier learners got wrong.
        sample_indices = self._rng.choice(n_samples, n_samples, replace=True, p=probabilities)

        return X[sample_indices], y[sample_indices], weights[sample_indices]

    def fit(self, X, y):
        """
        Train up to n_hypotheses weak learners.

        Each round trains a learner on a weighted resample, measures its weighted
        error on the full training set, and down-weights the samples it classified
        correctly. Learners with weighted error above 0.5 are discarded.

        Parameters:
        - X: 2D array-like of shape (n_samples, n_features)
        - y: 1D array-like of 0/1 labels
        """
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        n_samples = X.shape[0]
        weights = np.ones(n_samples) / n_samples

        # Start from a clean slate so that calling fit twice does not stack ensembles
        self.models = []
        self.model_weights = []

        for _ in range(self.n_hypotheses):
            X_sampled, y_sampled, _ = self.resample(X, y, weights)
            weak_model = LogisticRegression()
            weak_model.fit(X_sampled, y_sampled)

            correct = weak_model.predict(X) == y
            error = np.sum(weights[~correct])

            # Keep the error strictly inside (0, 1) to avoid division by zero / log(0)
            error = min(max(error, 1e-10), 1 - 1e-10)

            # Worse than chance: drop the learner. It is deliberately NOT stored,
            # so models and model_weights always stay the same length.
            if error > 0.5:
                continue

            weights[correct] *= error / (1 - error)
            weights /= np.sum(weights)

            self.models.append(weak_model)
            self.model_weights.append(np.log((1 - error) / error))

    def predict(self, X):
        """
        Predict 0/1 labels by a weighted majority vote of the weak learners.

        Each learner's 0/1 prediction is mapped to -1/+1 before weighting, so a
        sample is labelled 1 only when the total weight voting for class 1
        exceeds the weight voting for class 0 (ties go to class 0).

        Returns:
        - 1D float array of 0.0 / 1.0 labels (all zeros if no learner was kept)
        """
        X = np.asarray(X)
        votes = np.zeros(X.shape[0])
        for alpha, model in zip(self.model_weights, self.models):
            signed_prediction = 2.0 * np.asarray(model.predict(X), dtype=float) - 1.0
            votes += alpha * signed_prediction
        return (votes > 0).astype(float)

In [None]:
def calculate_accuracy(y_true, y_pred):
    """
    Percentage of predictions that equal the true label.

    Parameters:
    - y_true: array-like of true labels
    - y_pred: array-like of predicted labels, same number of elements as y_true

    Returns:
    - numpy float in [0, 100]

    Raises:
    - ValueError: if the two inputs do not have the same number of elements
    """
    # Coerce to flat arrays: plain lists would otherwise compare as a single
    # boolean, and an (n, 1) column against an (n,) vector would broadcast.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true has {y_true.size} elements but y_pred has {y_pred.size}"
        )

    correct_predictions = np.sum(y_true == y_pred)
    accuracy = correct_predictions / y_true.size
    return accuracy * 100

def calculate_true_positive_rate(y_true, y_pred):
    """
    Sensitivity / recall in percent: TP / (TP + FN), with 1 as the positive label.

    Parameters:
    - y_true: array-like of true 0/1 labels
    - y_pred: array-like of predicted 0/1 labels

    Returns:
    - numpy float in [0, 100]; 0.0 when y_true contains no positive sample
      (the rate is undefined there, and 0/0 would otherwise produce NaN)
    """
    # Flat arrays so that list inputs are compared element-wise
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    actual_positives = np.sum(y_true == 1)
    if actual_positives == 0:
        return np.float64(0.0)

    true_positives = np.sum((y_true == 1) & (y_pred == 1))
    true_positive_rate = true_positives / actual_positives
    return true_positive_rate * 100

def calculate_true_negative_rate(y_true, y_pred):
    """
    Specificity in percent: TN / (TN + FP), with 0 as the negative label.

    Parameters:
    - y_true: array-like of true 0/1 labels
    - y_pred: array-like of predicted 0/1 labels

    Returns:
    - numpy float in [0, 100]; 0.0 when y_true contains no negative sample
      (the rate is undefined there, and 0/0 would otherwise produce NaN)
    """
    # Flat arrays so that list inputs are compared element-wise
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    actual_negatives = np.sum(y_true == 0)
    if actual_negatives == 0:
        return np.float64(0.0)

    true_negatives = np.sum((y_true == 0) & (y_pred == 0))
    true_negative_rate = true_negatives / actual_negatives
    return true_negative_rate * 100

def calculate_positive_predictive_value(y_true, y_pred):
    """
    Precision in percent: TP / (TP + FP), with 1 as the positive label.

    Parameters:
    - y_true: array-like of true 0/1 labels
    - y_pred: array-like of predicted 0/1 labels

    Returns:
    - numpy float in [0, 100]; 0.0 when nothing was predicted positive
      (precision is undefined there, and 0/0 would otherwise produce NaN)
    """
    # Flat arrays so that list inputs are compared element-wise
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    predicted_positives = np.sum(y_pred == 1)
    if predicted_positives == 0:
        return np.float64(0.0)

    true_positives = np.sum((y_true == 1) & (y_pred == 1))
    positive_predictive_value = true_positives / predicted_positives
    return positive_predictive_value * 100

def calculate_false_discovery_rate(y_true, y_pred):
    """
    False discovery rate in percent: FP / (FP + TP), i.e. 100 - precision.

    Parameters:
    - y_true: array-like of true 0/1 labels
    - y_pred: array-like of predicted 0/1 labels

    Returns:
    - numpy float in [0, 100]; 0.0 when nothing was predicted positive
      (the rate is undefined there, and 0/0 would otherwise produce NaN)
    """
    # Flat arrays so that list inputs are compared element-wise
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    predicted_positives = np.sum(y_pred == 1)
    if predicted_positives == 0:
        return np.float64(0.0)

    false_positives = np.sum((y_true == 0) & (y_pred == 1))
    false_discovery_rate = false_positives / predicted_positives
    return false_discovery_rate * 100

def calculate_f1_score(y_true, y_pred):
    """
    F1 score in percent: harmonic mean of precision and recall for label 1.

    Computed as 2*TP / (2*TP + FP + FN), which is algebraically the same as
    2 * precision * recall / (precision + recall) but stays well defined when
    there are no true positives (the two-step form yields NaN in that case).

    Parameters:
    - y_true: array-like of true 0/1 labels
    - y_pred: array-like of predicted 0/1 labels

    Returns:
    - numpy float in [0, 100]; 0.0 when there are no positives at all
      (TP = FP = FN = 0)
    """
    # Flat arrays so that list inputs are compared element-wise
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    true_positives = np.sum((y_true == 1) & (y_pred == 1))
    false_positives = np.sum((y_true == 0) & (y_pred == 1))
    false_negatives = np.sum((y_true == 1) & (y_pred == 0))

    denominator = 2 * true_positives + false_positives + false_negatives
    if denominator == 0:
        return np.float64(0.0)

    f1_score = 2 * true_positives / denominator
    return f1_score * 100

In [None]:
def print_performance_measures(y_true, y_pred):
    """
    Print accuracy, TPR, TNR, precision, FDR and F1 as a one-row grid table.

    Every metric is rounded to two decimals and rendered with a trailing '%'.

    Parameters:
    - y_true: numpy array of true 0/1 labels
    - y_pred: numpy array of predicted 0/1 labels
    """
    column_names = ['Accuracy', 'Sensitivity (TPR)', 'Specificity (TNR)', 'Precision', 'FDR', 'F1_score']

    # Same order as the column headers above
    metric_functions = (
        calculate_accuracy,
        calculate_true_positive_rate,
        calculate_true_negative_rate,
        calculate_positive_predictive_value,
        calculate_false_discovery_rate,
        calculate_f1_score,
    )
    rounded_scores = [metric(y_true, y_pred).round(2) for metric in metric_functions]

    # Single-row float matrix -> strings -> append the percent sign
    score_row = np.array([rounded_scores], dtype=float)
    table_cells = np.char.add(score_row.astype(str), '%')

    print(tabulate(table_cells, headers=column_names, tablefmt='grid'))


In [None]:
def load_and_split_dataset(filename, test_size=0.2, random_state=47):
  """
  Read a CSV file and split it into a training and a test DataFrame.

  Parameters:
  - filename: path of the CSV file. Relative paths are resolved against the
    current working directory; absolute paths and path objects work as well.
  - test_size: fraction (or absolute number) of rows put into the test split
  - random_state: seed of the shuffle, fixed by default for reproducibility

  Returns:
  - (df_train, df_test): the two DataFrames produced by train_test_split
  """
  # Pass the path through untouched: prefixing './' breaks absolute paths
  df = pd.read_csv(filename)
  df_train, df_test = train_test_split(df, test_size=test_size, random_state=random_state)
  return df_train, df_test

In [None]:
def churn_dataset_visualization(df1):
  """
  Draw one bar plot per feature, comparing retained and churned customers.

  Parameters:
  - df1: DataFrame with a 'Churn' column encoded as 0 (not churned) / 1 (churned).
    Every other column is plotted on the y axis of its own panel.
    NOTE(review): presumably the feature columns must be numeric for
    sns.barplot to aggregate them - confirm before calling on raw data.
  """
  colors = ['#E94B3C', '#2D2926']

  # Select features by name instead of assuming 'Churn' is the last column
  features = [column for column in df1.columns if column != 'Churn']
  if not features:
    return

  # squeeze=False keeps `axes` two-dimensional even with a single feature
  fig, axes = plt.subplots(nrows=len(features), ncols=1,
                           figsize=(5, 5 * len(features)), squeeze=False)

  for ax, feature in zip(axes[:, 0], features):
    # Fix the bar order explicitly so that the tick labels below are guaranteed
    # to match: position 0 is Churn == 0, position 1 is Churn == 1.
    sns.barplot(x='Churn', y=feature, data=df1, ax=ax, palette=colors, order=[0, 1])
    ax.set_title(f'{feature}')
    ax.set_ylabel('')
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['Not Churned', 'Churned'])

  plt.tight_layout()
  plt.show()

In [None]:
# churn_dataset_visualization(df_train)

In [None]:
def calculate_entropy(class_labels):
    """
    Shannon entropy (in bits) of a collection of class labels.

    Parameters:
    - class_labels: 1D array-like of labels

    Returns:
    - numpy float >= 0; 0.0 for an empty or single-class input
    """
    class_labels = np.asarray(class_labels)
    if class_labels.size == 0:
        return np.float64(0.0)

    _, counts = np.unique(class_labels, return_counts=True)
    probabilities = counts / counts.sum()

    # np.unique only reports labels that occur, so every probability is > 0 and
    # log2 is safe without a smoothing term (which would bias the result and
    # make the entropy of a pure set slightly negative).
    # Adding 0.0 turns a possible -0.0 into +0.0.
    return -np.sum(probabilities * np.log2(probabilities)) + 0.0

In [None]:
def information_gain(X, y, feature_index):
    """
    Calculate information gain for a specific feature in a dataset.

    The samples are partitioned by the distinct values of the feature; the gain
    is the entropy of y minus the size-weighted entropy of the partitions.

    NOTE(review): every distinct value forms its own partition, so a continuous
    column with (almost) all-unique values gets a gain close to the total
    entropy. Consider binning such columns before ranking them.

    Parameters:
    - X: pandas DataFrame or 2D numpy array, input features (rows are samples,
      columns are features)
    - y: pandas Series or 1D array-like, target variable, positionally aligned
      with the rows of X
    - feature_index: int, positional index of the feature column

    Returns:
    - information_gain: float, information gain for the specified feature
    """
    # Work on plain arrays so DataFrames and ndarrays are handled the same way
    if hasattr(X, 'iloc'):
        feature_values = X.iloc[:, feature_index].to_numpy()
    else:
        feature_values = np.asarray(X)[:, feature_index]
    labels = np.asarray(y)
    n_samples = len(labels)

    total_entropy = calculate_entropy(labels)

    weighted_entropies = []
    for value in np.unique(feature_values):
        in_subset = feature_values == value
        subset_entropy = calculate_entropy(labels[in_subset])
        weighted_entropies.append(np.sum(in_subset) / n_samples * subset_entropy)

    return total_entropy - np.sum(weighted_entropies)


In [None]:
def churn_dataset_feature_selection(X, y, k=15):
    """
    Keep the k features with the highest information gain with respect to y.

    Parameters:
    - X: pandas DataFrame of features
    - y: target labels, positionally aligned with the rows of X
    - k: int, number of features to keep; if k exceeds the number of columns,
      all columns are returned

    Returns:
    - DataFrame with the selected columns ordered by decreasing information
      gain (ties keep their original column order). It is an independent copy,
      so the caller can assign to its columns without SettingWithCopyWarning.
    """
    num_features = X.shape[1]
    gains = [information_gain(X, y, feature_index) for feature_index in range(num_features)]

    # Rank column positions by gain, best first (sorted is stable, also with reverse=True)
    ranked_indices = sorted(range(num_features), key=lambda index: gains[index], reverse=True)
    selected_columns = ranked_indices[:k]

    # .copy() detaches the result from X: .iloc alone returns a slice that
    # pandas flags when it is modified later on.
    return X.iloc[:, selected_columns].copy()


In [None]:
def churn_dataset_preprocessing(df, no_of_selected_features=16):
  """
  Turn a raw Telco churn DataFrame into a numeric feature matrix and a target.

  Steps: drop 'customerID', coerce 'TotalCharges' to numeric and mean-impute it,
  standardise the three numeric columns, encode 'Churn' ('Yes' -> 1, 'No' -> 0),
  keep the most informative features, label-encode two-valued categorical
  columns and one-hot encode the remaining categorical columns.

  The input frame is not modified.

  NOTE(review): imputer, scaler, feature selection and encoders are all fitted
  on the frame passed in. When train and test are preprocessed by separate
  calls, they are therefore transformed with different statistics and may
  end up with different columns; align them by column name after the call.

  Parameters:
  - df: DataFrame with the columns 'customerID', 'tenure', 'MonthlyCharges',
    'TotalCharges', 'Churn' plus the remaining feature columns
  - no_of_selected_features: int, number of features kept by
    churn_dataset_feature_selection (before one-hot encoding)

  Returns:
  - (features, target): encoded feature DataFrame and the 'Churn' Series
  """
  # Work on a private copy so the caller's frame stays intact and the calling
  # cell can be re-run without 'customerID' already being gone.
  df = df.drop(columns='customerID').copy()

  # Change TotalCharges to numeric and fill NaN values with the mean
  df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
  numeric_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
  df['TotalCharges'] = numeric_imputer.fit_transform(df[['TotalCharges']]).ravel()

  # Standardize numeric columns (each column is fitted independently)
  standard_scaler = StandardScaler()
  for key in ['tenure', 'MonthlyCharges', 'TotalCharges']:
    df[key] = standard_scaler.fit_transform(df[[key]]).ravel()

  # Encode the target without chained in-place replacement, which silently does
  # nothing under pandas copy-on-write. Values other than 'Yes'/'No' pass through.
  churn_codes = {'Yes': 1, 'No': 0}
  df_y = df['Churn'].map(lambda label: churn_codes.get(label, label))
  df = df.drop(columns='Churn')

  # Do feature selection before one-hot encoding. The selected columns are put
  # back into their original order: the information-gain ranking differs from
  # one data split to the next, and a rank-ordered matrix would hand the model
  # differently ordered columns for train and test.
  original_order = list(df.columns)
  selected = set(churn_dataset_feature_selection(df, df_y, no_of_selected_features).columns)
  df = df[[col for col in original_order if col in selected]].copy()

  # Identify categorical (non-numeric) features
  categorical_features = list(df.select_dtypes(exclude=[np.number]).columns)

  # Split them into two-valued and multi-valued features
  binary_categorical_features = [col for col in categorical_features if df[col].nunique() == 2]
  non_binary_categorical_features = [col for col in categorical_features if col not in binary_categorical_features]

  # Apply label encoding for binary categorical features
  label_encoder = LabelEncoder()
  for feature in binary_categorical_features:
    df[feature] = label_encoder.fit_transform(df[feature])

  # Apply one-hot encoding using get_dummies for non-binary categorical features
  df = pd.get_dummies(df, columns=non_binary_categorical_features)

  return df, df_y

In [None]:
df_train, df_test = load_and_split_dataset('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df_train, df_train_y = churn_dataset_preprocessing(df_train)
df_test, df_test_y = churn_dataset_preprocessing(df_test)

# Train and test are preprocessed by two independent calls, so feature selection
# and one-hot encoding can produce differently ordered (or even different)
# columns. The models consume plain NumPy matrices, so align the test columns
# to the training columns by name; a column the test split lacks is filled with 0.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

Index(['TotalCharges', 'MonthlyCharges', 'Contract', 'tenure',
       'OnlineSecurity', 'TechSupport', 'InternetService', 'OnlineBackup',
       'DeviceProtection', 'PaymentMethod', 'StreamingMovies', 'StreamingTV',
       'PaperlessBilling', 'Dependents', 'Partner', 'SeniorCitizen'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = label_encoder.fit_transform(df[feature])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = label_encoder.fit_transform(df[feature])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = label_encoder.fit_transform(df[feature])


Index(['TotalCharges', 'MonthlyCharges', 'Contract', 'tenure',
       'OnlineSecurity', 'TechSupport', 'InternetService', 'PaymentMethod',
       'DeviceProtection', 'OnlineBackup', 'StreamingMovies', 'StreamingTV',
       'PaperlessBilling', 'SeniorCitizen', 'Dependents', 'Partner'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = label_encoder.fit_transform(df[feature])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = label_encoder.fit_transform(df[feature])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = label_encoder.fit_transform(df[feature])


In [None]:
# AdaBoost resamples through NumPy's global random state; seed it so that
# re-running the notebook reproduces the numbers below.
np.random.seed(47)

# Convert once, and force a float dtype: a frame mixing bool dummies with numeric
# columns would otherwise be exported as an object array.
X_train, y_train = df_train.to_numpy(dtype=float), df_train_y.to_numpy()
X_test, y_test = df_test.to_numpy(dtype=float), df_test_y.to_numpy()

for K in [5, 10, 15, 20]:
  adaboost = AdaBoost(K)
  adaboost.fit(X_train, y_train)
  y_predict = adaboost.predict(X_test)
  print(f'K={K}: accuracy={calculate_accuracy(y_test, y_predict):.2f}%')

  predictions = np.sum(alpha * model.predict(X) for alpha, model in zip(self.model_weights, self.models))


77.85663591199432
77.64371894960965
77.85663591199432
77.71469127040454


In [None]:
# Detailed metrics for the AdaBoost ensemble. `y_predict` still holds the
# predictions of the last iteration of the loop in the previous cell (K = 20).
print_performance_measures(df_test_y.to_numpy(), y_predict)

+------------+---------------------+---------------------+-------------+-------+------------+
| Accuracy   | Sensitivity (TPR)   | Specificity (TNR)   | Precision   | FDR   | F1_score   |
| 77.71%     | 44.42%              | 90.23%              | 63.1%       | 36.9% | 52.13%     |
+------------+---------------------+---------------------+-------------+-------+------------+


In [None]:
# Baseline for comparison: a single LogisticRegression with its default
# hyper-parameters, trained on the full training set.
logistic_regression = LogisticRegression()
logistic_regression.fit(df_train.to_numpy(), df_train_y.to_numpy())
# Note: this overwrites `y_predict`, which held the AdaBoost predictions before.
y_predict = logistic_regression.predict(df_test.to_numpy())
print('\n')
print_performance_measures(df_test_y.to_numpy(), y_predict)

100%|██████████| 1000/1000 [00:00<00:00, 2539.64it/s]



+------------+---------------------+---------------------+-------------+--------+------------+
| Accuracy   | Sensitivity (TPR)   | Specificity (TNR)   | Precision   | FDR    | F1_score   |
| 77.64%     | 36.88%              | 92.97%              | 66.36%      | 33.64% | 47.41%     |
+------------+---------------------+---------------------+-------------+--------+------------+



