# imports 

In [172]:
import pandas as pd
import numpy as np


# define functions 

In [173]:


# information gain to find the best feature
def info_gain(X,y,f) :
    # find unique values in the feature
    unique_vals = np.unique(X[f])
    # find entropy of the feature
    entropy_f = 0
    for i in unique_vals :
        entropy_f += (X[f] == i).sum()/X.shape[0]*np.log2((X[f] == i).sum()/X.shape[0])
    # find entropy of the dataset
    entropy_d = 0
    for i in np.unique(y) :
        entropy_d += (y == i).sum()/y.shape[0]*np.log2((y == i).sum()/y.shape[0])
    # find information gain
    return entropy_d - entropy_f





# tanh activation function
def tanh(x) :
    return np.tanh(x)



# evaluate the model
def evaluate_model(y_true,y_pred) :
    
    TP = 0
    FP = 0
    TN = 0
    FN = 0


    for i in range(y_true.shape[0]) :
        if y_true[i] == 1 and y_pred[i] == 1 :
            TP += 1
        elif y_true[i] == -1 and y_pred[i] == 1 :
            FP += 1
        elif y_true[i] == 1 and y_pred[i] == -1 :
            FN += 1
        else :
            TN += 1


    accuracy = (TP+TN)/(TP+TN+FP+FN)
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    specificity = TN/(TN+FP)
    f1 = 2*precision*recall/(precision+recall)

    print("-----------------------------------------------------")
    print("Accuracy: ",accuracy)
    print("Precision: ",precision)
    print("Specificity: ",specificity)
    print("Recall: ",recall)
    print("F1: ",f1)
    print("-----------------------------------------------------")



# logistic error 

def error(y,h) :
    return np.mean(abs(y-h))

# logistic regression implementation

def logistic_regression(X,y,err_tol=0.5,alpha=0.01):
    
    # initialize the weights
    w = np.zeros(X.shape[1])

    err = 1
    while err >= err_tol :
        # calculate the hypothesis
        h = tanh(np.dot(X,w))
        # calculate the gradient
        grad = np.dot(X.T,(h-y))
        # update the weights
        w = w - alpha*grad
        # calculate the error
        err = error(y,h)
        print("Error: ",err)

       
    
    return w
    





    


# load dataset

In [174]:
# read dataset 1
dataset1 = pd.read_csv("dataset_1.csv")
dataset1.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


# preprocessing Dataset 1

In [175]:
# drop the first column
dataset1.drop(['customerID'], axis=1, inplace=True)

dataset1['TotalCharges'] = pd.to_numeric(dataset1['TotalCharges'] , errors='coerce')
dataset1.TotalCharges.dtype

# covert the label to -1 and 1
dataset1['Churn'] = dataset1['Churn'].apply(lambda x: -1 if x == 'No' else 1)

# add bias column as first column
# dataset1.insert(0, 'bias', 1)



# normalize the data
dataset1['tenure'] = (dataset1['tenure'] - dataset1['tenure'].mean())/dataset1['tenure'].std()
dataset1['MonthlyCharges'] = (dataset1['MonthlyCharges'] - dataset1['MonthlyCharges'].mean())/dataset1['MonthlyCharges'].std()
dataset1['TotalCharges'] = (dataset1['TotalCharges'] - dataset1['TotalCharges'].mean())/dataset1['TotalCharges'].std()

# find categorical variables
cat_vars = []
for i in dataset1.columns :
    if dataset1[i].dtype == 'object' :
        cat_vars.append(i)

# convert categorical variables to integers
for i in cat_vars :
    dataset1[i] = dataset1[i].astype('category').cat.codes


# one hot encoding for categorical variables with less than 5 unique values
for i in cat_vars :
    if dataset1[i].nunique() < 5 :
        dataset1 = pd.get_dummies(dataset1,drop_first=True,columns=[i])

# drop rows with null values
dataset1.dropna(inplace=True)

# X and y
y = dataset1['Churn']
X = dataset1.drop(['Churn'],axis=1)


# split the data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

X_train.head()


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_1,Partner_1,Dependents_1,PhoneService_1,MultipleLines_1,MultipleLines_2,...,StreamingTV_1,StreamingTV_2,StreamingMovies_1,StreamingMovies_2,Contract_1,Contract_2,PaperlessBilling_1,PaymentMethod_1,PaymentMethod_2,PaymentMethod_3
2964,0,-0.340852,-0.213083,-0.413099,1,1,0,1,0,1,...,0,0,0,0,1,0,1,0,1,0
5113,0,1.572869,1.624734,2.595542,0,1,1,1,0,1,...,0,1,0,1,0,1,0,1,0,0
5363,0,1.532152,-1.309792,-0.250643,1,1,1,1,0,1,...,1,0,1,0,0,1,0,0,0,1
5074,0,0.677085,-1.50587,-0.600855,0,0,1,1,0,0,...,1,0,1,0,1,0,0,1,0,0
156,0,-0.422287,0.644343,-0.203528,0,0,0,1,0,1,...,0,1,0,0,0,0,1,0,0,0


# Test base model 

In [176]:

# model = logistic_regression(X_train,y_train)

# y_pred = np.sign(np.dot(X_test,model))

# # sklearn model evaluation
# from sklearn.metrics import accuracy_score

# accuracy_score(y_test,y_pred)


# AdaBoost

In [177]:

# resample data according to the weights

def resample(X,y,w) :
    # X, y : examples
    # w : weights

    # initialize the new dataset
    X_new = np.zeros(X.shape)
    y_new = np.zeros(y.shape)

    # find the index of the examples to be resampled
    index = np.random.choice(X.shape[0],X.shape[0],p=w)

    # resample the examples
    for i in range(X.shape[0]) :
        X_new[i] = X[index[i]]
        y_new[i] = y[index[i]]

    return X_new, y_new

def AdaBoost(X,y,K) :
    # X, y : examples
    # K : number of hypothesis in the ensemble

    # initialize the weights
    w = np.ones(X.shape[0])/X.shape[0]

    # initialize the hypothesis vector
    h = []

    # initialize the hypothesis weight vector
    z = []

    for i in range(K) :
        print('ensemble :', i+1, end=" ")
        X_new, y_new = resample(X,y,w)
        
        
        # Kth hypothesis
        Th = logistic_regression(X_new,y_new)

        # find the error
        error = 0

        for j in range(X.shape[0]) :
            if np.sign(y[j]) != np.sign(np.dot(X[j],Th)) :
                error += w[j]
        
        print('error :', error)
        if error > 0.5 :
            continue

        for j in range(X.shape[0]) :
            if np.sign(y[j]) == np.sign(np.dot(X[j],Th)) :
                w[j] = w[j]*error/(1-error)
            
        # normalize weights
        w = w/np.sum(w)

        # find the hypothesis weight
        h.append(Th)
        z.append(np.log((1-error)/error))

    
    return weighted_majority(h,z)


def weighted_majority(h,z) :
    h_new = np.zeros(h[0].shape)
    for i in range(len(z)) :
        print('weight :', z[i])
        h_new += z[i]*h[i]

    return h_new




In [178]:
# train the model
Th = AdaBoost(X_train,y_train,5)

# evaluate the model
y_pred = np.tanh(np.dot(X_test,Th))
y_pred = np.sign(y_pred)

# sklearn model evaluation
from sklearn.metrics import accuracy_score

accuracy_score(y_test,y_pred)




ensemble : 1 

TypeError: 'int' object is not subscriptable

# evaluate model