# imports 

In [71]:
import pandas as pd
import numpy as np


# define functions 

In [72]:


# information gain to find the best feature
def info_gain(X,y,f) :
    # find unique values in the feature
    unique_vals = np.unique(X[f])
    # find entropy of the feature
    entropy_f = 0
    for i in unique_vals :
        entropy_f += (X[f] == i).sum()/X.shape[0]*np.log2((X[f] == i).sum()/X.shape[0])
    # find entropy of the dataset
    entropy_d = 0
    for i in np.unique(y) :
        entropy_d += (y == i).sum()/y.shape[0]*np.log2((y == i).sum()/y.shape[0])
    # find information gain
    return entropy_d - entropy_f





# tanh activation function
def tanh(x) :
    return np.tanh(x)


# sklearn confusion matrix
def confusion_matrix(y_true,y_pred) :
    from sklearn.metrics import confusion_matrix
    return confusion_matrix(y_true,y_pred)


def evaluate_model(y_true,y_pred) :
    
    cm = confusion_matrix(y_true,y_pred)

    tn, fp, fn, tp = cm.ravel()

    accuracy = (tp+tn)/(tp+tn+fp+fn)
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1 = 2*precision*recall/(precision+recall)

    print("-----------------------------------------------------")
    print("Accuracy : ",accuracy)
    print("Precision : ",precision)
    print("Recall : ",recall)
    print("F1 Score : ",f1)
    print("_____________________________________________________")




# logistic error 

def error(y,h) :
    return np.sum((y-h)**2)/y.shape[0]

# logistic regression implementation

def logistic_regression(X,y,iteration=1000,alpha=0.01):
    
    # initialize the weights
    w = np.zeros(X.shape[1])
     
    for i in range(iteration):
        # hypothesis
        h = tanh(np.dot(X,w.T))

        # update weights
       
        temp1 = y - h
        temp2 = 1 - h**2
        grad = np.dot(X.T,temp1*temp2)/ X.shape[0]
        w = w + alpha*grad

        #print(error(y,h))
       
    return w



# resample data according to the weights

def resample(X,y,w) :
    # X, y : examples
    # w : weights

    # initialize the new dataset
    X_new = np.zeros(X.shape)
    y_new = np.zeros(y.shape)

    # find the index of the examples to be resampled
    index = np.random.choice(X.shape[0],X.shape[0],replace=True,p=w)
    

    # resample the examples
    for i in range(X.shape[0]):
        X_new[i] = X.iloc[index[i]]
        y_new[i] = y.iloc[index[i]]
    

    return X_new, y_new

def AdaBoost(X,y,K) :
    # X, y : examples
    # K : number of hypothesis in the ensemble

    # initialize the weights
    w = np.ones(X.shape[0])/X.shape[0]

    # initialize the hypothesis vector
    h = []

    # initialize the hypothesis weight vector
    z = []

    for i in range(K) :
        # resample the data according to the weights
        X_new, y_new = resample(X,y,w)
   
        # Kth hypothesis
        Th = logistic_regression(X_new,y_new)

        # find the error
        error = 0

        for j in range(X.shape[0]) :
            if y_new[j] != np.sign(np.dot(X_new[j],Th.T)) :
                error += w[j]
            
        if error > 0.5 :
            continue

        for j in range(X.shape[0]) :
            if y_new[j] == np.sign(np.dot(X_new[j],Th.T)) :
                w[j] = w[j]*error/(1-error)
        
        # normalize the weights
        w = w/np.sum(w)

        # append the hypothesis and its weight
        h.append(Th)
        z.append((1-error)/error)
        print("Ensemble : ",i+1, "weight : ",z[i])
    
        
    return weighted_majority(h,z)


def weighted_majority(h,z) :
    h_new = np.zeros(h[0].shape)
    for i in range(len(z)) :
        h_new += z[i]*h[i]

    return h_new







# Dataset 1

In [74]:
# read dataset 1
dataset1 = pd.read_csv("dataset_1.csv")
dataset1.head()

# drop the first column
dataset1.drop(['customerID'], axis=1, inplace=True)

dataset1['TotalCharges'] = pd.to_numeric(dataset1['TotalCharges'] , errors='coerce')
dataset1.TotalCharges.dtype

# covert the label to -1 and 1
dataset1['Churn'] = dataset1['Churn'].apply(lambda x: -1 if x == 'No' else 1)

# add bias column as first column
dataset1.insert(0, 'bias', 1)



# normalize the data
dataset1['tenure'] = (dataset1['tenure'] - dataset1['tenure'].mean())/dataset1['tenure'].std()
dataset1['MonthlyCharges'] = (dataset1['MonthlyCharges'] - dataset1['MonthlyCharges'].mean())/dataset1['MonthlyCharges'].std()
dataset1['TotalCharges'] = (dataset1['TotalCharges'] - dataset1['TotalCharges'].mean())/dataset1['TotalCharges'].std()

# find categorical variables
cat_vars = []
for i in dataset1.columns :
    if dataset1[i].dtype == 'object' :
        cat_vars.append(i)

# convert categorical variables to integers
for i in cat_vars :
    dataset1[i] = dataset1[i].astype('category').cat.codes


# one hot encoding for categorical variables with less than 5 unique values
for i in cat_vars :
    if dataset1[i].nunique() < 3 :
        dataset1 = pd.get_dummies(dataset1,drop_first=True,columns=[i])

# drop rows with null values
dataset1.dropna(inplace=True)

# X and y
y = dataset1['Churn']
X = dataset1.drop(['Churn'],axis=1)


# split the data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)


# -------------------------------------------------------------

model = AdaBoost(X_train,y_train,10)

h = np.dot(X_test,model.T)
y_pred = np.sign(h).astype(int)


print(confusion_matrix(y_test,y_pred))
evaluate_model(y_test,y_pred)




Ensemble :  1 weight :  3.9385425812116153
Ensemble :  2 weight :  3.905775446610879
Ensemble :  3 weight :  4.08505678470163
Ensemble :  4 weight :  3.8875477813184274
Ensemble :  5 weight :  4.012056798179578
Ensemble :  6 weight :  4.455599664559551
Ensemble :  7 weight :  3.8453443860954537
Ensemble :  8 weight :  4.073617436603568
Ensemble :  9 weight :  4.8652838512249375
Ensemble :  10 weight :  5.118234884269043
[[946  92]
 [207 162]]
-----------------------------------------------------
Accuracy :  0.7874911158493249
Precision :  0.6377952755905512
Recall :  0.43902439024390244
F1 Score :  0.5200642054574639
_____________________________________________________
