In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the Telco churn CSV; the "no service" placeholder strings are read as missing values.
df = pd.read_csv(
    "WA_Fn_UseC_Telco_Customer_Churn.csv",
    na_values=["No internet service", "No phone service"],
)

FileNotFoundError: ignored

In [None]:
# Drop every row that contains a missing value, then renumber the rows 0..n-1.
# reset_index() returns a new frame, so its result has to be assigned back;
# calling it as a bare statement leaves the old (gappy) index in place.
df = df.dropna().reset_index(drop=True)

# Show a small preview of the cleaned frame as the cell output.
df.head()

In [None]:
# Recode the "Churn" label column: "Yes" -> 1, "No" -> 0.
churn_codes = {"Yes": 1, "No": 0}
df = df.replace({"Churn": churn_codes})

In [None]:
# Remove the columns that are not used by the model.
df = df.drop(columns=["customerID", "Partner", "StreamingTV", "PhoneService"])

In [None]:
# Separate the feature columns into a categorical frame and a numerical frame.
categorical_feature_names = [
    "gender", "Dependents", "MultipleLines", "InternetService",
    "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport",
    "StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod",
]
numerical_feature_names = ["SeniorCitizen", "tenure", "TotalCharges", "MonthlyCharges"]

categorical_columns = pd.DataFrame(df, columns=categorical_feature_names)
numerical_columns = pd.DataFrame(df, columns=numerical_feature_names)

In [None]:
# NOTE(review): dropna() returns a new frame and neither result is assigned, so
# these two calls leave categorical_columns and numerical_columns unchanged.
# In a notebook only the value of the last expression is displayed.
categorical_columns.dropna()
numerical_columns.dropna()

In [None]:
# Define the feature matrix and the response vector.
# Only the categorical columns are used as features; "Churn" is not one of
# them, so no column filtering is needed here.

data_x = categorical_columns.loc[:] # full-frame slice of the categorical features
data_y = df["Churn"]  # label column

In [None]:
# Dropping the numerical columns -- currently disabled.
# The snippet below sits inside a bare triple-quoted string, so it is never executed.
'''
numerical_columns.drop(labels=numerical_columns, 
        axis=1,
        inplace=True)
'''

In [None]:
print(categorical_columns)  # Author's note: returned an empty DataFrame. NOTE(review): cause not visible here -- confirm.

In [None]:
# Split the dataset (80% training, 20% testing), once without and once with
# stratification on the label; both splits use random_state = 911.
split_options = {"train_size": 0.8, "random_state": 911}

# Plain random split (no stratification).
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, **split_options)

# Stratified split: class proportions of data_y are preserved in both parts.
(
    X_train_stratified,
    X_test_stratified,
    y_train_stratified,
    y_test_stratified,
) = train_test_split(data_x, data_y, stratify=data_y, **split_options)

**Data are now Preprocessed**

In [None]:
def calc_event_frequency(col, lbl):
    """Estimate per-class likelihoods for every distinct value of one feature.

    Parameters
    ----------
    col : array-like, 1-D
        Values of a single categorical feature, one entry per sample.
    lbl : array-like, 1-D
        Class label of each sample; 1 is treated as "yes" and 0 as "no".
        Samples carrying any other label are ignored.

    Returns
    -------
    list of [event, p_yes, p_no]
        One entry per distinct value in ``col`` (in ``np.unique`` order), where
        ``p_yes`` is P(event | yes) and ``p_no`` is P(event | no).  If a class
        has no samples at all, its likelihoods are reported as 0.0 instead of
        raising ``ZeroDivisionError``.
    """
    col = np.asarray(col)
    lbl = np.asarray(lbl)

    # Boolean masks let each count be a vectorised sum instead of a Python
    # loop over every row for every distinct value.
    yes_mask = lbl == 1
    no_mask = lbl == 0
    total_yes = int(yes_mask.sum())
    total_no = int(no_mask.sum())

    likelihoods = []
    for event in np.unique(col):
        event_mask = col == event
        count_yes = int((event_mask & yes_mask).sum())
        count_no = int((event_mask & no_mask).sum())
        likelihoods.append([
            event,
            count_yes / total_yes if total_yes else 0.0,  # P(event | yes)
            count_no / total_no if total_no else 0.0,     # P(event | no)
        ])

    return likelihoods

def fit(features, response):
    """Estimate the naive-Bayes parameters from categorical training data.

    Parameters
    ----------
    features : array-like, 2-D
        One row per sample and one column per categorical feature.
    response : array-like, 1-D
        Class label per sample; 1 means "yes" and 0 means "no", the same
        convention calc_event_frequency uses.

    Returns
    -------
    (list_of_likelihood, p_of_yes, p_of_no)
        ``list_of_likelihood[f]`` is the calc_event_frequency table of feature
        column ``f``; ``p_of_yes`` / ``p_of_no`` are the class priors.

    Raises
    ------
    ValueError
        If ``response`` is empty.
    """
    features = np.array(features)
    response = np.array(response)
    if response.size == 0:
        raise ValueError("fit() needs at least one training sample")

    # Priors are taken by explicit label value rather than by position in
    # np.unique's output, so a training set containing only one class no
    # longer raises IndexError and the yes/no meaning cannot be swapped.
    p_of_yes = np.mean(response == 1)
    p_of_no = np.mean(response == 0)

    # One likelihood table per feature column.
    list_of_likelihood = [
        calc_event_frequency(features[:, col], response)
        for col in range(features.shape[1])
    ]

    return list_of_likelihood, p_of_yes, p_of_no
        
# Learn the likelihood tables and the class priors from the training split.
# probability_table[f] holds the [event, P(event|yes), P(event|no)] rows of feature f.
probability_table, p_of_yes, p_of_no = fit(X_train, y_train)
for feature_likelihoods in probability_table:
    print(feature_likelihoods)

In [None]:
def predict(features, prob_table, p_yes, p_no):
    """Classify each sample with the naive-Bayes decision rule.

    Parameters
    ----------
    features : array-like, 2-D
        One row per sample; column ``j`` is looked up in ``prob_table[j]``.
    prob_table : list
        Per-feature lists of ``[event, P(event|yes), P(event|no)]`` entries.
    p_yes, p_no : float
        Prior probabilities of the "yes" and "no" classes.

    Returns
    -------
    list of int
        1 where the "yes" score is greater than or equal to the "no" score,
        otherwise 0.  A feature value with no matching entry in the table
        contributes no factor to either score.
    """
    features = np.array(features)
    predictions = []
    for row in features:
        score_yes = 1
        score_no = 1

        for j in range(len(row)):
            # Use the table passed in as an argument, not a module-level
            # variable, so the function works with any fitted table.
            for entry in prob_table[j]:
                if row[j] == entry[0]:
                    score_yes *= entry[1]
                    score_no *= entry[2]

        score_yes *= p_yes
        score_no *= p_no
        predictions.append(1 if score_yes >= score_no else 0)
    return predictions
# Predicted labels (1 / 0) for every row of the test split.
decesion_list = predict(
    features=X_test,
    prob_table=probability_table,
    p_yes=p_of_yes,
    p_no=p_of_no,
)

In [None]:
def confusion_matrix(decesion_list, y_test):
    """Count the four outcomes of a binary classification.

    Parameters
    ----------
    decesion_list : sequence
        Predicted labels (1 or 0), one per sample.
    y_test : sequence
        True labels (1 or 0) in the same order.  Any iterable is accepted
        (list, tuple, NumPy array, pandas Series); values are paired with the
        predictions by position, never by index label.

    Returns
    -------
    (tp, tn, fn, fp) : tuple of int
        Note the order: true positives, true negatives, false negatives,
        false positives.

    Raises
    ------
    ValueError
        If the two inputs do not have the same length.
    """
    predicted_labels = list(decesion_list)
    true_labels = list(y_test)
    if len(predicted_labels) != len(true_labels):
        raise ValueError(
            "decesion_list and y_test must have the same length "
            "({} != {})".format(len(predicted_labels), len(true_labels))
        )

    tp = tn = fp = fn = 0
    for predicted, actual in zip(predicted_labels, true_labels):
        if predicted == 1 and actual == 1:
            tp += 1
        elif predicted == 0 and actual == 0:
            tn += 1
        elif predicted == 0 and actual == 1:
            fn += 1
        elif predicted == 1 and actual == 0:
            fp += 1
    return tp, tn, fn, fp

# Confusion-matrix counts on the test split (note the return order: tp, tn, fn, fp).
tp, tn, fn, fp = confusion_matrix(decesion_list, y_test)

summary_template = "TP = {}, TN = {}, FP = {}, FN = {}"
print(summary_template.format(tp, tn, fp, fn))



In [None]:

def precision_score(tp, tn, fn, fp):
    """Return precision, TP / (TP + FP); 0.0 when nothing was predicted positive."""
    predicted_positive = tp + fp
    # Guard the degenerate case instead of raising ZeroDivisionError.
    return tp / predicted_positive if predicted_positive else 0.0
def accuracy_score(tp, tn, fn, fp):
    """Return accuracy, (TP + TN) / all samples; 0.0 when there are no samples."""
    total = tp + fn + tn + fp
    # Guard the degenerate case instead of raising ZeroDivisionError.
    return (tp + tn) / total if total else 0.0
def recall_score(tp, tn, fn, fp):
    """Return recall, TP / (TP + FN); 0.0 when there are no actual positives."""
    actual_positive = tp + fn
    # Guard the degenerate case instead of raising ZeroDivisionError.
    return tp / actual_positive if actual_positive else 0.0
def f1_score(tp, tn, fn, fp):
    """Return the F1 score, the harmonic mean of precision and recall.

    Computed in the closed form 2*TP / (2*TP + FP + FN), which is algebraically
    equal to 2*P*R / (P + R) but needs a single division.  Returns 0.0 when
    there are no true positives, false positives or false negatives, a case in
    which the P/R form divides by zero.
    """
    denominator = 2 * tp + fp + fn
    return 2 * tp / denominator if denominator else 0.0

# Report every metric computed from the confusion-matrix counts.
metric_reports = (
    ("Precision Score = {}", precision_score),
    ("Accuracy Score = {}", accuracy_score),
    ("Recall Score = {}", recall_score),
    ("F-1 Score = {}", f1_score),
)
for report_template, metric in metric_reports:
    print(report_template.format(metric(tp, tn, fn, fp)))

**For numerical feature columns**

In [None]:
# Feature matrix and response vector for the numerical-feature model.
numerical_data_x = numerical_columns.loc[:] # full-frame slice of the numerical columns
numerical_data_y = df["Churn"]  # same label column that data_y uses


def numerical_fit(features, labels):
    """Placeholder for fitting the model on numerical features.

    The function had no body, which made the whole cell a syntax error.
    Until the numerical model is written it fails loudly when called.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("numerical_fit is not implemented yet")


# Debug output: shows the container type of the label vector.
print(type(numerical_data_y))