In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import sys
import time
import json
import re
from IPython.display import display
from time import strftime, gmtime

from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, KBinsDiscretizer, LabelEncoder
# Column Transformer
from sklearn.compose import ColumnTransformer

In [None]:
# Load the churn dataset (the path is relative to the notebook's working directory).
df = pd.read_csv('downloads/Master_churn_data.csv')

In [None]:
pd.set_option('display.max_columns', 30)

In [None]:
df.head()

In [None]:
df.dtypes

In [None]:
# Summary statistics of the numeric columns. describe must be *called*;
# the bare attribute only displays the bound-method repr.
df.describe()

In [None]:
# Relative frequency of every level, shown once per object-dtype column
categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    frequency_table = pd.crosstab(index=df[column],
                                  columns='% observations',
                                  normalize='columns')
    display(frequency_table)

In [None]:
# Histograms for each numeric feature (30 bins, shared y-axis)
hist = df.hist(bins=30, sharey=True, figsize=(10, 10))

In [None]:
#df['Churn'] = df['Churn'].astype(object)

In [None]:
# Cast advertiser_id to object dtype so the dtype-based column selections in the
# following cells treat it as categorical rather than numeric.
df['advertiser_id'] = df['advertiser_id'].astype(object)

In [None]:
# Distribution of each categorical column within each Churn class
# (normalize='columns' makes every Churn column sum to 1)
for column in df.select_dtypes(include=['object']).columns:
    if column == 'Churn':
        continue
    churn_breakdown = pd.crosstab(index=df[column],
                                  columns=df['Churn'],
                                  normalize='columns')
    display(churn_breakdown)

In [None]:
# For every non-object column, print its name and draw one histogram figure
# split into panels by the Churn value (30 bins each).
for column in df.select_dtypes(exclude=['object']).columns:
    print(column)
    hist = df[[column, 'Churn']].hist(by='Churn', bins=30)
    plt.show()

In [None]:
# Pairwise correlation of the numeric/bool columns only. The frame still holds
# object (string) columns at this point, and DataFrame.corr() raises on them in
# pandas >= 2.0 instead of silently skipping them, so select explicitly.
display(df.select_dtypes(include=['number', 'bool']).corr())

In [None]:
# Pairwise scatter plots of the columns of df on a 12x12 inch figure.
scatter_matrix = pd.plotting.scatter_matrix(df, figsize=(12, 12))

# Re-apply each subplot's axis labels with a smaller, rotated font so the
# column names stay readable in the grid.
for ax in scatter_matrix.ravel():
    ax.set_xlabel(ax.get_xlabel(), fontsize = 10, rotation = 45)
    ax.set_ylabel(ax.get_ylabel(), fontsize = 10, rotation = 45)        
    
plt.show()    

In [None]:
# Drop columns that are not used as model inputs:
#   - rext_USD_plan_rate: per the original note, highly correlated with other columns
#   - advertiser_id, advertiser_name, global_account_name: these look like
#     identifier / name columns rather than features (TODO confirm)
# A single non-in-place drop with errors='ignore' keeps the cell safe to re-run;
# the original in-place drops raised KeyError on a second execution.
columns_to_drop = ['rext_USD_plan_rate', 'advertiser_id',
                   'advertiser_name', 'global_account_name']
df = df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Missing data: null count and null fraction per column, worst columns first
null_counts = df.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / len(df)).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(25)

In [None]:
df.dtypes

In [None]:
# Known levels of each categorical column. The encoders below are fitted on
# these fixed lists rather than on the data itself.
sales_geo_team_list = [
    'UK', 'EE', 'BENELUX', 'FRANCE', 'NORDICS', 'DACH', 'IBERIA',
    'TURKEY', 'ITALY', 'EMEA', 'MEA', 'EASTERN EUROPE', 'GROUP', 'CMR',
    'NORTHERN EUROPE', 'AMS BRAND AS ATO', 'CORE US',
    'SOUTH EAST ASIA', 'RETAIL BNL', 'RUSSIA', 'NB CORE US',
    'SUPPLY - CPP - GERMANY', 'BRAZIL', 'BRAND FRANCE AS', 'NB LATAM',
]
sales_subregion_list = [
    'CORE EUROPE', 'EMERGING MARKETS', 'AGENCY', 'NEW BUSINESS',
    'KSA EUROPE', 'EUROPE', 'EMEA', 'GROUP', 'AMERICAS BRAND AS', 'US',
    'SOUTH ASIA', 'EMEA RETAIL NEE', 'SUPPLY - CPP - EMEA', 'LATAM',
    'EMEA BRAND SEE', 'EMEA RETAIL SEE',
]
sales_channel_list = ['Direct', 'Agency']
product_taxonomy_label_list = [
    'commerce growth - acquisition',
    'commerce growth - retention retargeting',
    'commerce growth - retention',
    'unknown',
    'undefined',
]
Stream_list = ['CORE', 'EM', 'KSA', 'AG', 'EMEA', 'NEW BUSINESS']
Churn_list = ['True.', 'False.']

# One LabelEncoder per column, mapping class labels to integers.
# fit() returns the encoder itself, so construction and fitting are chained.
sales_geo_team_le = LabelEncoder().fit(sales_geo_team_list)
sales_subregion_le = LabelEncoder().fit(sales_subregion_list)
sales_channel_le = LabelEncoder().fit(sales_channel_list)
product_taxonomy_label_le = LabelEncoder().fit(product_taxonomy_label_list)
stream_le = LabelEncoder().fit(Stream_list)
churn_le = LabelEncoder().fit(Churn_list)

In [None]:
df.head()

In [None]:
# Replace the string labels of each categorical column (and the Churn target)
# with the integer codes of its fitted encoder.
column_encoders = {
    'sales_geo_team': sales_geo_team_le,
    'sales_subregion': sales_subregion_le,
    'sales_channel': sales_channel_le,
    'product_taxonomy_label': product_taxonomy_label_le,
    'Stream': stream_le,
    'Churn': churn_le,
}
for encoded_column, encoder in column_encoders.items():
    df[encoded_column] = encoder.transform(df[encoded_column])

In [None]:
df.dtypes

In [None]:
#df['Churn'] = df['Churn'].astype(int)

In [None]:
# Columns that are one-hot encoded by the column transformer
categorical_features = [
    'sales_geo_team',
    'sales_subregion',
    'sales_channel',
    'product_taxonomy_label',
    'Stream',
]

# Columns that are standardised by the column transformer
numeric_features = [
    'displays',
    'clicks',
    'revenue_USD_plan_rate',
    'no_of_days',
]

In [None]:
categorical_features + numeric_features

In [None]:
# Preprocessing: one-hot encode the categorical columns (dense output) and
# standardise the numeric columns; any other column passed in is kept unchanged
# (remainder="passthrough").
# NOTE(review): `sparse=False` is the older OneHotEncoder keyword; newer
# scikit-learn releases name it `sparse_output` — confirm against the installed version.
colTransformer = ColumnTransformer([('onehot',
                                     OneHotEncoder(categories='auto',sparse=False),
                                     categorical_features),
                                    ('standardize',
                                    StandardScaler(),numeric_features)
                                   ],
                                   remainder="passthrough")

In [None]:
# Fit the one-hot encoder and the scaler on the selected feature columns.
# NOTE(review): this fits on the full DataFrame, before the train/validation/test
# split in the next cell, so the scaler statistics include validation and test rows
# (data leakage). Fitting on the training split only would avoid this, but the
# one-hot encoder would then need a policy for categories unseen in training.
colTransformer.fit(df[categorical_features + numeric_features])

In [None]:
# Shuffle once with a fixed seed, then cut at 70% / 90% of the rows to get the
# train / validation / test splits. Positional iloc slices replace np.split,
# which goes through the deprecated DataFrame.swapaxes when given a DataFrame.
shuffled_df = df.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(shuffled_df))
validation_end = int(0.9 * len(shuffled_df))

train_data = shuffled_df.iloc[:train_end]
validation_data = shuffled_df.iloc[train_end:validation_end]
test_data = shuffled_df.iloc[validation_end:]

In [None]:
print(train_data.shape,validation_data.shape,test_data.shape)

In [None]:
# Output columns: the target first, then the categorical and numeric inputs
columns = [
    "Churn",
    "sales_geo_team",
    "sales_subregion",
    "sales_channel",
    "product_taxonomy_label",
    "Stream",
    "displays",
    "clicks",
    "revenue_USD_plan_rate",
    "no_of_days",
]

In [None]:
train_data[columns].head()

In [None]:
# Apply the fitted column transformer to the feature columns of each split
feature_columns = categorical_features + numeric_features

train_data_transformed = colTransformer.transform(train_data[feature_columns])
validation_data_transformed = colTransformer.transform(validation_data[feature_columns])
test_data_transformed = colTransformer.transform(test_data[feature_columns])

In [None]:
print(train_data_transformed.shape, validation_data_transformed.shape, test_data_transformed.shape)

In [None]:
train_data['Churn'].values

In [None]:
# Prepend the Churn label as column 0 of each transformed feature matrix.
# Selecting with [['Churn']] yields an (n, 1) array that concatenates directly.
train_data_transformed = np.concatenate(
    (train_data[['Churn']].values, train_data_transformed), axis=1)
validation_data_transformed = np.concatenate(
    (validation_data[['Churn']].values, validation_data_transformed), axis=1)
test_data_transformed = np.concatenate(
    (test_data[['Churn']].values, test_data_transformed), axis=1)

In [None]:
print(train_data_transformed.shape, validation_data_transformed.shape, test_data_transformed.shape)

In [None]:
train_data_transformed[:1]

In [None]:
# https://stackoverflow.com/questions/6081008/dump-a-numpy-array-into-a-csv-file
# Write Training Set: label in column 0 followed by the transformed features,
# comma-separated, no header row, every value formatted with '%.5e'.
np.savetxt('train_onehot.csv',train_data_transformed,delimiter=",",fmt='%.5e')

In [None]:
# Write Validation Set: label in column 0 followed by the transformed features,
# comma-separated, no header row, every value formatted with '%.5e'.
np.savetxt('validation_onehot.csv',validation_data_transformed,delimiter=",",fmt='%.5e')

In [None]:
# Write Test Set: label in column 0 followed by the transformed features,
# comma-separated, no header row, every value formatted with '%.5e'.
np.savetxt('test_onehot.csv',test_data_transformed,delimiter=",",fmt='%.5e')

In [None]:
# Write Training Set (label-encoded columns before one-hot/scaling; no index, no header)
train_data.to_csv('train.csv', index=False, header=False, columns=columns)

In [None]:
# Write Validation Set (label-encoded columns before one-hot/scaling; no index, no header)
validation_data.to_csv('validation.csv', index=False, header=False, columns=columns)

In [None]:
# Write Test Set (label-encoded columns before one-hot/scaling; no index, no header)
test_data.to_csv('test.csv', index=False, header=False, columns=columns)

In [None]:
# Write Column List: a single comma-separated line naming the columns of the
# header-less train/validation/test CSV files
column_list_line = ','.join(columns)
with open('column_list.txt', 'w') as f:
    f.write(column_list_line)

In [None]:
#pip install tensorflow==1.2.0 --ignore-installed

In [None]:
 #pip install tensorflow

In [None]:
# https://keras.io/
# https://github.com/keras-team/keras/issues/2743
# Change Kernel to use Tensor Flow. For example: conda_tensorflow_p36
import sys
import numpy as np
# Set random seed
np.random.seed(0)

import pandas as pd
import matplotlib.pyplot as plt
import itertools

from sklearn.metrics import classification_report, confusion_matrix

# Column Transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, KBinsDiscretizer

# Keras Library
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

In [None]:
ls

In [None]:
# One-hot encoded splits written by the data-preparation cells
train_file, validation_file, test_file = (
    'train_onehot.csv',
    'validation_onehot.csv',
    'test_onehot.csv',
)

In [None]:
# The files have no header row, so read them with header=None; the columns are
# then addressed by position (0 = label, 1.. = features) in the next cells.
df_train = pd.read_csv(train_file, header=None)
df_validation = pd.read_csv(validation_file, header=None)
df_test = pd.read_csv(test_file, header=None)

In [None]:
df_train.shape

In [None]:
# Column 0 of each file is the Churn target; every later column is a feature.
# Series.to_numpy() returns the same 1-D array that Series.ravel() did, without
# relying on ravel(), which is deprecated in recent pandas releases.
X_train = df_train.iloc[:, 1:]
y_train = df_train.iloc[:, 0].to_numpy()

X_validation = df_validation.iloc[:, 1:]
y_validation = df_validation.iloc[:, 0].to_numpy()

X_test = df_test.iloc[:, 1:]
y_test = df_test.iloc[:, 0].to_numpy()

In [None]:
# https://keras.io/getting-started/sequential-model-guide/
model = Sequential()
# Architecture and training setup:
#   hidden layer  - 1 hidden layer with 30 neurons, relu activation
#   output layer  - binary classification, so 1 neuron with sigmoid activation
#   optimizer     - rmsprop is used here (adam is the alternative)
#   loss function - logistic loss, called binary cross entropy in keras
#   metrics       - accuracy is reported in addition to the loss
model.add(Dense(30, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Import the callback from tensorflow.keras, the same package the Sequential
# model and Dense layers come from. Mixing standalone `keras` callbacks with a
# `tensorflow.keras` model is a known source of attribute errors during fit().
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Stop training when the validation loss has not improved (decreased) for
# 2 consecutive epochs; verbose=1 logs the epoch at which training stopped.
early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

In [None]:
# Train the model for up to 20 epochs, iterating on the data in batches of 32 samples.
# The validation split is evaluated each epoch and drives the early-stopping callback.
history = model.fit(X_train, y_train, epochs=20, batch_size=32,
         validation_data=(X_validation,y_validation),callbacks=[early_stopping])

In [None]:
# Training and validation loss per epoch, drawn on the same axes
loss_series = (
    ('loss', 'Training Error'),
    ('val_loss', 'Validation Error'),
)
for history_key, series_label in loss_series:
    plt.scatter(x=history.epoch, y=history.history[history_key], label=series_label)

plt.grid(True)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Training Vs Validation Error')
plt.legend()
plt.show()

In [None]:
# Predict the output of the sigmoid unit for each test observation, i.e. a score
# between 0 and 1; it is thresholded into a class in a later cell.
result = model.predict(X_test)

In [None]:
result[:10]

In [None]:
np.savetxt('results.csv',result,delimiter=",",fmt='%.5e')

In [None]:
# Files for evaluation: the saved column names and the label-encoded test split.
# Note that test_file is re-pointed here from 'test_onehot.csv' to 'test.csv'.
column_list_file = 'column_list.txt'
test_file = 'test.csv'

In [None]:
# Read the comma-separated column names written during data preparation.
# strip() keeps a trailing newline (e.g. after a manual edit of the file) from
# ending up inside the last column name; the dummy '' initialisation is dropped
# because `columns` is always assigned inside the with-block.
with open(column_list_file, 'r') as f:
    columns = f.read().strip().split(',')

In [None]:
df_test = pd.read_csv(test_file,names=columns)

In [None]:
df_test['predicted_prob'] = result

In [None]:
# Threshold the predicted score: above 0.5 becomes class 1, otherwise class 0.
df_test['predicted_class'] = np.where(result > 0.5,1,0)

In [None]:
df_test[['Churn','predicted_class', 'predicted_prob']].head(10)

In [None]:
# Reference: https://scikit-learn.org/stable/modules/model_evaluation.html
# Labels are stated explicitly as [1, 0] (1 = churn, 0 = stay), so row/column 0
# of the confusion matrix is the positive class and row/column 1 the negative class.
def true_positive(y_true, y_pred):
    """Count the samples whose true label and predicted label are both 1."""
    matrix = confusion_matrix(y_true, y_pred, labels=[1, 0])
    return matrix[0, 0]

def true_negative(y_true, y_pred):
    """Count the samples whose true label and predicted label are both 0."""
    matrix = confusion_matrix(y_true, y_pred, labels=[1, 0])
    return matrix[1, 1]

def false_positive(y_true, y_pred):
    """Count the samples with true label 0 that were predicted as 1."""
    matrix = confusion_matrix(y_true, y_pred, labels=[1, 0])
    return matrix[1, 0]

def false_negative(y_true, y_pred):
    """Count the samples with true label 1 that were predicted as 0."""
    matrix = confusion_matrix(y_true, y_pred, labels=[1, 0])
    return matrix[0, 1]

In [None]:
# Compute Binary Classifier Metrics
# Returns a dictionary {"MetricName":Value,...}

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is 0."""
    if denominator == 0:
        return 0.0
    return numerator / denominator


def binary_classifier_metrics(y_true, y_pred):
    """Compute confusion-matrix counts and the rates derived from them.

    References:
      https://docs.aws.amazon.com/machine-learning/latest/dg/binary-classification.html
      https://en.wikipedia.org/wiki/Confusion_matrix

    Parameters
    ----------
    y_true : array-like of 0/1 labels
        Actual classes (1 = positive, 0 = negative).
    y_pred : array-like of 0/1 labels
        Predicted classes, same length as ``y_true``.

    Returns
    -------
    dict
        Keys, in insertion order: TruePositive, TrueNegative, FalsePositive,
        FalseNegative, Positive, Negative, Recall, TrueNegativeRate, Precision,
        Accuracy, FalsePositiveRate, FalseNegativeRate, F1.

    Notes
    -----
    Every rate is computed with a zero-denominator guard, so an input without
    positive samples, without negative samples, or without any samples yields
    0.0 for the affected rates instead of a division-by-zero warning and NaN.
    The original guarded only some rates (and tested the numerator); the
    FalseNegativeRate and Accuracy divisions were unguarded.
    """
    # Counts:
    #   tp = positives correctly classified as positive
    #   tn = negatives correctly classified as negative
    #   fp = negatives mis-classified as positive
    #   fn = positives mis-classified as negative
    tp = true_positive(y_true, y_pred)
    tn = true_negative(y_true, y_pred)
    fp = false_positive(y_true, y_pred)
    fn = false_negative(y_true, y_pred)

    positive = tp + fn  # number of actual positive samples
    negative = tn + fp  # number of actual negative samples

    # Recall (true positive rate): fraction of positives found. Closer to 1 is better.
    recall = _safe_ratio(tp, positive)
    # True negative rate: fraction of negatives found. Closer to 1 is better.
    tnr = _safe_ratio(tn, negative)
    # Precision: fraction of predicted positives that are real. Closer to 1 is better.
    precision = _safe_ratio(tp, tp + fp)
    # Accuracy: fraction of all samples classified correctly. Closer to 1 is better.
    accuracy = _safe_ratio(tp + tn, positive + negative)
    # False positive rate (false alarms): fraction of negatives flagged. Closer to 0 is better.
    fpr = _safe_ratio(fp, negative)
    # False negative rate (misses): fraction of positives missed. Closer to 0 is better.
    fnr = _safe_ratio(fn, positive)
    # F1: harmonic mean of precision and recall; 0.0 when both are 0.
    f1 = _safe_ratio(2 * precision * recall, precision + recall)

    metrics = {
        'TruePositive': tp,
        'TrueNegative': tn,
        'FalsePositive': fp,
        'FalseNegative': fn,
        'Positive': positive,
        'Negative': negative,
        'Recall': recall,
        'TrueNegativeRate': tnr,
        'Precision': precision,
        'Accuracy': accuracy,
        'FalsePositiveRate': fpr,
        'FalseNegativeRate': fnr,
        'F1': f1,
    }
    return metrics

In [None]:
# Reference:
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    cm        -- 2-D array of counts, rows = true labels, columns = predicted labels
    classes   -- tick labels, in the same order as the rows/columns of cm
    normalize -- when True, each row is divided by its sum and the cells are
                 annotated with two decimals instead of integer counts
    title     -- figure title
    cmap      -- matplotlib colour map used for the heat map
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=45)
    plt.yticks(tick_positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text so they stay readable.
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            cell_value = cm[row, col]
            text_color = "white" if cell_value > midpoint else "black"
            plt.text(col, row, format(cell_value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Compute confusion matrix
# 1 = customer left/churn, 0 = stayed
cnf_matrix = confusion_matrix(df_test['Churn'], df_test['predicted_class'],labels=[1,0])

In [None]:
# Plot confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix,classes=['Churn','Stay'],
                      title='Customer Churn')

In [None]:
# Plot the row-normalized confusion matrix: each cell is the fraction of its
# true class (row) that received the given predicted label.
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['Churn','Stay'],
                      title='Customer Churn - Fraction', normalize=True)

In [None]:
# Collect the test-set metrics into a one-row DataFrame whose row is labelled 'Model'.
metrics = [binary_classifier_metrics(df_test['Churn'], df_test['predicted_class'])]
df_metrics=pd.DataFrame.from_dict(metrics)
df_metrics.index = ['Model']