In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('../scripts')))
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import plotly.express as px

In [2]:
import warnings
warnings.filterwarnings('ignore')
import dvc.api
from processing import processing

In [3]:
import numpy as np
# from gittypes import tbd
import pandas as pd
import Plots
import scipy.stats as scs
import random
import math
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image  
import pydotplus
from sklearn import tree
import scipy.stats as stat
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



In [4]:
csv_path = "../Data/data.csv"

In [5]:
processing = processing()

In [6]:
df= processing.read_csv(csv_path)
df.head()

file read as csv


Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0


In [7]:
def get_data(tag, path='Data/clean_data.csv', repo='https://github.com/daniEL2371/abtest-mlops'):
    rev = tag
    data_url = dvc.api.get_url(path=path, repo=repo, rev=rev)
    df = pd.read_csv(data_url)
    return df

In [8]:

def drop_no_responds(df):
    cleaned_df = df.query("not (yes == 0 & no == 0)")
    return cleaned_df


In [9]:

def read_model(file_name):
    with open(f"../models/{file_name}.pkl", "rb") as f:
        return pickle.load(f)

def write_model(file_name, model):
    with open(f"../models/{file_name}.pkl", "wb") as f:
        pickle.dump(model, f)


In [10]:
cleaned_df = drop_no_responds(df)
cleaned_df['aware'] = cleaned_df['yes'].map(lambda x: x==1)
cleaned_df = cleaned_df.drop(columns = ['yes', 'no', 'auction_id'], axis=1)

In [11]:
def save_cleaned_data():
    CLEANED_CSV_PATH = "../Data/clean_data.csv"
    helper.save_csv(cleaned_df, CLEANED_CSV_PATH)

In [12]:
# label encoding
def encode_labels(df):
    date_encoder = preprocessing.LabelEncoder()
    device_encoder = preprocessing.LabelEncoder()
    browser_encoder = preprocessing.LabelEncoder()
    experiment_encoder = preprocessing.LabelEncoder()
    aware_encoder = preprocessing.LabelEncoder()
    
    df['date'] = date_encoder.fit_transform(df['date'])
    df['device_make'] = device_encoder.fit_transform(df['device_make'])
    df['browser'] = browser_encoder.fit_transform(df['browser'])
    df['experiment'] = experiment_encoder.fit_transform(cleaned_df['experiment'])
    df['browser'] = aware_encoder.fit_transform(df['browser'])
    df['aware'] = aware_encoder.fit_transform(df['aware'])


    
    return df
    
    

In [13]:
# splitting database

def feature_data(cleaned_df):
    
    broweser_df = cleaned_df[["experiment", "hour", "date", 'device_make', 'browser', 'aware']]
    platfrom_df = cleaned_df[["experiment", "hour", "date", 'device_make', 'platform_os', 'aware']]

    return broweser_df, platfrom_df

In [14]:
def save_encoded_df():
    
    broweser_df, platfrom_df = feature_data(encoded_df)
    helper.save_csv(broweser_df, "../Data/clean_data.csv")
    helper.save_csv(platfrom_df, "../Data/clean_data.csv")
    

In [19]:
def loss_function(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    return rmse

In [20]:
class DecisionTreesModel:
    
    def __init__(self, X_train, X_test, y_train, y_test, max_depth=5):
        
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        
        self.clf = DecisionTreeClassifier(max_depth=4)
        
    def train(self, folds=1):
        
        kf = KFold(n_splits = folds)
        
        iterator = kf.split(self.X_train)
        
        loss_arr = []
        acc_arr = []
        for i in range(folds):
            train_index, valid_index = next(iterator)
            
            X_train, y_train = self.X_train.iloc[train_index], self.y_train.iloc[train_index]
            X_valid, y_valid = self.X_train.iloc[valid_index], self.y_train.iloc[valid_index]
                        
            self.clf = self.clf.fit(X_train, y_train)
            
            vali_pred = self.clf.predict(X_valid)
            
            accuracy = self.calculate_score(y_valid
                                              , vali_pred)
            
            loss = loss_function(y_valid, vali_pred)
            
            self.__printAccuracy(accuracy, i, label="Validation")
            self.__printLoss(loss, i, label="Validation")
            print()
            
            acc_arr.append(accuracy)
            loss_arr.append(loss)

            
        return self.clf, acc_arr, loss_arr
    
    def test(self):
        
        y_pred = self.clf.predict(self.X_test)
        
        accuracy = self.calculate_score(y_pred, self.y_test)
        self.__printAccuracy(accuracy, label="Test")
        
        report = self.report(y_pred, self.y_test)
        matrix = self.confusion_matrix(y_pred, self.y_test)
        
        loss = loss_function(self.y_test, y_pred)
        
        return accuracy, loss,  report, matrix
    
    def get_feature_importance(self):
        importance = self.clf.feature_importances_
        fi_df = pd.DataFrame()
        
        fi_df['feature'] = self.X_train.columns.to_list()
        fi_df['feature_importances'] = importance
        
        return fi_df
    
    def __printAccuracy(self, acc, step=1, label=""):
        print(f"step {step}: {label} Accuracy of DecisionTreesModel is: {acc:.3f}")
    
    def __printLoss(self, loss, step=1, label=""):
        print(f"step {step}: {label} Loss of DecisionTreesModel is: {loss:.3f}")
    
    def calculate_score(self, pred, actual):
        return metrics.accuracy_score(actual, pred)
    
    def report(self, pred, actual):
        print("Test Metrics")
        print("================")
        print(metrics.classification_report(pred, actual))
        return metrics.classification_report(pred, actual)
    
    def confusion_matrix(self, pred, actual):
        ax=sns.heatmap(pd.DataFrame(metrics.confusion_matrix(pred, actual)))
        plt.title('Confusion matrix')
        plt.ylabel('Actual')
        plt.xlabel('Predicted')
        return metrics.confusion_matrix(pred, actual)


        
    
    
    

In [32]:
from typing_extensions import Protocol
from typing import TypeGuard
platform_df = get_data('enc-platform-df')

browser_df = get_data('enc-browser-df')


ImportError: cannot import name 'TypeGuard' from 'typing' (C:\Users\Maelaf ES\anaconda3\lib\typing.py)

In [33]:
print("1. Encoded Dataframe containing the the platfrom column")
platform_df.head()

1. Encoded Dataframe containing the the platfrom column


NameError: name 'platform_df' is not defined

In [34]:
print("2. Encoded Dataframe containing the the browser column")
browser_df.head()


2. Encoded Dataframe containing the the browser column


NameError: name 'browser_df' is not defined

In [96]:
# import mlflow
# import datetime
# Current_Date = datetime.datetime.today()

# mlflow.set_experiment('ML_Approach_ABTEST-' + str(Current_Date))

In [97]:
# feature_cols = ["experiment", "hour", "date", 'device_make', "platform_os",  "browser"]
feature_cols = ["experiment", "hour", "date", 'device_make', 'browser']

X = browser_df[feature_cols]
y = browser_df[['aware']]

test_size = 0.1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

NameError: name 'browser_df' is not defined

In [None]:
decisionTreesModel = DecisionTreesModel(X_train, X_test,  y_train, y_test)

folds = 5
clf, acc_arr, loss_arr = decisionTreesModel.train(folds)

write_model('browser_decision_tree_model', clf)

In [None]:

test_acc, loss, report, confusion_matrix = decisionTreesModel.test()
print(f"Loss on test data is: {loss:.3f}")
print(f"Test accuracy on test data is: {test_acc:.3f}")

print()


In [None]:
decisionTreesModel.get_feature_importance()

In [None]:
dot_data = StringIO()

leaves_parallel=False
feature_cols = ["experiment", "hour", "date", 'device_make', 'browser']
out_put_file = "AbTestDecisionTree.dot"

dot_data = export_graphviz(clf, out_file=out_put_file,  
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols,class_names=['Aware','Not Aware'])

graph = pydotplus.graphviz.graph_from_dot_file(out_put_file)
graph.write_png('AbTestDecisionTree.png')

Image(graph.create_png())


In [None]:
# feature_cols = ["experiment", "hour", "date", 'device_make', "platform_os",  "browser"]
feature_cols = ["experiment", "hour", "date", 'device_make', 'platform_os']

X = platform_df[feature_cols]
y = platform_df[['aware']]

test_size = 0.1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

In [None]:
decisionTreesModel = DecisionTreesModel(X_train, X_test,  y_train, y_test)

folds = 5
clf, acc_arr, loss_arr = decisionTreesModel.train(folds)

write_model('platform_os_decision_tree_model', clf)

In [None]:
test_acc, loss, report, confusion_matrix = decisionTreesModel.test()
print(f"Loss on test data is: {loss:.3f}")
print()

In [None]:
decisionTreesModel.get_feature_importance()

In [None]:
dot_data = StringIO()

leaves_parallel=False
feature_cols = ["experiment", "hour", "date", 'device_make', 'browser']
out_put_file = "AbTestDecisionTree.dot"

dot_data = export_graphviz(clf, out_file=out_put_file,  
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols,class_names=['Aware','Not Aware'])

graph = pydotplus.graphviz.graph_from_dot_file(out_put_file)
graph.write_png('AbTestDecisionTree.png')
Image(graph.create_png())

In [None]:
class LogesticRegressionModel:
    
    def __init__(self, X_train, X_test, y_train, y_test, model_name="LR"):
        
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.model_name = model_name
        
        self.clf = LogisticRegression()
        
    def train(self, folds=1):
        
        kf = KFold(n_splits = folds)
        
        iterator = kf.split(self.X_train)
        
        loss_arr = []
        acc_arr = []
        model_name= self.model_name
#         mlflow.end_run()
        for i in range(folds):

            train_index, valid_index = next(iterator)

            X_train, y_train = self.X_train.iloc[train_index], self.y_train.iloc[train_index]
            X_valid, y_valid = self.X_train.iloc[valid_index], self.y_train.iloc[valid_index]

            self.clf = self.clf.fit(X_train, y_train)

            vali_pred = self.clf.predict(X_valid)

            accuracy = self.calculate_score(y_valid, vali_pred)
            loss = loss_function(y_valid, vali_pred)

            self.__printAccuracy(accuracy, i, label="Validation")
            self.__printLoss(loss, i, label="Validation")
            print()

            acc_arr.append(accuracy)
            loss_arr.append(loss)
            
        return self.clf, acc_arr, loss_arr
    
    def test(self):
        y_pred = self.clf.predict(self.X_test)
        
        accuracy = self.calculate_score(self.y_test, y_pred)
        self.__printAccuracy(accuracy, label="Test")
        
        report = self.report(y_pred, self.y_test)
        matrix = self.confusion_matrix(y_pred, self.y_test)
        loss = loss_function(self.y_test, y_pred)
        
        return accuracy, loss, report, matrix 
    
    def __printAccuracy(self, acc, step=1, label=""):
        print(f"step {step}: {label} Accuracy of LogesticRegression is: {acc:.3f}")
    
    def __printLoss(self, loss, step=1, label=""):
        print(f"step {step}: {label} Loss of LogesticRegression is: {loss:.3f}")
    
    def calculate_score(self, pred, actual):
        return metrics.accuracy_score(actual, pred)
    
    def report(self, pred, actual):
        print("Test Metrics")
        print("================")
        print(metrics.classification_report(pred, actual))
        return metrics.classification_report(pred, actual)
    
    def confusion_matrix(self, pred, actual):
        ax=sns.heatmap(pd.DataFrame(metrics.confusion_matrix(pred, actual)))
        plt.title('Confusion matrix')
        plt.ylabel('Actual')
        plt.xlabel('Predicted')
        return metrics.confusion_matrix(pred, actual)
    
    def get_p_values(self):
        """ 
        Calcualting p_values for logestic regression.
        code refered from the following link
        https://gist.github.com/rspeare/77061e6e317896be29c6de9a85db301d
        
        """
        denom = (2.0*(1.0+np.cosh(self.clf.decision_function(X))))
        denom = np.tile(denom,(X.shape[1],1)).T
        F_ij = np.dot((X/denom).T,X) ## Fisher Information Matrix
        Cramer_Rao = np.linalg.inv(F_ij) ## Inverse Information Matrix
        sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
        z_scores = self.clf.coef_[0]/sigma_estimates # z-score 
        p_values = [stat.norm.sf(abs(x)) for x in z_scores] ### two tailed test for p-values
        
        p_df = pd.DataFrame()
        p_df['features'] = self.X_train.columns.to_list()
        p_df['p_values'] = p_values
        
        return p_df
    
    def plot_pvalues(self, p_df):
        
        fig, ax = plt.subplots(figsize=(12,7))

        ax.plot([0.05,0.05], [0.05,5])
        sns.scatterplot(data=p_df, y='features', x='p_values', color="green")
        plt.title("P values of features", size=20)

        plt.xticks(np.arange(0,max(p_df['p_values']) + 0.05, 0.05))

        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)

        plt.show()
        return fig

In [None]:
feature_cols = ["experiment", "hour", "date", 'device_make', 'browser']

X = browser_df[feature_cols]
y = browser_df[['aware']]

test_size = 0.1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

In [None]:
logesticRegressionModel = LogesticRegressionModel(X_train, X_test,  y_train, y_test)

folds = 5
clf2, loss_arr_2, acc_arr_2 = logesticRegressionModel.train(folds)

write_model('browser_Logestic_Reg_model', clf2)


In [None]:
test_acc2, test_loss2, report2, matrix2  = logesticRegressionModel.test()

In [None]:
p_values_df = logesticRegressionModel.get_p_values()
p_value_fig = logesticRegressionModel.plot_pvalues(p_values_df)
p_values_df

In [None]:
feature_cols = ["experiment", "hour", "date", 'device_make', 'platform_os']

X = platform_df[feature_cols]
y = platform_df[['aware']]

test_size = 0.1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

In [None]:
logesticRegressionModel = LogesticRegressionModel(X_train, X_test,  y_train, y_test)

folds = 5
clf2, loss_arr_2, acc_arr_2 = logesticRegressionModel.train(folds)

write_model('platform_os_Logestic_Reg_model', clf2)


In [None]:
test_acc2, test_loss2, report2, matrix2  = logesticRegressionModel.test()

In [None]:
p_values_df = logesticRegressionModel.get_p_values()
p_value_fig = logesticRegressionModel.plot_pvalues(p_values_df)
p_values_df

In [None]:
class XGBClassifierModel:
    
    def __init__(self, X_train, X_test, y_train, y_test, max_depth=5):
        
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        
        self.clf = GradientBoostingClassifier()

        
    def train(self, folds=1):
        
        kf = KFold(n_splits = folds)
        
        iterator = kf.split(self.X_train)
        
        loss_arr = []
        acc_arr = []
        for i in range(folds):
            train_index, valid_index = next(iterator)
            
            X_train, y_train = self.X_train.iloc[train_index], self.y_train.iloc[train_index]
            X_valid, y_valid = self.X_train.iloc[valid_index], self.y_train.iloc[valid_index]
                        
            self.clf = self.clf.fit(X_train, y_train)
            
            vali_pred = self.clf.predict(X_valid)
            
            accuracy = self.calculate_score(y_valid
                                              , vali_pred)
            
            loss = loss_function(y_valid, vali_pred)
            
            self.__printAccuracy(accuracy, i, label="Validation")
            self.__printLoss(loss, i, label="Validation")
            print()
            
            acc_arr.append(accuracy)
            loss_arr.append(loss)

            
        return self.clf, acc_arr, loss_arr
    
    def test(self):
        
        y_pred = self.clf.predict(self.X_test)
        
        accuracy = self.calculate_score(y_pred, self.y_test)
        self.__printAccuracy(accuracy, label="Test")
        
        report = self.report(y_pred, self.y_test)
        matrix = self.confusion_matrix(y_pred, self.y_test)
        
        loss = loss_function(self.y_test, y_pred)
        
        return accuracy, loss,  report, matrix
    
    def get_feature_importance(self):
        importance = self.clf.feature_importances_
        fi_df = pd.DataFrame()
        
        fi_df['feature'] = self.X_train.columns.to_list()
        fi_df['feature_importances'] = importance
        
        return fi_df
    
    def __printAccuracy(self, acc, step=1, label=""):
        print(f"step {step}: {label} Accuracy of DecisionTreesModel is: {acc:.3f}")
    
    def __printLoss(self, loss, step=1, label=""):
        print(f"step {step}: {label} Loss of DecisionTreesModel is: {loss:.3f}")
    
    def calculate_score(self, pred, actual):
        return metrics.accuracy_score(actual, pred)
    
    def report(self, pred, actual):
        print("Test Metrics")
        print("================")
        print(metrics.classification_report(pred, actual))
        return metrics.classification_report(pred, actual)
    
    def confusion_matrix(self, pred, actual):
        ax=sns.heatmap(pd.DataFrame(metrics.confusion_matrix(pred, actual)))
        plt.title('Confusion matrix')
        plt.ylabel('Actual')
        plt.xlabel('Predicted')
        return metrics.confusion_matrix(pred, actual)


In [None]:
# feature_cols = ["experiment", "hour", "date", 'device_make', "platform_os",  "browser"]
feature_cols = ["experiment", "hour", "date", 'device_make', 'browser']

X = browser_df[feature_cols]
y = browser_df[['aware']]

test_size = 0.1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

In [None]:
xGBClassifierModel = XGBClassifierModel(X_train, X_test,  y_train, y_test)

folds = 5
clf3, acc_arr, loss_arr = xGBClassifierModel.train(folds)

write_model('platform_os_XGBoost_model', clf3)

In [None]:
test_acc, loss, report, confusion_matrix = xGBClassifierModel.test()
print(f"Loss on test data is: {loss:.3f}")
print()

In [None]:
xGBClassifierModel.get_feature_importance()

In [None]:
from sklearn.model_selection import GridSearchCV

params = {'criterion': ['gini','entropy'], 'max_depth':[4,5,6,7,8,9,10]}

kfold = KFold(n_splits=5)
gridSearch = GridSearchCV(estimator=clf, param_grid=params, n_jobs=-1,  cv=kfold, scoring="neg_root_mean_squared_error")


import mlflow
import datetime
Current_Date = datetime.datetime.today()

mlflow.set_experiment('DecisionTree-' + str(Current_Date))
mlflow.sklearn.autolog()
with mlflow.start_run(run_name='DT-Hyperparameter') as run:
        searchResults = gridSearch.fit(X_train, y_train)
        
        pred=searchResults.predict(X_test)
        loss=loss_function(y_test,pred)
        acc = metrics.accuracy_score(y_test, pred)
        
        mlflow.log_param('Features', X_train.columns.to_list())
        mlflow.log_param('Target', y_train.columns.to_list())
        mlflow.log_param('Number Of Training Dataset', X_train.shape[0])
        mlflow.log_param('Number Of Test Dataset', X_test.shape[0])
        mlflow.log_param('Fold number', folds)
                      
        

        mlflow.log_metric("loss", loss)
        mlflow.log_metric("accuracy", acc)



best_dt_Model = searchResults.best_estimator_

In [None]:
from sklearn.model_selection import GridSearchCV

params = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
kfold = KFold(n_splits=5)

mlflow.sklearn.autolog()
gridSearch = GridSearchCV(estimator=clf2, param_grid=params, n_jobs=-1,  cv=kfold, scoring="neg_root_mean_squared_error")

mlflow.set_experiment('LogisticRegression-' + str(Current_Date))
with mlflow.start_run(run_name='LR-Hyperparameter') as run:
        searchResults = gridSearch.fit(X_train, y_train)
        
        pred=searchResults.predict(X_test)
        loss=loss_function(y_test,pred)
        acc = metrics.accuracy_score(y_test, pred)
        
        mlflow.log_param('Features', X_train.columns.to_list())
        mlflow.log_param('Target', y_train.columns.to_list())
        mlflow.log_param('Number Of Training Dataset', X_train.shape[0])
        mlflow.log_param('Number Of Test Dataset', X_test.shape[0])
        mlflow.log_param('Fold number', folds)

        mlflow.log_metric("loss", loss)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_figure(p_value_fig, 'p_values.png')



best_lr_model = searchResults.best_estimator_

In [None]:
from sklearn.model_selection import GridSearchCV

params = {'n_estimators': [20, 40, 60, 80]}

kfold = KFold(n_splits=5)
gridSearch = GridSearchCV(estimator=clf3, param_grid=params, n_jobs=-1,  cv=kfold, scoring="neg_root_mean_squared_error")


import mlflow
import datetime
Current_Date = datetime.datetime.today()

mlflow.set_experiment('XGBoost-' + str(Current_Date))
mlflow.sklearn.autolog()
with mlflow.start_run(run_name='XGBoost-Hyperparameter') as run:
        searchResults = gridSearch.fit(X_train, y_train)
        
        pred=searchResults.predict(X_test)
        loss=loss_function(y_test,pred)
        acc = metrics.accuracy_score(y_test, pred)
        
        mlflow.log_param('Features', X_train.columns.to_list())
        mlflow.log_param('Target', y_train.columns.to_list())
        mlflow.log_param('Number Of Training Dataset', X_train.shape[0])
        mlflow.log_param('Number Of Test Dataset', X_test.shape[0])
        mlflow.log_param('Fold number', folds)
                      
        

        mlflow.log_metric("loss", loss)
        mlflow.log_metric("accuracy", acc)



best_dt_Model = searchResults.best_estimator_