# In [None]:
class ml_model_plotter():
    '''
    Static plotting helpers for the results of a machine learning model
    '''
    #####
    @staticmethod
    def train_val_plot(ml_model, figsize = (14, 6)):
        '''
        Line plot comparing the training scores with the validation scores, one point per round.
        args:
        ml_model: object exposing the attributes train_scores and val_scores
        figsize: size of the figure
        returns: the figure that holds the plot
        '''
        figure = plt.figure(figsize = figsize)
        sns.set_theme()

        # One line per series; the legend below names them in the same order
        score_series = [ml_model.train_scores, ml_model.val_scores]
        sns.lineplot(data = score_series, markers = True, dashes = False)

        plt.xlabel("Round")
        plt.ylabel("Score")
        plt.legend(["Train score", "Validation score"])

        return figure

    #####
    @staticmethod
    def test_metrics(ml_model, figsize = (12, 12)):
        '''
        Draws a 2x2 grid of annotated heatmaps built from the confusion matrix stored in ml_model.cm:
        raw counts, percentage of the grand total, percentage of each row total and percentage of each column total.
        args:
        ml_model: object exposing the attribute cm (presumably a 2-D numpy array - it must support sum(axis, keepdims))
        figsize: size of the figure
        returns: the figure that holds the four heatmaps
        '''
        cm = ml_model.cm

        # Totals kept as 2-D (keepdims) so that the divisions below broadcast row-wise / column-wise
        per_row_total = cm.sum(axis = 1, keepdims = True)
        per_column_total = cm.sum(axis = 0, keepdims = True)

        pct_of_row = (cm / per_row_total) * 100
        pct_of_column = (cm / per_column_total) * 100

        figure, grid = plt.subplots(2, 2, figsize = figsize, sharex = True, sharey = True)

        blue_cmap = sns.color_palette("light:b", as_cmap=True)
        green_cmap = sns.light_palette("seagreen", as_cmap=True)

        # (axis, values, colormap, which axis label to set, title) - top row first, then bottom row
        panels = [
            (grid[0, 0], cm, blue_cmap, "y", "Confusion matrix"),
            (grid[0, 1], (cm / cm.sum()) * 100, blue_cmap, "y", "Confusion matrix - relative"),
            (grid[1, 0], pct_of_row, green_cmap, "x", "Relative to row sum (Recall)"),
            (grid[1, 1], pct_of_column, green_cmap, "x", "Relative to col sum (Precision)"),
        ]

        for axis, values, cmap, labelled_side, title in panels:
            sns.heatmap(values, annot = True, linewidths = .1, cmap = cmap, ax = axis, cbar = False, fmt = "g")
            # Top panels name the y axis, bottom panels name the x axis
            if labelled_side == "y":
                axis.set_ylabel("Actual class")
            else:
                axis.set_xlabel("Predicted class")
            axis.set_title(title)

        return figure

# In [None]:
class eda_plotter():
    '''
    Static plotting helpers for exploratory data analysis: one plot per dataframe column, plus a correlation heatmap.
    '''
    #####
    @staticmethod
    def __n_rows(df, n_columns):
        '''
        It calculates the number of rows (for the axes) depending on the number of variables to plot and the columns we want for the figure.
        args:
        df: dataframe whose columns are the variables to plot
        n_columns: number of columns
        returns: number of rows needed to fit every variable
        '''
        # Ceiling division: a partially filled last row still needs a full row of axes
        return -(-len(df.columns) // n_columns)

    #####
    @staticmethod
    def __draw_variable(series, ax, kind, features_names, position):
        '''
        It draws one variable on one axis and labels the x axis with its feature name.
        args:
        series: values of the variable
        ax: axis to draw on
        kind: "strip", "dist" or "box"; any other value draws a 30-bin histogram
        features_names: names indexed by position, used as x labels
        position: position of the variable within the dataframe / features_names
        '''
        if kind == "strip":
            sns.stripplot(y = series, ax = ax)
        elif kind == "dist":
            # NOTE(review): sns.distplot is flagged as deprecated by recent seaborn releases - confirm against the installed version
            sns.distplot(series, ax = ax)
        elif kind == "box":
            sns.boxplot(series, ax = ax)
        else:
            sns.histplot(series, ax = ax, bins = 30)

        # Labelling is best-effort: a missing or too short features_names must not abort the plot
        try:
            ax.set(xlabel = features_names[position])
        except (IndexError, KeyError, TypeError):
            pass

    #####
    @staticmethod
    def rows_plotter(df, features_names, n_columns, kind = "box", figsize = (12, 6)):
        '''
        It plots the first n_columns variables of the dataframe in one row. It returns a figure
        args:
        df: dataframe with one variable per column
        features_names: names used as x labels, in the same order as the dataframe columns
        n_columns: number of columns for the row
        kind: ("strip", "dist", "box"); any other value draws a histogram
        figsize: size of the figure
        '''
        # squeeze = False keeps a 2-D array of axes even when n_columns is 1
        fig, axes = plt.subplots(1, n_columns, figsize = figsize, squeeze = False)

        # zip stops at whichever runs out first: variables or axes
        for position, ax in zip(range(df.shape[1]), axes[0]):
            eda_plotter.__draw_variable(df.iloc[:, position], ax, kind, features_names, position)

        return fig

    #####
    @staticmethod
    def multi_axes_plotter(df, features_names, n_columns, kind = "box", figsize = (12, 12)):
        '''
        It creates a plot with multiple rows and columns, one variable per axis, filled row by row. It returns a figure.
        args:
        df: dataframe with one variable per column
        features_names: names used as x labels, in the same order as the dataframe columns
        n_columns: number of columns for the row
        kind: ("strip", "dist", "box"); any other value draws a histogram
        figsize: size of the figure
        '''
        # Calculating the number of rows from number of columns and variables to plot
        n_rows_ = eda_plotter.__n_rows(df, n_columns)

        # squeeze = False always gives a 2-D array of axes, so grids with a single row or a single column work too
        fig, axes = plt.subplots(n_rows_, n_columns, figsize = figsize, squeeze = False)

        # axes.flat walks the grid row by row; spare axes at the end stay empty
        for position, ax in zip(range(df.shape[1]), axes.flat):
            eda_plotter.__draw_variable(df.iloc[:, position], ax, kind, features_names, position)

        return fig

    #####
    @staticmethod
    def correlation_matrix(df, features_names, figsize = (12, 12)):
        '''
        It plots the correlation matrix of the dataframe as an annotated heatmap. It returns a figure
        args:
        df: dataframe to compute the correlations from
        features_names: labels for the y axis (the x axis is left without tick labels)
        figsize: size of the figure
        '''
        fig = plt.figure(figsize = figsize)
        sns.heatmap(df.corr(), annot = True, linewidths = .1,
                    cmap = "Blues", xticklabels = False,
                    yticklabels = features_names, cbar = False)

        return fig

# In [None]:
import pandas as pd
import numpy as np

import re
from varname import nameof

import requests
from bs4 import BeautifulSoup
import html
import lxml

import json
import joblib

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

from imblearn.over_sampling import SMOTE

import sys, os

class dataset:
    '''
    Object that will hold information about dataframe as well as do some useful transformations and save a copy in case we need to go back to the unprocessed version of the dataframe
    '''
    def __init__(self):
        # Raw data
        self.__dfs_list = []
        self.__joined_dfs = {}
        self.__raw_df = None
        self.df = None

        # Processed data for ML
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.kfold = None

    ######### DATA PROCESSING #########
    #########
    def __read_data(self, data_path):
        '''
        It reads all the files from a folder as dataframes, and saves them all in a dict with the name of the file as a key.
        args:
        up_levels: steps to go up from current folder
        folder: where the files are located
        '''
        data_dfs = {}
        for file_ in os.listdir(data_path):
            if file_ != "history":
                try:
                    # Path to file
                    filepath = data_path + sep + file_

                    # Reading as dataframe
                    df = pd.read_csv(filepath, index_col = 0)
                    df["SEQN"] = df["SEQN"].map(int)
                    df.set_index("SEQN", inplace = True)

                    # Saving it in a dictionary
                    dict_key = file_[:-4].lower()
                    data_dfs[dict_key] = df
                except:
                    pass

        return data_dfs

    #########
    def __read_all_data(self, data_path, folders):
        '''
        It does the same as __read_data but for several folders at the same time
        args: same as __read_data
        '''
        for folder in folders:
            folder_path = data_path + folder
            self.__dfs_list.append(self.__read_data(folder_path))

    #########
    def __concatenate_dfs(self, data_dfs):
        '''
        It receives a dict of dataframes and combines them by name
        args:
        data_dfs: dict with filename as key and dataframe as value
        '''
        files = {}
        count = 0

        for key, dfs in data_dfs.items():
            key_ = key[:-2]

            if count == 0:
                files[key_] = dfs
            else:
                if key_ not in files.keys():
                    files[key_] = dfs
                else:
                    files[key_] = pd.concat([files[key_], dfs])

            count +=1

        return files

    #########
    def __concatenate_all_dfs(self):
        '''
        It does the same as __concatenate_dfs but for multiple dicts
        '''
        for data_dfs in self.__dfs_list:
            files = self.__concatenate_dfs(data_dfs)
            self.__joined_dfs = {**self.__joined_dfs, **files}


    #########
    def __merge_dfs(self):
        '''
        It combines all dfs processed into one
        '''
        keys = list(self.__joined_dfs.keys())
        self.df = self.__joined_dfs.pop(keys[0])

        for name, df in self.__joined_dfs.items():
            self.df = pd.merge(self.df, df, how = "outer", on = "SEQN")
            
    #########
    def __clean_rows(self):
        '''
        It removes values (rows) of no interest for specific columns. Values such as 7 or 9 that represent either "No answer" or "No info"
        '''
        important_values = [7.0, 9.0]
        # Asthma
        self.df = self.df[~self.df.MCQ010.isin(important_values)]
        # Heart problems
        self.df = self.df[~self.df.MCQ160B.isin(important_values)]
        self.df = self.df[~self.df.MCQ160C.isin(important_values)]
        self.df = self.df[~self.df.MCQ160D.isin(important_values)]
        self.df = self.df[~self.df.MCQ160E.isin(important_values)]
        self.df = self.df[~self.df.MCQ160F.isin(important_values)]

    def __update_target_values(self):
        '''
        It replaces the 2s with 0s for potential target variables
        '''
        self.df.MCQ010 = self.df.MCQ010.replace(2, 0)
        self.df.MCQ160B = self.df.MCQ160B.replace(2, 0)
        self.df.MCQ160C = self.df.MCQ160C.replace(2, 0)
        self.df.MCQ160D = self.df.MCQ160D.replace(2, 0)
        self.df.MCQ160E = self.df.MCQ160E.replace(2, 0)
        self.df.MCQ160F = self.df.MCQ160F.replace(2, 0)

    #########
    def __clean_columns(self, correction_map):
        '''
        It removes duplicated columns.
        args:
        correction_map: dict which keys are the columns to rename and the values are the new names for those columns
        '''
        to_drop = [key[:-2] + "_y" for key in correction_map.keys()]
        self.df = self.df.drop(to_drop, axis = 1)
        self.df = self.df.rename(columns = correction_map)

    #########
    def __heart_disease(self):
        '''
        It creates a new column using all cardiovascular-related ones as source. The objective is to have a new column where we can see if the participant has any kind of heart disease.
        '''
        # We create the column and fill it in with NaN values, as the initial status (with no information) is that we don't know whether someone has o doesn't have a coronary disease
        self.df["MCQ160H"] = np.nan

        # Conditions to filter by any heart disease
        pos_cond_b = self.df.MCQ160B == 1
        pos_cond_c = self.df.MCQ160C == 1
        pos_cond_d = self.df.MCQ160D == 1
        pos_cond_e = self.df.MCQ160E == 1
        pos_cond_f = self.df.MCQ160F == 1

        # For those participants we do have the info for and we know they don't have any coronary disease
        neg_cond_b = self.df.MCQ160B == 0
        neg_cond_c = self.df.MCQ160C == 0
        neg_cond_d = self.df.MCQ160D == 0
        neg_cond_e = self.df.MCQ160E == 0
        neg_cond_f = self.df.MCQ160F == 0

        # Given the positive conditions, place a "1" in the column if they are matched
        self.df.loc[(pos_cond_b) | (pos_cond_c) | (pos_cond_d) | (pos_cond_e) | (pos_cond_f), "MCQ160H"] = 1
        # Given the negative conditions, place a "0" in the column if they are matched
        self.df.loc[(neg_cond_b) & (neg_cond_c) & (neg_cond_d) & (neg_cond_e) & (neg_cond_f), "MCQ160H"] = 0

    #########
    def load_data(self, data_path, folders, correction_map):
        '''
        It combines all previous steps to get clean and ready-to-use data
        '''
        self.__read_all_data(data_path, folders)
        self.__concatenate_all_dfs()
        self.__merge_dfs()
        self.__clean_rows()
        self.__update_target_values()
        self.__clean_columns(correction_map)
        self.__heart_disease()
        # Dataset backup
        self.__raw_df = self.df
    
    ######### SUPPORT FUNCTIONS #########
    #########
    def filter_columns(self, features, inplace = False):
        '''
        It filters the dataframe.
        args:
        features: columns we want to filter by
        inplace: default = False. If True, it will modify the dataframe within the object.
        '''
        if inplace:
            self.df = self.df.loc[:, features]
        else:
            return self.df.loc[:, features]

    #########
    def drop_columns(self, columns):
        '''
        To drop columns
        '''
        self.df = self.df.drop(columns, axis = 1)

    #########
    def drop_nans(self):
        '''
        To drop nans
        '''
        self.df = self.df.dropna()

    #########
    def dummies_transform(self, variable, mapper):
        '''
        Transforms categorical variables into dummies.
        args:
        variable: target column to be transformed
        mapper: To preprocess the values before transforming the column into dummies.
        '''
        # Mapping values
        self.df.loc[:, variable] = self.df.loc[:, variable].map(mapper)
        # Getting dummies
        self.df = pd.get_dummies(self.df, prefix = "", prefix_sep = "", columns = [variable])
        #return df

    #########
    def __pair_mean(self, pair_list, new_name, drop_old = False):
        '''
        It creates a new column by calculating the mean of two other.
        args:
        pair_list: columns to calculate the mean of
        new_name: name for the new column
        drop_old: set to False by default. If True, it will remove the columns we used to calculated the mean of
        '''
        self.df[new_name] = self.df.loc[:, pair_list].mean(axis = 1)
        
        if drop_old:
            self.df = self.df.drop(pair_list, axis = 1)

    #########
    def pairs_mean(self, combination_list, drop_old = False):
        '''
        It does the same as __pair_mean but for several pairs at once.
        args:
        combination_list: [[var1, var2], new_var]
        drop_old: By default set to False. If True, it will remove the variables used to calculated the mean.
        '''
        for combination in combination_list:
            self.__pair_mean(combination[0], combination[1], drop_old = drop_old)

    #########
    def reset_dataset(self):
        '''
        In case we want to restore the dataset to its first status (when used load_data method)
        '''
        self.df = self.__raw_df

    #########
    def model_data(self, split, cv, epochs = 1, scaler = False, balance = None, seed = 42):
        '''
        It allows us to prepare the data for Machine Learning training. The first column of self.df is used as the dependent variable and the remaining ones as independent variables.
        It fills in X_train, X_test, y_train, y_test and kfold.
        args:
        split: test_size passed to train_test_split
        cv: number of splits for the repeated stratified k-fold
        epochs: number of repetitions (n_repeats) of the k-fold
        scaler: if True, the independent variables are standardised with StandardScaler
        balance: sampling_strategy for SMOTE; None (default) means no balancing
        seed: random state shared by the split, SMOTE and the k-fold
        '''
        # Independent variables
        X = np.array(self.df.iloc[:, 1:])

        # Dependent variable
        y = np.array(self.df.iloc[:, 0])

        # Train-test
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split, random_state = seed)

        # Data scaling: fitted on the training part only, so that no information of the test set leaks into it
        if scaler:
            standardizer = StandardScaler()
            X_train = standardizer.fit_transform(X_train)
            X_test = standardizer.transform(X_test)

        # Balancing data (training part only)
        if balance is not None:
            # NOTE(review): n_jobs is no longer passed - recent imbalanced-learn releases deprecate it on SMOTE; confirm against the installed version
            sm = SMOTE(sampling_strategy = balance, random_state = seed)
            X_train, y_train = sm.fit_resample(X_train, y_train)

        self.X_train, self.X_test, self.y_train, self.y_test = X_train, X_test, y_train, y_test

        # Cross validation
        self.kfold = RepeatedStratifiedKFold(n_splits = cv, n_repeats = epochs, random_state = seed)