In [1]:
# Loading library
import pandas as pd
import numpy as np
import itertools

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, ShuffleSplit
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm

import xgboost as xgb
from ucimlrepo import fetch_ucirepo 

In [2]:
# Fetch the dataset with UCI repository id 45 through ucimlrepo.
# NOTE(review): the variable name says "concrete compressive strength", but the
# columns used by the `test` class below (age, chol, thal, num, ...) look like
# the UCI Heart Disease dataset -- confirm the id / name pairing.
concrete_compressive_strength = fetch_ucirepo(id=45)

# Frame handed to `test(...)`; the class treats its last column as the label
# (see `get_dict`) -- presumably features plus target in one frame; verify.
original_df = concrete_compressive_strength.data.original

In [1]:
class test():
    def __init__(self, df):

        self.df = df

        self.performance, self.test_performance = None, None

        self.ratio_list = [1/15, 1/10, 1/5]
        mechans = ["MNAR", "MCAR", "MAR"]
        self.metrics = ["accuracy", "precision", "recall", "f1"]
        
        self.mechan = "Default"

        self.mar_dep_col, self.mar_tar_col, self.mnar_tar_col = "ca", "thal", "cp"
        self.ratio_dict = {"ratio_mcar": 1/30, "ratio_mar": 1/30, "ratio_mnar": 1/30}
        
        self.mul_idx = pd.MultiIndex.from_product(
            [mechans, [f"{r:.3f}" for r in self.ratio_list]],
             names=["mechans", "ratios"]
        )

        # Baseline Performance
        print("Initializing... (Mechan: Default)")
        self.get_dict()
        self.preproc()
        self.baseperformance()
        self.insert_null() # Create missing values first
        self.impute() # Impute the generated missing data frame
        self.cv_eval()
        self.test_eval()
        print("Done")
    
    def preproc(self):
        df = self.df.copy()
        df.loc[df[(df['num'] > 0)].index.tolist(),'num'] = 1

        dropped_df = df.dropna().astype(np.float64)
        num_cols = dropped_df[self.num_li]
        num_cols = (num_cols - num_cols.min()) / (num_cols.max() - num_cols.min())
        dropped_df.loc[:, self.num_li] = num_cols

        self.dropped_df = dropped_df

        X = self.dropped_df.drop('num', axis=1)
        y = self.dropped_df['num']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                            random_state=42)

        self.X_train = X_train.reset_index(drop=True)
        self.X_test = X_test.reset_index(drop=True)
        self.y_train = y_train.reset_index(drop=True)
        self.y_test = y_test.reset_index(drop=True)

    def get_dict(self):
        df = self.df.iloc[:,:-1]
        self.names = df.columns.to_list()

        self.names_dict = {}
        for i, n in enumerate(df.columns.to_list()): self.names_dict[n] = i
            
        self.num_li = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']
        self.cat_li = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

        self.num_dict = {n:self.names_dict[n] for n in self.num_li}
        self.cat_dict = {c:self.names_dict[c] for c in self.cat_li}

    def baseperformance(self, md=4, fold=10):
        """Report the XGBoost scores on the complete data (before any NaN is inserted).

        Prints a LaTeX table with the per-fold and average cross-validation
        scores on the training split, then fits ``self.clf`` on the whole
        training split and prints accuracy / precision / recall / F1 on the
        test split.

        Parameters
        ----------
        md : int
            ``max_depth`` passed to ``xgb.XGBClassifier``.
        fold : int
            Number of cross-validation folds; also used for the row labels and
            the table caption.
        """
        self.clf = xgb.XGBClassifier(max_depth=md)

        scores = cross_validate(self.clf, self.X_train,
                                self.y_train, cv=fold,
                                scoring=self.metrics)

        # Select the score arrays by their 'test_<metric>' key rather than by
        # position, so a column can never be paired with the wrong label.
        score_frame = pd.DataFrame(
            {f"cv_{m}": scores[f"test_{m}"] for m in self.metrics},
            index=[f"fold {n + 1}" for n in range(fold)],
        )

        # One-row frame with the column means; works for any number of metrics.
        avg_performance = score_frame.mean().to_frame().T
        avg_performance.index = ["Avg."]

        score_frame = pd.concat([score_frame, avg_performance])
        score_frame = score_frame.map(lambda x: f"{x * 100:.2f}%")

        s = score_frame.style
        s = s.format_index(escape="latex", axis=1)

        # The caption reports the fold count that was actually used.
        table_tail = (
            '\\end{tabular}\n\\caption{Model training performance table ('
            + f'{fold}'
            + ' folds Cross validation)}\n\\label{baseline_trainperf}'
        )
        res = s.to_latex(
            position_float="centering",
            hrules=True,
        ).replace('\\end{tabular}', table_tail)
        # Single quotes are stripped from the LaTeX output before printing.
        print(res.replace("'", ""))

        self.clf.fit(self.X_train, self.y_train)
        y_pred = self.clf.predict(self.X_test)

        # Evaluate on the test set
        accuracy  = accuracy_score(self.y_test, y_pred)
        precision = precision_score(self.y_test, y_pred)
        recall    = recall_score(self.y_test, y_pred)
        f1        = f1_score(self.y_test, y_pred)

        print(f"Baseline performance on test set:\n Accurarcy: {accuracy * 100: .2f}%, Precision: {precision * 100: .2f}%, Recall: {recall * 100: .2f}%, F1: {f1 * 100: .2f}%.")


    def insert_mcar(self, ratio, df, random_state=None):
        """Set a share of cells, chosen uniformly at random, to NaN (MCAR).

        Parameters
        ----------
        ratio : float
            Fraction of all cells (``df.size``) to blank out. The resulting
            count is clamped to the range [0, ``df.size``].
        df : pandas.DataFrame
            Frame to modify; it is changed in place.
        random_state : int, optional
            Seed for a private generator. When omitted, the global NumPy random
            state is used, so ``np.random.seed`` still controls the draw.

        Returns
        -------
        pandas.DataFrame
            The same ``df`` object with the selected cells set to NaN.
        """
        n_cols = df.shape[1]

        # Clamp the count: asking for more distinct cells than exist can never
        # be satisfied (rejection sampling would loop forever on ratio > 1).
        num_nulls = min(max(int(ratio * df.size), 0), df.size)
        if num_nulls == 0:
            return df

        rng = np.random if random_state is None else np.random.RandomState(random_state)

        # Draw all distinct flat cell positions in one call, then turn each
        # flat position back into (row, column).
        flat_positions = rng.choice(df.size, size=num_nulls, replace=False)
        rows, cols = divmod(flat_positions, n_cols)
        for row, col in zip(rows, cols):
            df.iat[row, col] = np.nan

        return df
    
    def insert_mar(self, ratio, df, dep_col, tar_col): 
        
        """
        Input:
            - The percentage of data MAR to be generated
            - The data frame
            - Dependent column name: str
            - Index of the target column to be inserted
        """

        # Get the index of entries to be remvoed
        target_idx = df[dep_col] < df[dep_col].quantile(ratio)

        # Remove the entries in the target column
        df.iloc[target_idx, self.names_dict[tar_col]] = np.nan
        
        return df

    def insert_mnar(self, ratio, df, tar_col): 

        """
        Input:
            - The percentage of data MNAR to be generated
            - The data frame
            - Target column name: str
            - Index of the target column to be inserted
        """
        
        # Get the index of entries to be remvoed
        target_idx = df[tar_col] < df[tar_col].quantile(ratio)
        
        # Remove the entries in the target column
        df.iloc[target_idx, self.names_dict[tar_col]] = np.nan
        
        return df

    def insert_null(self):
        """
            Input:
                - The data frame to be inserted
                - Ratios for each missing mechanism
        """
        # self.mechan = mechan if mechan is not None else None
        missing_df = self.X_train.copy()

        # Example
        missing_df = self.insert_mar(self.ratio_dict["ratio_mcar"], missing_df, self.mar_dep_col, self.mar_tar_col)
        missing_df = self.insert_mnar(self.ratio_dict["ratio_mnar"], missing_df, self.mnar_tar_col)
        self.missing_df = self.insert_mcar(self.ratio_dict["ratio_mar"], missing_df)

    def cv(self, md=4, fold=10):
        """Run ``xgb.cv`` on the training split and print the result table.

        Parameters
        ----------
        md : int
            ``max_depth`` of the boosted trees.
        fold : int
            Number of folds passed to ``xgb.cv`` as ``nfold``.
        """
        booster_params = dict(max_depth=md, eta=1, objective="binary:logistic")
        train_matrix = xgb.DMatrix(data=self.X_train, label=self.y_train)

        cv_table = xgb.cv(
            booster_params,
            train_matrix,
            num_boost_round=10,
            nfold=fold,
            metrics="error",
            seed=42,
            as_pandas=True,
        )

        # Scale by 100 so the formatter prints percentages.
        self.res_format(cv_table * 100)

    def impute(self):
    
        self.imputed_df = self.missing_df.copy()
        
        missing_df_nan_count = self.missing_df.isnull().sum()

        sort_index_li = np.argsort(missing_df_nan_count).tolist()

        # for i in tqdm(sort_index_li, desc=f"Imputing with MNAR ratio: {self.ratio_dict['ratio_mnar']:.3f}, MAR ratio: {self.ratio_dict['ratio_mar']:.3f}, MCAR ratio: {self.ratio_dict['ratio_mcar']:.3f}"):
        for i in sort_index_li:
            temp_df = self.imputed_df.copy()

            target = temp_df.iloc[:, i] # Get the target column

            temp_df = temp_df.iloc[:, temp_df.columns != i] # Kick it out of the data frame

            if target.name in self.num_li:
                self.num_li.remove(target.name)

                # Fill by respective mean
                temp_df[self.num_li] = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(temp_df[self.num_li])

                # Fill by respective mode
                temp_df_imputed = SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit_transform(temp_df)

                self.num_li.append(target.name)

            elif target.name in self.cat_li:
                self.cat_li.remove(target.name)
                temp_df[self.cat_li] = SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit_transform(temp_df[self.cat_li])
                temp_df_imputed = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(temp_df)
                self.cat_li.append(target.name)

            # Select out the training set
            y_train = target[target.notnull()]
            x_train = temp_df_imputed[y_train.index, :]

            # Select out the test set
            y_test = target[target.isnull()]
            x_test = temp_df_imputed[y_test.index, :]

            self.imputer = RandomForestRegressor()
            self.imputer.fit(x_train, y_train)
            y_predict = self.imputer.predict(x_test)

            self.imputed_df.iloc[y_test.index, i] = y_predict

    
    def cv_eval(self, ratio=0, md=4, fold=10):
        """Cross-validate a fresh XGBoost classifier on the imputed training data.

        Stores the fold-averaged scores in ``self.avg_performance``. On the
        first call with ``ratio == 0`` the result table ``self.performance`` is
        created (one NaN row per entry of ``self.mul_idx`` plus a "DEFAULT"
        row holding the scores in percent). On later calls with a non-zero
        ``ratio`` the row ``(self.mechan, ratio)`` is overwritten.

        Parameters
        ----------
        ratio : float
            Ratio label of the row to fill; ``0`` marks the baseline run.
        md : int
            ``max_depth`` of the classifier.
        fold : int
            Number of cross-validation folds.
        """
        self.clf = xgb.XGBClassifier(max_depth=md)
        cv_scores = cross_validate(self.clf, self.imputed_df, self.y_train,
                                   cv=fold, scoring=self.metrics)

        # The first two columns are dropped (presumably the fit/score timing
        # entries); the rest is averaged over the folds.
        self.avg_performance = pd.DataFrame(cv_scores).iloc[:, 2:].mean()

        # Baseline run: build the table and append the "DEFAULT" row.
        if self.performance is None and ratio == 0:
            self.performance = pd.DataFrame(
                np.nan,
                index=self.mul_idx,
                columns=[f'avg_cv_{m}' for m in self.metrics],
            )
            baseline_row = pd.DataFrame(
                {f'avg_cv_{m}': (self.avg_performance[f'test_{m}'] * 100) for m in self.metrics},
                index=[("DEFAULT", f"even {1/30:.3f}")],
            )
            self.performance = pd.concat([self.performance, baseline_row])
            return

        # Anything that is not a sweep step on an existing table is ignored.
        if self.performance is None or self.mechan is None or ratio == 0:
            return

        row_key = pd.Index([(self.mechan, f"{ratio:.3f}")])
        for m in self.metrics:
            self.performance.loc[row_key, f'avg_cv_{m}'] = (self.avg_performance[f'test_{m}'] * 100)

    def test_eval(self, ratio=0):
        """Fit ``self.clf`` on the imputed training data and score it on the test split.

        On the first call the table ``self.test_performance`` is created (one
        NaN row per entry of ``self.mul_idx`` plus a "DEFAULT" row with the
        scores in percent). On every later call the row
        ``(self.mechan, ratio)`` is overwritten with the new scores.

        Parameters
        ----------
        ratio : float
            Ratio label of the row to fill on calls after the first one.
        """
        self.clf.fit(self.imputed_df, self.y_train)
        predictions = self.clf.predict(self.X_test)

        # Same order as self.metrics: accuracy, precision, recall, f1.
        scorers = (accuracy_score, precision_score, recall_score, f1_score)
        values = np.array([scorer(self.y_test, predictions) for scorer in scorers])

        if self.test_performance is not None:
            row_key = pd.Index([(self.mechan, f"{ratio:.3f}")])
            self.test_performance.loc[row_key] = (values * 100)
            return

        # First call: build the table and append the "DEFAULT" row.
        self.test_performance = pd.DataFrame(
            np.nan,
            index=self.mul_idx,
            columns=[f'test_{m}' for m in self.metrics],
        )
        baseline_row = pd.DataFrame(
            {f'test_{m}': (v * 100) for m, v in zip(self.metrics, values)},
            index=[("DEFAULT", f"even {1/30:.3f}")],
        )
        self.test_performance = pd.concat([self.test_performance, baseline_row])
    
    def forward(self, mechan):

        self.mechan = mechan
        for r in self.ratio_list:

            if self.mechan == "MNAR":
                self.ratio_dict["ratio_mnar"] = r
            elif self.mechan == "MAR":
                self.ratio_dict["ratio_mar"] = r
            elif self.mechan == "MCAR":
                self.ratio_dict["ratio_mcar"] = r

            self.insert_null() # Create missing values first
            self.impute() # Impute the generated missing data frame
            self.cv_eval(r)
        
            # print(f"Testing...\n(mechan: {self.mechan}, ratio_mnar: {self.ratio_dict["ratio_mnar"]:.3f}, ratio_mar: {self.ratio_dict["ratio_mar"]:.3f}, ratio_mcar: {self.ratio_dict["ratio_mcar"]:.3f})")
            self.test_eval(r) # Evaluate the model performance on test set
            self.reset_ratio()
        
    def reset_ratio(self):
        self.ratio_dict["ratio_mnar"] = 1/30
        self.ratio_dict["ratio_mar"] = 1/30
        self.ratio_dict["ratio_mcar"] = 1/30

    def forward_(self, mar_dep_col="ca", mar_tar_col="thal", mnar_tar_col="cp"):

        self.mar_dep_col, self.mar_tar_col, self.mnar_tar_col = mar_dep_col, mar_tar_col, mnar_tar_col
        print(f"Given mar_dep_col: {self.mar_dep_col}, mar_tar_col: {self.mar_tar_col}, mnar_tar_col: {self.mnar_tar_col}")

        # Evaluate the model performance on training set using CV
        
        # Start from MNAR
        self.forward(mechan="MNAR")

        # Then MCAR
        self.forward(mechan="MCAR")

        # Finally MAR
        self.forward(mechan="MAR")

    def res_format(self, df, caption="", label=""):
        """Print ``df`` as a LaTeX table with percentage cells, a caption and a label.

        Parameters
        ----------
        df : pandas.DataFrame
            Table to render; every cell is formatted as ``<value>\\%`` with two
            decimals and each column's maximum is set in bold.
        caption : str
            Text placed in ``\\caption{...}`` after the tabular.
        label : str
            Text placed in ``\\label{...}`` after the caption.
        """
        # Raw string: '\%' inside a normal literal is an invalid escape sequence.
        styler = df.style.format(r'{:.2f}\%')

        # Highlight the best result per column and escape the column headers.
        styler = styler.highlight_max(props='bfseries:;').format_index(escape="latex", axis=1)

        latex = styler.to_latex(
            multirow_align="c",
            position_float="centering",
            clines="skip-last;data",
            hrules=True,
        )

        # Append caption and label right after the tabular environment.
        table_tail = '\\end{tabular}\n' + '\\caption{' + f'{caption}' + '}\n\\label{' + f'{label}' + '}'
        latex = latex.replace('\\end{tabular}', table_tail)

        # Single quotes are stripped from the output before printing.
        print(latex.replace("'", ""))

    def auto_forward(self, var1="ca", var2="thal", var3="cp"):
        """
        Input:
            - target variables to be inserted missing values

        This auto_forward function would automatically permutate these three variables
        And set each of them as MAR dependent and target column, MNAR target column, respectively.
        """
        vars = [var1, var2, var3]
        iters = itertools.permutations(vars, 3)

        for subset in tqdm(iters):
            try:
                self.forward_(subset[0], subset[1], subset[2])

                print("Setting shown as follows:\n Data MAR: \\texttt{" + f"{self.mar_dep_col}"  + "} determines \\texttt{" + f"{self.mar_tar_col}"  + "} \& Variable MNAR: \\texttt{" + f"{self.mnar_tar_col}" \
                + "}.\nTraining performance: See table \\ref{" + f'trainpef_{self.mar_dep_col}_{self.mar_tar_col}_{self.mnar_tar_col}'\
                + "}.\nTest performance: See table \\ref{" + f'testpef_{self.mar_dep_col}_{self.mar_tar_col}_{self.mnar_tar_col}' + "}.\n")

                self.res_format(self.performance, "Training Performance of setting: MAR (\\texttt{" + f"{self.mar_dep_col}"\
                                + "} determines \\texttt{" + f"{self.mar_tar_col}"  + "}), \\texttt{" + f"{self.mnar_tar_col}"\
                                + "} MNAR", f'trainpef_{self.mar_dep_col}_{self.mar_tar_col}_{self.mnar_tar_col}')
                
                self.res_format(self.test_performance, "Test Performance of setting: MAR (\\texttt{" + f"{self.mar_dep_col}"\
                                + "} determines \\texttt{" + f"{self.mar_tar_col}"  + "}), \\texttt{" + f"{self.mnar_tar_col}"\
                                + "} MNAR", f'testpef_{self.mar_dep_col}_{self.mar_tar_col}_{self.mnar_tar_col}')
            except ValueError:
                print("Something unusual happen. Please rerun.")

In [None]:
# Build the experiment object. The constructor already runs the baseline
# pipeline (preprocessing, baseline scores, NaN insertion, imputation, evaluation).
zhx = test(original_df)

In [None]:
# Run the sweep for every ordering of the three columns as
# (MAR dependent column, MAR target column, MNAR target column).
zhx.auto_forward('ca', 'thal', 'age')