# In [2]:
import pandas as pd
import numpy as np
from random import sample,seed

# Seed every random source so the subsampling below is repeatable.
seed(a=42)
np.random.seed(42)
random_state = np.random.RandomState(42)


def _subsample(frame, size):
    """Return `size` randomly drawn rows of `frame`, kept in index order."""
    picked = sample(list(frame.index), size)
    return frame.loc[np.sort(picked)]


def _read_reindexed(path):
    """Read `path` and rebuild the row index through reset_index."""
    return pd.read_csv(path).reset_index().drop("index", axis=1)


# adult, bands and mushroom are subsampled because the computing time
# on the full tables was pretty high.
df1 = _subsample(pd.read_csv('adult.csv'), 1000)
df2 = _subsample(pd.read_csv('bands.csv'), 200)
df3 = _read_reindexed('breast.csv')
df4 = pd.read_csv('crx.csv')
df5 = pd.read_csv('hepatitis.csv')
df6 = pd.read_csv('horse-colic.csv')
df7 = _read_reindexed('housevotes.csv')
df8 = pd.read_csv('mammographic.csv')
df9 = _subsample(pd.read_csv('mushroom.csv'), 1000)
df10 = pd.read_csv('wisconsin.csv')

# Look for -1 among the raw values: -1 is the code the label encoding
# uses for a missing category, so a genuine -1 would be ambiguous.
DFs = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10]
for df in DFs:
    flagged = df[df == -1].any()
    print(flagged[flagged.values == True].index)
# Luckily none of the datasets has this problem


#Function to convert categorical variables to numerical using label encoding

def cat_to_num(df):
    """Label-encode the categorical columns of *df* and express missing values as NaN.

    Steps, in order:

    1. The textual missing-value markers ``'?'``, ``'<null>'`` and
       ``' <null>'`` are replaced by NaN.
    2. Column names are stripped of surrounding whitespace and the last
       column is renamed ``"Class"``.
    3. Every column that can be parsed as numbers is converted to a numeric
       dtype; the others are left untouched.
    4. The remaining text columns are replaced by their integer category
       codes. Missing entries (category code -1) become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table whose last column holds the class label.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    # Express the missing-value markers properly.
    df = df.replace(["?", "<null>", " <null>"], np.nan)

    # Remove stray spaces from the column names; the last column is the target.
    names = [str(name).strip() for name in df.columns]
    if names:
        names[-1] = "Class"
    df.columns = names

    def _as_number(column):
        # Same outcome as the deprecated errors='ignore': a column that
        # cannot be parsed is handed back unchanged.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    df = df.apply(_as_number)

    # Encode whatever is still text (object or a dedicated string dtype).
    for name in df.columns:
        column = df[name]
        is_text = (pd.api.types.is_object_dtype(column.dtype)
                   or isinstance(column.dtype, pd.StringDtype))
        if not is_text:
            continue
        codes = column.astype("category").cat.codes
        # -1 is the code pandas gives to a missing category. Restore NaN
        # only here, so a genuine -1 in a numeric column is preserved.
        if (codes == -1).any():
            codes = codes.astype(float).where(codes != -1)
        df[name] = codes

    return df

from sklearn.base import BaseEstimator
from sklearn.utils import check_random_state
from sklearn.utils.multiclass import unique_labels
from sklearn.impute import KNNImputer

#Class to impute missing values

class MissingValueImputation(BaseEstimator):
    """Fill the NaN entries of a numeric feature matrix.

    Parameters
    ----------
    method : str, default 'mean'
        Imputation strategy:

        * ``'mean'``    - column mean of the fitted data.
        * ``'natural'`` - column mean of the rows sharing the sample's class
          label. Needs ``y`` in ``fit``. When ``transform`` gets no ``y``
          (or a label unseen in ``fit``), the overall column mean is used.
        * ``'knni'``    - ``KNNImputer`` with uniform weights.
        * ``'wknni'``   - ``KNNImputer`` with distance weights.
        * ``'kmeans'``  - placeholder, not implemented: a warning is issued
          and the data is returned without imputation.
    k : int, default 5
        Number of neighbours for ``'knni'`` / ``'wknni'``.
    random_state : object, default None
        Stored for API compatibility; none of the strategies reads it.

    Attributes
    ----------
    means_ : ndarray
        Overall column means (``'mean'`` and ``'natural'``).
    class_means_ : dict
        Class label -> column means of that class (``'natural'``).
    imputer_ : KNNImputer
        Fitted neighbour imputer (``'knni'`` and ``'wknni'``).
    """

    def __init__(self, method='mean', k=5,
                 random_state=None):
        self.method = method
        self.k = k
        self.random_state = random_state

    def __mean_imputation(self, X):
        self.means_ = np.nanmean(X, axis=0)
        return self

    def __natural_imputation(self, X, y):
        if y is None:
            raise ValueError("method='natural' requires the class labels y")
        y = np.asarray(y)
        self.means_ = np.nanmean(X, axis=0)
        # Keep one vector of means per class instead of overwriting a
        # single slot on every loop pass.
        self.class_means_ = {}
        for label in unique_labels(y):
            class_mean = np.nanmean(X[y == label], axis=0)
            # A feature that is entirely missing inside a class has no
            # class mean; fall back to the overall mean for it.
            self.class_means_[label] = np.where(np.isnan(class_mean),
                                                self.means_, class_mean)
        return self

    def __k_means_imputation(self, X):
        # Placeholder: this strategy was never implemented. Warn rather
        # than hand back unimputed data silently.
        import warnings
        warnings.warn("method='kmeans' is not implemented; "
                      "missing values are left untouched",
                      UserWarning, stacklevel=3)

    def __knn_imputation(self, X):
        # Keep the fitted imputer (not the transformed training matrix) so
        # transform() really works on the data it is given.
        self.imputer_ = KNNImputer(n_neighbors=self.k).fit(X)
        return self

    def __wknn_imputation(self, X):
        self.imputer_ = KNNImputer(n_neighbors=self.k,
                                   weights="distance").fit(X)
        return self

    def fit(self, X, y=None):
        """Learn the imputation statistics from ``X`` (and ``y`` for 'natural').

        Returns ``self``. Raises ``ValueError`` for an unknown method, or
        when ``method='natural'`` and ``y`` is None.
        """
        X = np.asarray(X, dtype=float)
        if self.method == 'mean':
            self.__mean_imputation(X)
        elif self.method == 'natural':
            self.__natural_imputation(X, y)
        elif self.method == 'knni':
            self.__knn_imputation(X)
        elif self.method == 'wknni':
            self.__wknn_imputation(X)
        elif self.method == 'kmeans':
            self.__k_means_imputation(X)
        else:
            raise ValueError('Unrecognized method')

        return self

    def transform(self, X, y=None):
        """Return a float copy of ``X`` with its NaN entries imputed.

        ``y`` is only read by ``method='natural'``, where it selects the
        class-specific means. ``X`` itself is never modified.
        """
        X = np.asarray(X, dtype=float)

        if self.method == 'mean':
            return np.where(np.isnan(X), self.means_, X)

        if self.method in ('knni', 'wknni'):
            return self.imputer_.transform(X)

        if self.method == 'kmeans':
            self.__k_means_imputation(X)
            return X.copy()

        if self.method == 'natural':
            filled = X.copy()
            if y is not None:
                y = np.asarray(y)
                for label, class_mean in self.class_means_.items():
                    rows = y == label
                    block = filled[rows]
                    filled[rows] = np.where(np.isnan(block), class_mean, block)
            # Whatever is still missing (no y, or an unseen label) gets
            # the overall column mean.
            return np.where(np.isnan(filled), self.means_, filled)

        raise ValueError('Unrecognized method')

    def fit_transform(self, X, y=None):
        """Fit on ``X`` / ``y`` and return the imputed version of ``X``."""
        return self.fit(X, y).transform(X, y)

from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score

def micro_roc_auc(y_test, y_score):
    """Return the micro-averaged area under the ROC curve.

    Every (sample, class) cell is treated as one binary decision: both
    matrices are flattened and a single ROC curve is computed over all
    cells.

    Parameters
    ----------
    y_test : array-like
        Binary indicator matrix of the true labels, one column per class.
    y_score : array-like
        Continuous scores laid out like ``y_test``.

    Returns
    -------
    float
        Area under the micro-averaged ROC curve.
    """
    # Only the flattened matrices are needed, so the function no longer
    # reads a class count from an outer scope.
    truth = np.asarray(y_test).ravel()
    scores = np.asarray(y_score).ravel()
    fpr, tpr, _ = roc_curve(truth, scores)
    return auc(fpr, tpr)

def _continuous_scores(model, X):
    """Return the most informative scores a fitted ``model`` offers for ``X``."""
    # Prefer real-valued outputs; hard labels are the last resort because
    # they reduce a ROC curve to a single operating point.
    if hasattr(model, "decision_function"):
        return np.asarray(model.decision_function(X))
    if hasattr(model, "predict_proba"):
        return np.asarray(model.predict_proba(X))
    return np.asarray(model.predict(X))


def classification(X,y,X_train,y_train,X_test,y_test,clf):
    """Fit ``clf`` on the training split and return its ROC AUC on the test split.

    Parameters
    ----------
    X : array-like
        Full feature matrix. Kept for interface compatibility; not read.
    y : array-like
        Full label vector; only used to decide between the binary and the
        multiclass path.
    X_train, y_train : array-like
        Training split.
    X_test, y_test : array-like
        Evaluation split.
    clf : estimator
        Unfitted scikit-learn style classifier. It is fitted in place on
        the binary path and wrapped in ``OneVsRestClassifier`` otherwise.

    Returns
    -------
    float
        ROC AUC for two classes, micro-averaged one-vs-rest ROC AUC
        otherwise.
    """
    classes = np.unique(np.asarray(y))

    if len(classes) == 2:
        model = clf.fit(X_train, y_train)
        y_score = _continuous_scores(model, X_test)
        if y_score.ndim == 2:
            # predict_proba: keep the column of the greater label, which
            # is the one roc_auc_score treats as positive.
            y_score = y_score[:, 1]
        return roc_auc_score(y_test, y_score)

    # Multiclass prototype: one-vs-rest scores, micro-averaged.
    print("Not binary")
    model = OneVsRestClassifier(clf).fit(X_train, y_train)
    y_score = _continuous_scores(model, X_test)
    # The truth has to be in the same indicator layout (and column order)
    # as the score matrix before the two can be compared.
    y_test_bin = label_binarize(y_test, classes=model.classes_)
    return roc_auc_score(y_test_bin, y_score, average="micro")

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
def roc_scores_rsfk(X, y, n_splits=2, n_repeats=10, random_state=None):
    """Score three classifiers with repeated stratified k-fold cross-validation.

    Parameters
    ----------
    X : array-like
        Feature matrix without missing values.
    y : array-like
        Class labels. A pandas Series is accepted; it is converted to an
        array so the fold indices are applied by position.
    n_splits : int, default 2
        Folds per repetition.
    n_repeats : int, default 10
        Number of repetitions.
    random_state : int, RandomState instance or None, default None
        Passed to ``RepeatedStratifiedKFold``.

    Returns
    -------
    ndarray
        One row per fold, with the ROC AUC of KNN, the MLP and the linear
        SVC in the three columns.
    """
    X = np.asarray(X)
    # Positional indexing: a Series would look the fold indices up as
    # labels, which breaks when the index is not 0..n-1.
    y = np.asarray(y)

    rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats,
                                   random_state=random_state)
    fold_scores = []
    for train_index, test_index in rskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Fresh, unfitted classifiers for every fold.
        classifiers = (
            KNeighborsClassifier(4),
            MLPClassifier(alpha=0.001, max_iter=1000),
            SVC(kernel='linear', probability=True, max_iter=1000),
        )
        fold_scores.append([
            classification(X, y, X_train, y_train, X_test, y_test, clf)
            for clf in classifiers
        ])
    # Built once after the loop; stays well-formed when there are no folds.
    return np.array(fold_scores, dtype=float).reshape(-1, 3)

# This block calls all the functions above and runs the analysis: every
# imputation method is applied to every dataset and the mean ROC AUC of
# each classifier is collected.

DFs=[df1,df2,df3,df4,df5,df6,df7,df8,df9,df10]
db_labels = ["Adults", "Bands", "Breast", "CRX", "Hepatitis", "Horse Colic",
             "House votes", "Mammographic", "Mushroom", "Wisconsin"]
methods = ["mean", "knni", "wknni", "natural"]
method_labels = ["Mean", "Knn", "WKnn", "Natural"]
method_column = "Inputation method / Classificator"
score_columns = ['KNN', 'Neural Nets', 'Support Vector']

Mean_Roc_Scores=[]
for df in DFs:
    df = cat_to_num(df)
    # Keyword form: the positional axis argument of drop() is gone in
    # pandas 2.
    X = df.drop(columns='Class').to_numpy()
    # Plain array so the cross-validation indices are applied by position
    # (the subsampled frames do not have a 0..n-1 index).
    y = df['Class'].to_numpy()
    for method in methods:
        imputer = MissingValueImputation(method = method)
        X_transformed = imputer.fit_transform(X, y)
        mean_roc_scores=np.mean(roc_scores_rsfk(X_transformed,y),axis=0)
        Mean_Roc_Scores = np.append(Mean_Roc_Scores, mean_roc_scores, axis=0)
    # Progress: all mean scores collected so far.
    print(Mean_Roc_Scores)

# One row per (dataset, method) pair, one column per classifier.
Mean_Roc_Scores=np.around(
    np.asarray(Mean_Roc_Scores).reshape(len(DFs) * len(methods), len(score_columns)), 4)

# Creating a DF with all the information
methods_str = method_labels * len(DFs)
db_names = [label for label in db_labels for _ in methods]
df_ROCs = pd.DataFrame({method_column: methods_str,
                        'KNN': Mean_Roc_Scores[:, 0],
                        'Neural Nets': Mean_Roc_Scores[:, 1],
                        'Support Vector': Mean_Roc_Scores[:, 2],
                        "Data Base": db_names})


print(df_ROCs)
# To print the latex table
print(df_ROCs.to_latex(index=False))

# For every dataset and classifier, find the imputation method with the
# highest ROC area. idxmax gives the row label of the best score, which is
# then looked up in the method column.
scores_by_db = df_ROCs.groupby('Data Base')[score_columns]
best_methods = scores_by_db.idxmax().apply(
    lambda rows: rows.map(df_ROCs[method_column]))
print(best_methods)
# To print the latex table of the best scores; the index is kept so the
# dataset names appear in the table.
print(scores_by_db.max().to_latex())

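# Recorded notebook output of the -1 check: one empty Index per dataset,
# i.e. none of the ten datasets contains a -1 value.
# Index([], dtype='object')
# Index([], dtype='object')
# Index([], dtype='object')
# Index([], dtype='object')
# Index([], dtype='object')
# Index([], dtype='object')
# Index([], dtype='object')
# Index([], dtype='object')
# Index([], dtype='object')
# Index([], dtype='object')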
