In [None]:
# Data processing tools: pandas and numpy
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import re

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

# Regressors
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

# Visualization
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Preprocessing
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Model selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_squared_log_error as msle

from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_curve

# Data imputation
from sklearn.impute import KNNImputer
!pip install miceforest
!pip install --upgrade scipy
import miceforest as mf

# Others
import os
import time
import warnings
warnings.filterwarnings('ignore')

# Datasets

In [None]:
# Base names (without the ".csv" extension) of the regression datasets.
# The position in this list is the `dataset_number` used by read_dataset.
datasets_regression = [
    "bias_correction_temp_forecast",
    "air_quality",
    "parkinson_updrs"
]

# Base names (without the ".csv" extension) of the classification datasets.
# Only two entries, so dataset number 2 does not exist for classification.
datasets_classification = [
    "winequality_white",
    "sensor_readings_24",
]

# Datasets statistics

In [None]:
# Print a short summary of every dataset: its shape plus either the target
# range (regression) or the number of elements per class (classification).
print("Regression")

for dataset in datasets_regression:
    df = pd.read_csv("Datasets/Regression/"+dataset+".csv")
    print("\nName:", dataset)
    print("Shape:",df.shape)
    print("Target: from ",round(df.Target.min(),2)," to ", round(df.Target.max(),2))

print()
print()
print("Classification")

for dataset in datasets_classification:
    df = pd.read_csv("Datasets/Classification/"+dataset+".csv")
    print("\nName:", dataset)
    print("Shape:",df.shape)
    # Class sizes in value_counts() order, three per line at most on the
    # first line. Joining avoids trimming a trailing separator by hand, which
    # used to cut the final "s" off when there were exactly three classes.
    counts = [str(n)+" elements" for n in df.Target.value_counts()]
    string = ", ".join(counts[:3])
    if len(counts) > 3:
        string += "\n" + ", ".join(counts[3:])
    print("Target: ",string)

# Read dataset

In [None]:
def read_dataset(regression, dataset_number, directory = "Datasets/"):
    """Load one dataset as a DataFrame with sanitized column names.

    Parameters
    ----------
    regression : bool
        True to pick from `datasets_regression` (sub-folder "Regression/"),
        False to pick from `datasets_classification` ("Classification/").
    dataset_number : int
        Position of the dataset in the selected list.
    directory : str
        Root folder of the datasets; must end with a path separator because
        the path is built by plain string concatenation.

    Returns
    -------
    pandas.DataFrame
        The CSV content; every character outside ``[A-Za-z0-9_]`` is removed
        from the column names.
    """
    if regression:
        subfolder, names = "Regression/", datasets_regression
    else:
        subfolder, names = "Classification/", datasets_classification

    csv_path = directory + subfolder + names[dataset_number] + ".csv"
    frame = pd.read_csv(csv_path)

    return frame.rename(columns = lambda name: re.sub('[^A-Za-z0-9_]+', '', name))

# Add noise

In [None]:
def run_with_p(p):
    """Return True with probability `p`.

    Consumes exactly one draw from NumPy's global random generator.
    """
    draw = np.random.uniform()
    return draw < p

In [None]:
def add_noise(df, noise_mode, noise_level):
    """Return a noised copy of `df`; the 'Target' column is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    noise_mode : int
        1 - additive white Gaussian noise on every non-object column;
        2 - every cell is replaced with probability `noise_level`;
        any other value - no noise, a plain copy is returned.
    noise_level : float
        Mode 1: signal-to-noise ratio in dB.
        Mode 2: per-cell replacement probability.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the requested noise applied.
    """
    noised = df.copy()

    if noise_mode == 1: # AWGN noise
        SNR = noise_level
        SNR_times = 10 ** (SNR / 10.)  # dB -> linear power ratio

        continuous_features = list(df.columns[df.dtypes != "object"])
        signal_powers = (abs(df[continuous_features]) ** 2).mean()

        for feature in continuous_features:
            if feature != 'Target':
                noise_power = signal_powers[feature] / SNR_times
                # Unit-variance Gaussian noise scaled to the required power.
                noise = np.random.normal(0,1,df.shape[0])
                noise *= np.sqrt(noise_power)
                noised[feature] += noise
    elif noise_mode == 2: # replacement with some probability
        changing_probability = noise_level

        for i in range(df.shape[1]):
            if df.columns[i] == 'Target':
                continue

            # Per-column invariants are computed once instead of once per
            # replaced cell. `.iloc` is used because `df.dtypes` is indexed
            # by column label, not by position.
            column = df.iloc[:, i]
            dtype = df.dtypes.iloc[i]
            is_categorical = dtype == 'object'
            if is_categorical:
                categories = column.unique()
            else:
                lim = (column.min(), column.max())

            for j in range(df.shape[0]):
                if run_with_p(changing_probability):
                    if is_categorical:
                        # Random category among those observed in the column.
                        noised.iloc[j, i] = np.random.choice(categories)
                    else:
                        # Uniform value within the observed range, cast to the
                        # column dtype (integers are truncated).
                        noised.iloc[j, i] = np.array(np.random.uniform(*lim), dtype=dtype)

    return noised

# Drop values

In [None]:
def _drop_by_driver(with_drops, driver, target_pos, p):
    """Blank cells of one column of `with_drops`, driven by the values of `driver`.

    For a non-object `driver`, the selected rows are those strictly below or
    strictly above its median (the side is picked at random). For an object
    `driver`, they are the rows whose class belongs to a random half of the
    classes. Each selected row loses its value at column position
    `target_pos` with probability `p`. `with_drops` is modified in place.
    """
    values = driver.to_numpy()

    if driver.dtype != 'object':
        med = driver.median()
        if np.random.randint(2):
            selected = values < med
        else:
            selected = values > med
    else:
        classes = driver.unique()
        subset_size = int(np.floor(len(classes)/2))
        subset = np.random.choice(classes, size = subset_size, replace = False, p = None)
        selected = [value in subset for value in values]

    for row in range(len(values)):
        if selected[row]:
            if run_with_p(p):
                # Positional write: `row` is a position, not an index label.
                with_drops.iloc[row, target_pos] = None


def drop_values(df, dropping_mode, dropping_probability):
    """Return a copy of `df` with feature values removed; 'Target' is kept intact.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data with a 'Target' column. It is not modified.
    dropping_mode : int
        1 - MCAR: every feature cell is dropped with `dropping_probability`.
        2 - MAR: half of the features (rounded up), chosen at random, receive
            missing values. Each is paired with one of the remaining features,
            and its cells are dropped on the rows selected by that partner with
            probability ``4 * dropping_probability``. With an odd number of
            features, the columns paired with the last partner use
            ``3 * dropping_probability``.
        3 - NMAR: every feature selects its own rows and loses the value there
            with probability ``2 * dropping_probability``.
        Any other value - a plain copy is returned.
    dropping_probability : float
        Base probability, scaled per mode as described above.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with missing values inserted.

    Raises
    ------
    ValueError
        In MAR mode when there is exactly one feature, so that no partner
        column is left to drive the missingness.
    """
    with_drops = df.copy()

    if dropping_mode == 1: # MCAR
        for i in range(df.shape[1]):
            if df.columns[i] != 'Target':
                for j in range(df.shape[0]):
                    if run_with_p(dropping_probability):
                        with_drops.iloc[j, i] = None
    elif dropping_mode == 2: # MAR
        features = df.drop(columns=['Target'])
        n_features = features.shape[1]
        is_odd = np.mod(n_features, 2)

        # Columns that receive missing values; the rest act as drivers.
        n_dropped = int(np.ceil(n_features/2))
        dropped_cols = np.random.choice(features.columns, size = n_dropped, replace = False, p = None)
        drivers = features.drop(dropped_cols, axis = 1)

        if n_dropped and drivers.shape[1] == 0:
            raise ValueError("MAR dropping needs at least two feature columns")

        for col in range(n_dropped):
            if is_odd and col >= drivers.shape[1]-1:
                driver_pos = drivers.shape[1]-1
                actual_p = 3*dropping_probability
            else:
                driver_pos = col
                actual_p = 4*dropping_probability

            _drop_by_driver(with_drops, drivers.iloc[:, driver_pos],
                            with_drops.columns.get_loc(dropped_cols[col]), actual_p)
    elif dropping_mode == 3: # NMAR
        # The drivers come from the untouched input, so earlier drops never
        # influence which rows are selected for later columns.
        features = df.drop(columns=['Target'])

        for col in range(features.shape[1]):
            _drop_by_driver(with_drops, features.iloc[:, col],
                            with_drops.columns.get_loc(features.columns[col]),
                            2*dropping_probability)

    return with_drops

# Impute data

In [None]:
def impute_data(df, imputation_mode, random_state=None):
    """Return a version of `df` without missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with a 'Target' column and possibly missing feature values.
        It is not modified.
    imputation_mode : int
        0 - drop every row that contains a missing value;
        1 - fill with 0;
        2 - fill object columns with the most frequent value, others with the mean;
        3 - fill object columns with the most frequent value, others with the median;
        4 - MICE (miceforest), fitted on the features only;
        5 - k-nearest-neighbours imputation (k=5), fitted on the features only.
    random_state : int or None
        Seed for the MICE kernel. When omitted, a notebook-level variable
        named ``random_state`` is used if it exists, which is how earlier
        callers supplied the seed.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If `imputation_mode` is not one of the values listed above.
    """
    if imputation_mode == 0: # dropping
        filled = df.dropna()
    elif imputation_mode == 1: # filling with 0
        filled = df.fillna(0)
    elif imputation_mode in (2, 3): # filling with mean / median
        filled = df.copy()

        for feature in df.columns:
            column = filled[feature]
            if column.dtype == "object":
                fill_value = column.value_counts().index[0]
            elif imputation_mode == 2:
                fill_value = column.mean()
            else:
                fill_value = column.median()
            # Assign the result back: a chained `fillna(..., inplace=True)`
            # acts on a temporary and is a no-op under Copy-on-Write.
            filled[feature] = column.fillna(fill_value)
    elif imputation_mode == 4: # filling by MICE
        if random_state is None:
            # Backward compatibility with callers that relied on a global seed.
            random_state = globals().get('random_state')

        y = df['Target']
        X = df.drop('Target', axis = 1)

        kds = mf.ImputationKernel(
          X,
          datasets=1,
          save_all_iterations=False,
          random_state=random_state
        )

        kds.mice(5)

        X_filled = kds.complete_data(dataset=0, inplace=False)

        filled = pd.concat([X_filled,y], axis=1)
    elif imputation_mode == 5: # filling by KNN
        y = df['Target']
        X = df.drop('Target', axis = 1)

        imputer = KNNImputer(n_neighbors=5)
        X_filled = imputer.fit_transform(X)

        # Write the imputed array back column by column so that the column
        # names and the index of X are kept.
        for col in range(X.shape[1]):
            X.iloc[:,col] = X_filled[:,col]

        filled = pd.concat([X,y], axis=1)
    else:
        raise ValueError("Unknown imputation_mode: " + repr(imputation_mode))

    return filled

# Models

In [None]:
def get_models_and_params(regression, random_state):
    """Build the four models under study and the shared hyper-parameter grid.

    Parameters
    ----------
    regression : bool
        True for the regressors, False for the classifiers.
    random_state : int
        Seed passed to the tree, forest and LightGBM models.

    Returns
    -------
    (list, dict)
        The models (linear, decision tree, random forest, LightGBM, in that
        order) and a grid keyed by parameter name. The grid is shared by all
        models; tune_hyperparams keeps, per model, only the keys that the
        model actually exposes.
    """
    if regression:
        linear_model = LinearRegression(n_jobs=-1)
        tree_cls, forest_cls, boosting_cls = (
            DecisionTreeRegressor, RandomForestRegressor, LGBMRegressor)
    else:
        linear_model = LogisticRegression(n_jobs=-1)
        tree_cls, forest_cls, boosting_cls = (
            DecisionTreeClassifier, RandomForestClassifier, LGBMClassifier)

    models = [
        linear_model,
        tree_cls(random_state = random_state),
        forest_cls(random_state = random_state, n_jobs = -1),
        boosting_cls(random_state = random_state, n_jobs = -1),
    ]

    grid_search_parameters = {
        'n_estimators': list(range(10,100,10)),
        'max_depth': [3, 5, 7],
        'C': np.logspace(-3,3,7),
        'kernel': ['poly', 'rbf'],
    }

    return models, grid_search_parameters

In [None]:
def get_X_y(df, regression):
    """Split `df` into a one-hot encoded feature matrix and a target vector.

    For regression the 'Target' column is returned as is; for classification
    it is label-encoded. Every other column goes through `pd.get_dummies`
    without dropping the first level.
    """
    target = df['Target']
    features = df.drop('Target', axis = 1)

    y = target if regression else preprocessing.LabelEncoder().fit_transform(target)
    X = pd.get_dummies(features, drop_first = False)

    return X, y

In [None]:
def getkeys(dict):
    """Return the keys of `dict` as a list, in iteration order."""
    return [key for key in dict]

In [None]:
def tune_hyperparams(X, y, models, grid_search_parameters, scoring, verbose=False):
    """Grid-search every model inside a scaling pipeline and score the winner.

    Parameters
    ----------
    X, y
        Features and target used for both the search and the final scoring.
    models : list
        Estimators to tune; each is wrapped as ``StandardScaler -> model``.
    grid_search_parameters : dict
        Candidate values keyed by parameter name; only the keys that a model
        exposes through ``get_params()`` are searched for that model.
    scoring : str
        Scorer name passed to GridSearchCV and cross_val_score.
    verbose : bool
        When True, print the best parameters found for each model.

    Returns
    -------
    (list, list)
        The refitted best pipelines, and for each of them the absolute value
        of its mean 3-fold cross-validation score.
    """
    best_models, val_scores = [], []

    for model in models:
        pln = make_pipeline(StandardScaler(), model)

        # Pipeline parameters are addressed as "<step name>__<parameter>".
        step_name = str(pln.steps[1][0])
        shared_keys = model.get_params().keys() & grid_search_parameters.keys()
        pipeline_params_grid = {
            step_name + '__' + str(key): grid_search_parameters[key]
            for key in shared_keys
        }

        gcv = GridSearchCV(pln, pipeline_params_grid, scoring = scoring, cv = 3,
                           n_jobs = -1, refit = True)
        gcv.fit(X, y)

        if verbose:
            print(model.__class__.__name__, ' best with ', gcv.best_params_)

        winner = gcv.best_estimator_
        best_models.append(winner)
        mean_score = cross_val_score(winner, X, y, scoring = scoring, cv=3).mean()
        val_scores.append(abs(mean_score))

    return best_models, val_scores

In [None]:
def eval_dataset(X, y, models, scoring):
    """Score every model on (X, y) with 3-fold cross-validation.

    Returns a list with, for each model in order, the absolute value of its
    mean cross-validation score under `scoring`.
    """
    return [
        abs(cross_val_score(model, X, y, scoring = scoring, cv=3).mean())
        for model in models
    ]

# Parameters description
#### Noise mode
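- 0 - no noise
- 1 - AWGN (additive white Gaussian noise); the noise level is the signal-to-noise ratio in dB
- 2 - random replacement of values; the noise level is the probability of replacing each value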

#### Dropping mode
- 0 - no dropping
- 1 - MCAR (missing completely at random)
- 2 - MAR (missing at random)
- 3 - NMAR (not missing at random)

#### Imputation mode
- 0 - dropping
- 1 - filling with 0
- 2 - filling with mean
- 3 - filling with median
- 4 - filling by MICE
- 5 - filling by kNN

In [None]:
# Folder that receives the CSV files written by the experiment cells.
directory_with_results = "Results"

# Create the directory if it is missing. `exist_ok=True` makes the call
# idempotent without a separate existence check.
os.makedirs(directory_with_results, exist_ok=True)

# Experiments with noise only

In [None]:
# Noise-only experiment.
# For every seed and dataset the models are tuned once on the clean data;
# the tuned pipelines are then re-scored by cross-validation on noised copies.
# The lists below collect one entry per evaluated configuration and become
# the columns of the resulting CSV.
random_states = []
dataset_numbers = []
regressions = []

noise_modes = []
noise_levels = []

# One inner list per model (get_models_and_params returns four models).
val_scores = [[],[],[],[]]
initial_val_scores = [[],[],[],[]]

for random_state in range(3):
    
    # Seed NumPy's global generator, which add_noise draws from.
    np.random.seed(random_state)
    # The timer restarts for every seed, so the printed seconds are per seed.
    start = time.time()
    print("random state", random_state)

    for regression in [True, False]:
        # Scorer name: negated MAPE for regression, micro-averaged F1 otherwise.
        if regression:
            scoring = 'neg_mean_absolute_percentage_error'
        else:
            scoring = 'f1_micro'
            
        models, grid_search_parameters = get_models_and_params(regression, random_state)
                                
        for dataset_number in [0,1,2]:
            
            # Only two classification datasets are listed, so index 2 is skipped.
            if (dataset_number == 2) and (regression == False):
                continue
            
            df = read_dataset(regression, dataset_number)
            print(int(time.time() - start), "seconds", "read dataset", dataset_number, \
                  "regression", regression)

            X, y = get_X_y(df, regression)

            # Tune on the clean data; these scores are the per-dataset baseline.
            best_models, initial_val_scores_per_run = tune_hyperparams(X, y, models, grid_search_parameters, \
                                                             scoring)
            
            # Mode 2: replacement probabilities from 0 to 0.5 (21 values);
            # mode 1: SNR from -20 to 20 in steps of 2.
            for array_of_noise_levels, noise_mode in [(np.linspace(0, 0.5, 21), 2), (range(-20, 21, 2), 1)]:
                for noise_level in array_of_noise_levels:
                    
                    # Noise is always applied to the clean frame, never stacked.
                    noised = add_noise(df, noise_mode, noise_level)

                    X, y = get_X_y(noised, regression)

                    val_scores_per_run = eval_dataset(X, y, best_models, scoring)
                    
                    random_states.append(random_state)
                    dataset_numbers.append(dataset_number)
                    regressions.append(regression)

                    noise_modes.append(noise_mode)
                    noise_levels.append(noise_level)

                    # The baseline score is repeated on every row of this dataset.
                    for i in range(len(models)):
                        initial_val_scores[i].append(initial_val_scores_per_run[i])
                        val_scores[i].append(val_scores_per_run[i])

In [None]:
# Gather the records of the noise-only experiment into one table
# (one row per evaluated configuration), save it as CSV and preview it.
results = pd.DataFrame(dict([
    ('Random State', random_states),
    ('Dataset number', dataset_numbers),
    ('Is regression', regressions),

    ('Noise mode', noise_modes),
    ('Noise level', noise_levels),

    # Scores on the noised data, one column per model.
    ('Val score 0', val_scores[0]),
    ('Val score 1', val_scores[1]),
    ('Val score 2', val_scores[2]),
    ('Val score 3', val_scores[3]),

    # Scores on the clean data, one column per model.
    ('Initial val score 0', initial_val_scores[0]),
    ('Initial val score 1', initial_val_scores[1]),
    ('Initial val score 2', initial_val_scores[2]),
    ('Initial val score 3', initial_val_scores[3]),
]))

results.to_csv(directory_with_results+"/results_noise_only.csv", index=False)
results.head()

# Experiments with missing values only

In [None]:
# Missing-values-only experiment.
# For every seed and dataset the models are tuned once on the complete data;
# values are then dropped, imputed, and the tuned pipelines are re-scored by
# cross-validation. The lists below collect one entry per evaluated
# configuration and become the columns of the resulting CSV.
random_states = []
dataset_numbers = []
regressions = []

dropping_modes = []
dropping_level = []
imputation_modes = []

# One inner list per model (get_models_and_params returns four models).
val_scores = [[],[],[],[]]
initial_val_scores = [[],[],[],[]]

# The timer is started once, so the printed seconds span all seeds.
start = time.time()

# NOTE: impute_data's MICE branch reads `random_state` from the notebook
# namespace, so the name of this loop variable matters.
for random_state in range(3):
    
    # Seed NumPy's global generator, which drop_values draws from.
    np.random.seed(random_state)
    print("random state", random_state)

    for regression in [True, False]:
        # Scorer name: negated MAPE for regression, micro-averaged F1 otherwise.
        if regression:
            scoring = 'neg_mean_absolute_percentage_error'
        else:
            scoring = 'f1_micro'
            
        models, grid_search_parameters = get_models_and_params(regression, random_state)
                       
        for dataset_number in [0,1,2]:
            
            # Only two classification datasets are listed, so index 2 is skipped.
            if (dataset_number == 2) and (regression == False):
                continue
            
            df = read_dataset(regression, dataset_number)
            print(int(time.time() - start), "seconds", "read dataset", dataset_number, "regression", regression)

            X, y = get_X_y(df, regression)

            # Tune on the complete data; these scores are the per-dataset baseline.
            best_models, initial_val_scores_per_run = tune_hyperparams(X, y, models, grid_search_parameters, \
                                                             scoring)
            
            for dropping_mode in [1,2,3]:
                for dropping_probability in [0.1, 0.15, 0.2, 0.25, 0.3]:

                    # One set of drops is shared by all imputation methods below.
                    with_drops = drop_values(df, dropping_mode, dropping_probability)
                
                    # Imputation mode 0 (dropping incomplete rows) is not evaluated.
                    for imputation_mode in [1,2,3,4,5]:
                        imputed = impute_data(with_drops, imputation_mode)

                        X, y = get_X_y(imputed, regression)

                        val_scores_per_run = eval_dataset(X, y, best_models, scoring)
                    
                        random_states.append(random_state)
                        dataset_numbers.append(dataset_number)
                        regressions.append(regression)

                        dropping_modes.append(dropping_mode)
                        dropping_level.append(dropping_probability)
                        imputation_modes.append(imputation_mode)

                        # The baseline score is repeated on every row of this dataset.
                        for i in range(len(models)):
                            initial_val_scores[i].append(initial_val_scores_per_run[i])
                            val_scores[i].append(val_scores_per_run[i])

In [None]:
# Gather the records of the missing-values experiment into one table
# (one row per evaluated configuration), save it as CSV and preview it.
results = pd.DataFrame(dict([
    ('Random State', random_states),
    ('Dataset number', dataset_numbers),
    ('Is regression', regressions),

    ('Drop mode', dropping_modes),
    ('Drop level', dropping_level),
    ('Imputation mode', imputation_modes),

    # Scores on the imputed data, one column per model.
    ('Val score 0', val_scores[0]),
    ('Val score 1', val_scores[1]),
    ('Val score 2', val_scores[2]),
    ('Val score 3', val_scores[3]),

    # Scores on the complete data, one column per model.
    ('Initial val score 0', initial_val_scores[0]),
    ('Initial val score 1', initial_val_scores[1]),
    ('Initial val score 2', initial_val_scores[2]),
    ('Initial val score 3', initial_val_scores[3]),
]))

results.to_csv(directory_with_results+"/results_drop_only.csv", index=False)
results.head()

# Experiments with missing values and noise

In [None]:
# Combined experiment: noise first, then missing values, then imputation.
# For every seed and dataset the models are tuned once on the clean data;
# the tuned pipelines are re-scored by cross-validation on each corrupted and
# imputed copy. The lists below collect one entry per evaluated configuration
# and become the columns of the resulting CSV.
random_states = []
dataset_numbers = []
regressions = []

noise_modes = []
noise_levels = []

dropping_modes = []
dropping_level = []
imputation_modes = []

# One inner list per model (get_models_and_params returns four models).
val_scores = [[],[],[],[]]
initial_val_scores = [[],[],[],[]]

# NOTE: impute_data's MICE branch reads `random_state` from the notebook
# namespace, so the name of this loop variable matters.
for random_state in range(3):
    
    # Seed NumPy's global generator, which add_noise and drop_values draw from.
    np.random.seed(random_state)
    # The timer restarts for every seed, so the printed seconds are per seed.
    start = time.time()
    print("random state", random_state)

    for regression in [True, False]:
        # Scorer name: negated MAPE for regression, micro-averaged F1 otherwise.
        if regression:
            scoring = 'neg_mean_absolute_percentage_error'
        else:
            scoring = 'f1_micro'
            
        models, grid_search_parameters = get_models_and_params(regression, random_state)
                                
        for dataset_number in [0,1,2]:
            
            # Only two classification datasets are listed, so index 2 is skipped.
            if (dataset_number == 2) and (regression == False):
                continue
            
            df = read_dataset(regression, dataset_number)
            print(int(time.time() - start), "seconds", "read dataset", dataset_number, \
                  "regression", regression)

            X, y = get_X_y(df, regression)

            # Tune on the clean data; these scores are the per-dataset baseline.
            best_models, initial_val_scores_per_run = tune_hyperparams(X, y, models, grid_search_parameters, \
                                                             scoring)
            
            # Mode 2: replacement probabilities; mode 1: SNR values.
            for array_of_noise_levels, noise_mode in [([0.1, 0.2, 0.3],2), ([-6, 0, 10], 1)]:
                for noise_level in array_of_noise_levels:
                    # One noised frame is shared by all dropping settings below.
                    noised = add_noise(df, noise_mode, noise_level)
                
                    for dropping_mode in [1,2,3]:
                        for dropping_probability in [0.1, 0.2, 0.3]:

                            # One set of drops is shared by all imputation methods below.
                            with_drops = drop_values(noised, dropping_mode, dropping_probability)

                            # Imputation mode 0 (dropping incomplete rows) is not evaluated.
                            for imputation_mode in [1,2,3,4,5]:
                                imputed = impute_data(with_drops, imputation_mode)

                                X, y = get_X_y(imputed, regression)

                                val_scores_per_run = eval_dataset(X, y, best_models, scoring)
                    
                                random_states.append(random_state)
                                dataset_numbers.append(dataset_number)
                                regressions.append(regression)
                                
                                noise_modes.append(noise_mode)
                                noise_levels.append(noise_level)

                                dropping_modes.append(dropping_mode)
                                dropping_level.append(dropping_probability)
                                imputation_modes.append(imputation_mode)

                                # The baseline score is repeated on every row of this dataset.
                                for i in range(len(models)):
                                    initial_val_scores[i].append(initial_val_scores_per_run[i])
                                    val_scores[i].append(val_scores_per_run[i])

In [None]:
# Gather the records of the combined noise + missing-values experiment into
# one table (one row per evaluated configuration), save it as CSV and preview it.
results = pd.DataFrame(dict([
    ('Random State', random_states),
    ('Dataset number', dataset_numbers),
    ('Is regression', regressions),

    ('Noise mode', noise_modes),
    ('Noise level', noise_levels),

    ('Drop mode', dropping_modes),
    ('Drop level', dropping_level),
    ('Imputation mode', imputation_modes),

    # Scores on the corrupted and imputed data, one column per model.
    ('Val score 0', val_scores[0]),
    ('Val score 1', val_scores[1]),
    ('Val score 2', val_scores[2]),
    ('Val score 3', val_scores[3]),

    # Scores on the clean data, one column per model.
    ('Initial val score 0', initial_val_scores[0]),
    ('Initial val score 1', initial_val_scores[1]),
    ('Initial val score 2', initial_val_scores[2]),
    ('Initial val score 3', initial_val_scores[3]),
]))

results.to_csv(directory_with_results+"/results_noise_and_drop.csv", index=False)
results.head()