In [None]:
# Data processing tools: pandas and numpy
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Visualization
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

from matplotlib.patches import Circle, RegularPolygon
from matplotlib.path import Path
from matplotlib.projections.polar import PolarAxes
from matplotlib.projections import register_projection
from matplotlib.spines import Spine
from matplotlib.transforms import Affine2D

# Others
import os
import time
import warnings
warnings.filterwarnings('ignore')

# Parameters description
#### Noise mode
- 0 - no noise
- 1 - AWGN noise
- 2 - replacements

#### Dropping mode
- 0 - no dropping
- 1 - MCAR
- 2 - MAR
- 3 - NMAR

#### Imputation mode
- 0 - dropping
- 1 - filling with 0
- 2 - filling with mean
- 3 - filling with median
- 4 - filling by MICE
- 5 - filling by kNN

In [None]:
# Folder the experiment CSV files are read from, and folder the figures are
# written to (both relative to the notebook's working directory).
directory_with_results = "Results"
directory_with_graphs = "Graphs"

# Create the output directory. exist_ok=True makes the cell safe to re-run and
# avoids the check-then-create race of `if not exists: mkdir`; makedirs also
# creates missing parent directories if the path is ever made nested.
os.makedirs(directory_with_graphs, exist_ok=True)

In [None]:
def get_model_name(model_number):
    """Translate a model index into the short name used in plot legends.

    Indices 0..3 map to "Linear", "DT", "RF" and "GB" respectively; any other
    value yields None.
    """
    short_names = ("Linear", "DT", "RF", "GB")
    for index, short_name in enumerate(short_names):
        if model_number == index:
            return short_name
    return None
    
def get_imputation_name(imputation_mode):
    """Return the legend label for an imputation mode code.

    Codes follow the "Imputation mode" list in the parameters description:
    0 - dropping, 1 - filling with 0, 2 - filling with mean,
    3 - filling with median, 4 - filling by MICE, 5 - filling by kNN.

    Mode 0 used to fall through and return None, which left that series
    without a legend entry; it is now labelled "Dropping".

    Returns None for any code outside 0..5.
    """
    labels = (
        "Dropping",
        "Filling with 0",
        "Filling with mean",
        "Filling with median",
        "Filling by MICE",
        "Filling by KNN",
    )
    # Compare with == (instead of a dict lookup) so that any value the callers
    # pass, including NumPy scalars read from a CSV, is matched as before.
    for code, label in enumerate(labels):
        if imputation_mode == code:
            return label
    return None
    
def get_drop_name(drop_mode):
    """Translate a dropping-mode code into its abbreviation.

    1 -> "MCAR", 2 -> "MAR", 3 -> "NMAR"; every other value yields None.
    """
    abbreviations = ("MCAR", "MAR", "NMAR")
    for code, abbreviation in enumerate(abbreviations, start=1):
        if drop_mode == code:
            return abbreviation
    return None

In [None]:
def diag_radar(num_vars, frame = 'circle'):
    """Register a 'radar' matplotlib projection with ``num_vars`` axes.

    The projection class is built as a closure over ``num_vars`` and ``frame``
    and registered under the name 'radar', so it can be requested with
    ``projection='radar'`` after this call.

    Parameters
    ----------
    num_vars : int
        Number of radar axes (spokes).
    frame : {'circle', 'polygon'}
        Shape of the frame drawn around the axes. Any other value makes the
        axes construction raise ValueError.

    Returns
    -------
    numpy.ndarray
        ``num_vars`` evenly spaced angles in radians, starting at 0 and
        excluding 2*pi.
    """
    # calculate evenly-spaced axis angles
    theta = np.linspace(0, 2*np.pi, num_vars, endpoint = False)
    
    class RadarTransform(PolarAxes.PolarTransform):
        """Polar transform that re-interpolates multi-step paths to ``num_vars`` steps."""

        def transform_path_non_affine(self, path):
            # Paths with more than one interpolation step are re-sampled with
            # num_vars steps before being transformed (draw() below sets this
            # on the radial gridlines for the polygon frame).
            if path._interpolation_steps > 1:
                path = path.interpolated(num_vars)
            return Path(self.transform(path.vertices), path.codes)

    class AxesRadar(PolarAxes):
        """Polar axes specialised for radar charts."""

        name = 'radar'
        
        PolarTransform = RadarTransform

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # rotate plot such that the first axis is at the top
            self.set_theta_zero_location('N')

        def fill(self, *args, closed = True, **kwargs):
            """Same as the parent ``fill`` but with ``closed`` defaulting to True."""
            return super().fill(closed = closed, *args, **kwargs)

        def plot(self, *args, **kwargs):
            """Plot like the parent class, then close every resulting line.

            Returns the list of lines produced by the parent ``plot`` so that
            callers can keep using the usual ``lines = ax.plot(...)`` idiom
            (previously the list was discarded and None was returned).
            """
            lines = super().plot(*args, **kwargs)
            for line in lines:
                self._close_line(line)
            return lines

        def _close_line(self, line):
            """Append the first point to the end of ``line`` unless it is already closed."""
            x, y = line.get_data()
            if x[0] != x[-1]:
                x = np.concatenate((x, [x[0]]))
                y = np.concatenate((y, [y[0]]))
                line.set_data(x, y)

        def set_varlabels(self, labels):
            """Label the spokes with ``labels`` (one label per axis angle)."""
            self.set_thetagrids(np.degrees(theta), labels)

        def _gen_axes_patch(self):
            # The Axes patch must be centered at (0.5, 0.5) and of radius 0.5
            # in axes coordinates.
            if frame == 'circle':
                return Circle((0.5, 0.5), 0.5)
            elif frame == 'polygon':
                return RegularPolygon((0.5, 0.5), num_vars,
                                      radius=.5, edgecolor="k")
            else:
                # Same wording as in _gen_axes_spines (the message here used to
                # be misspelled).
                raise ValueError("unknown value for 'frame': %s" % frame)

        def draw(self, renderer):
            """ Draw. If frame is polygon, make gridlines polygon-shaped """
            if frame == 'polygon':
                gridlines = self.yaxis.get_gridlines()
                for gl in gridlines:
                    gl.get_path()._interpolation_steps = num_vars
            super().draw(renderer)


        def _gen_axes_spines(self):
            if frame == 'circle':
                return super()._gen_axes_spines()
            elif frame == 'polygon':
                # spine_type must be 'left'/'right'/'top'/'bottom'/'circle'.
                spine = Spine(axes = self,
                              spine_type = 'circle',
                              path = Path.unit_regular_polygon(num_vars))
  
                # Scale the unit polygon by 0.5 and shift it by (0.5, 0.5) so
                # that it lines up with the patch from _gen_axes_patch.
                spine.set_transform(Affine2D().scale(.5).translate(.5, .5)
                                    + self.transAxes)
                return {'polar': spine}
            else:
                raise ValueError("unknown value for 'frame': %s" % frame)

    register_projection(AxesRadar)
    return theta

def make_diagram(inputs, titles, fig_name):
    """Draw a radar chart with one filled outline per entry of ``inputs`` and save it.

    Parameters
    ----------
    inputs : dict
        Maps a legend label to a sequence of numeric values, one value per
        radar axis, in the same order as ``titles``.
    titles : sequence of str
        Labels of the radar axes; their count defines the number of axes.
    fig_name : str
        Path of the image file to write.

    Raises
    ------
    ValueError
        If ``inputs`` is empty (there is nothing to derive the radial grid from).
    """
    theta = diag_radar(len(titles), 'polygon')

    # Radial grid: five rounded levels spanning the overall value range.
    # nanmin/nanmax are used because a series may contain NaN (e.g. the mean
    # of an empty selection); the builtin min/max give order-dependent results
    # in that case.
    all_values = np.concatenate(
        [np.asarray(values, dtype = float).ravel() for values in inputs.values()]
    )
    radial_levels = np.round(
        np.linspace(np.nanmin(all_values), np.nanmax(all_values), 5), 3
    )

    fig, ax = plt.subplots(figsize = (7, 7), subplot_kw = dict(projection = 'radar'))
    try:
        fig.subplots_adjust(top = 0.9, bottom = 0.1)

        ax.set_rgrids(radial_levels)
        # The axis labels do not depend on the series, so set them once.
        ax.set_varlabels(titles)

        for label, values in inputs.items():
            ax.plot(theta, values, label = label)
            ax.fill(theta, values, alpha = 0.2)

        ax.legend(loc = 'lower right')
        fig.savefig(fig_name, dpi = 200)
    finally:
        # Close (not just clear) the figure: this function is called in loops
        # and every call creates a new figure that would otherwise stay open.
        plt.close(fig)

# Experiments with noise only

In [None]:
# Scores collected in the noise-only experiments.
results = pd.read_csv(directory_with_results+"/results_noise_only.csv")

# One figure per (dataset, task type, noise mode); every figure holds one
# error-bar curve per model, plotted against the noise level.
for dataset_number in results['Dataset number'].unique():
    for regression in results['Is regression'].unique():
        for noise_mode in results['Noise mode'].unique():
            selected_rows = (
                (results['Dataset number'] == dataset_number)
                & (results['Is regression'] == regression)
                & (results['Noise mode'] == noise_mode)
            )
            df = results[selected_rows]

            # Nothing to draw for a combination that has no rows.
            if len(df) == 0:
                continue

            for model_number in range(4):
                distorted_score = df["Val score "+str(model_number)]
                initial_score = df["Initial val score "+str(model_number)]

                # The direction of the ratio matches the y-axis label chosen below.
                if regression:
                    ratio = initial_score / distorted_score
                else:
                    ratio = distorted_score / initial_score

                # Mean and standard deviation of the ratio for each noise level.
                per_level = (
                    pd.DataFrame({"Noise level": df['Noise level'], "y": ratio})
                    .groupby("Noise level")["y"]
                    .agg(['mean', 'std'])
                )

                plt.errorbar(
                    list(per_level.index),
                    list(per_level['mean']),
                    yerr=list(per_level['std']),
                    label=get_model_name(model_number),
                )

            plt.ylabel(
                "MAPE on initial / MAPE on distorted" if regression
                else "F1 on distorted / F1 on initial"
            )

            if noise_mode == 1:
                # Noise mode 1 is plotted against SNR with a reversed x axis.
                name, x_label = "_SNR", "SNR, dB"
                plt.gca().invert_xaxis()
            else:
                name, x_label = "_p", "Changing probability"
            plt.xlabel(x_label)

            plt.legend(loc='best')
            plt.grid()

            fig_name = f"noise_reg_{int(regression)}_dataset_{dataset_number}{name}.png"

            plt.savefig(directory_with_graphs+"/"+fig_name, transparent=True)
            # Reset the shared figure before the next combination is drawn.
            plt.clf()

# Experiments with missing values only

In [None]:
# Scores collected in the missing-values-only experiments.
results = pd.read_csv(directory_with_results+"/results_drop_only.csv")

# One figure per (dataset, task type, dropping mode, model); every figure holds
# one error-bar curve per imputation mode, plotted against the drop level.
for dataset_number in results['Dataset number'].unique():
    for regression in results['Is regression'].unique():
        for dropping_mode in results['Drop mode'].unique():
            # The row selection does not depend on the model, so do it once
            # here instead of copying and filtering the whole table again for
            # each of the four models.
            df = results[(results['Dataset number'] == dataset_number) &
                         (results['Is regression'] == regression) &
                         (results['Drop mode'] == dropping_mode)]

            # Nothing to draw for a combination that has no rows.
            if df.shape[0] == 0:
                continue

            for model_number in range(4):
                val_score_name = "Val score "+str(model_number)
                initial_val_score_name = "Initial val score "+str(model_number)

                for imputation_mode in df['Imputation mode'].unique():
                    label = get_imputation_name(imputation_mode)

                    sub_df = df[df['Imputation mode'] == imputation_mode]

                    # The direction of the ratio matches the y-axis label below.
                    if regression:
                        ratio = sub_df[initial_val_score_name] / sub_df[val_score_name]
                    else:
                        ratio = sub_df[val_score_name] / sub_df[initial_val_score_name]

                    # Mean and standard deviation of the ratio per drop level.
                    df_y = (
                        pd.DataFrame({"Drop level": sub_df['Drop level'], "y": ratio})
                        .groupby("Drop level")["y"]
                        .agg(['mean', 'std'])
                    )

                    plt.errorbar(list(df_y.index), list(df_y['mean']),
                                 yerr=list(df_y['std']), label=label)

                if regression:
                    plt.ylabel("MAPE on initial / MAPE on distorted")
                else:
                    plt.ylabel("F1 on distorted / F1 on initial")

                plt.xlabel("Dropping probability")

                plt.legend(loc='best')
                plt.grid()

                fig_name = 'drop4model_reg_' + str(int(regression)) + \
                        '_dataset_' + str(dataset_number) + \
                        '_drop_'+str(dropping_mode)+'_model_'+str(model_number)+\
                        '.png'

                plt.savefig(directory_with_graphs+"/"+fig_name, transparent=True)
                # Reset the shared figure before the next one is drawn.
                plt.clf()

In [None]:
# Scores collected in the missing-values-only experiments.
results = pd.read_csv(directory_with_results+"/results_drop_only.csv")

# Radar axes: every (drop mode, drop level) pair. The grid is defined once so
# that the axis labels and the plotted values cannot get out of sync.
drop_settings = [(drop_mode, drop_level)
                 for drop_mode in [1, 2, 3]
                 for drop_level in [0.1, 0.2, 0.3]]
titles = [get_drop_name(drop_mode)+f'({drop_level})'
          for drop_mode, drop_level in drop_settings]

# One radar diagram per (dataset, task type); one outline per imputation mode.
for dataset_number in results['Dataset number'].unique():
    for regression in results['Is regression'].unique():
        # Copy only the selected rows: a column is added below, and it must go
        # into an independent frame rather than into a slice of `results`.
        df = results[(results['Dataset number'] == dataset_number) &
                     (results['Is regression'] == regression)].copy()

        if df.shape[0] == 0:
            continue

        # Per-model score ratio, oriented as in the other plots of this
        # notebook: initial / distorted for regression, distorted / initial
        # otherwise. The ratios are kept in their own frame so the median is
        # taken over exactly these four columns.
        ratios = pd.DataFrame(index=df.index)
        for model_number in range(4):
            initial_score = df["Initial val score "+str(model_number)]
            distorted_score = df["Val score "+str(model_number)]
            if regression:
                ratios[model_number] = initial_score / distorted_score
            else:
                ratios[model_number] = distorted_score / initial_score

        df["Median metrics"] = ratios.median(axis=1)

        inputs = {}
        for imputation_mode in df['Imputation mode'].unique():
            sub_df = df[df['Imputation mode'] == imputation_mode]

            # Mean of the per-row median ratio for every radar axis. The drop
            # level is a float read from CSV, so it is matched with a
            # tolerance instead of ==. A missing combination yields NaN.
            inputs[get_imputation_name(imputation_mode)] = [
                sub_df.loc[(sub_df["Drop mode"] == drop_mode) &
                           np.isclose(sub_df["Drop level"], drop_level),
                           "Median metrics"].mean()
                for drop_mode, drop_level in drop_settings
            ]

        fig_name = 'drop_diagram_reg_' + str(int(regression)) + \
                '_dataset_' + str(dataset_number)+'.png'

        make_diagram(inputs, titles, directory_with_graphs+"/"+fig_name)

# Experiments with missing values and noise

In [None]:
# Scores collected in the experiments with both noise and missing values.
results = pd.read_csv(directory_with_results+"/results_noise_and_drop.csv")

# Radar axes: every (drop mode, drop level) pair. The grid is defined once so
# that the axis labels and the plotted values cannot get out of sync.
drop_settings = [(drop_mode, drop_level)
                 for drop_mode in [1, 2, 3]
                 for drop_level in [0.1, 0.2, 0.3]]
titles = [get_drop_name(drop_mode)+f'({drop_level})'
          for drop_mode, drop_level in drop_settings]

# One radar diagram per (dataset, task type, noise mode, noise level);
# one outline per imputation mode.
for dataset_number in results['Dataset number'].unique():
    for regression in results['Is regression'].unique():
        for noise_mode in results['Noise mode'].unique():
            for noise_level in results['Noise level'].unique():
                # Copy only the selected rows: a column is added below, and it
                # must go into an independent frame rather than into a slice
                # of `results`.
                df = results[(results['Dataset number'] == dataset_number) &
                             (results['Is regression'] == regression) &
                             (results['Noise mode'] == noise_mode) &
                             (results['Noise level'] == noise_level)].copy()

                # Not every noise level occurs with every noise mode.
                if df.shape[0] == 0:
                    continue

                # Per-model score ratio, oriented as in the other plots of
                # this notebook: initial / distorted for regression,
                # distorted / initial otherwise. The ratios are kept in their
                # own frame so the median is taken over exactly these four
                # columns.
                ratios = pd.DataFrame(index=df.index)
                for model_number in range(4):
                    initial_score = df["Initial val score "+str(model_number)]
                    distorted_score = df["Val score "+str(model_number)]
                    if regression:
                        ratios[model_number] = initial_score / distorted_score
                    else:
                        ratios[model_number] = distorted_score / initial_score

                df["Median metrics"] = ratios.median(axis=1)

                inputs = {}
                for imputation_mode in df['Imputation mode'].unique():
                    sub_df = df[df['Imputation mode'] == imputation_mode]

                    # Mean of the per-row median ratio for every radar axis.
                    # The drop level is a float read from CSV, so it is
                    # matched with a tolerance instead of ==. A missing
                    # combination yields NaN.
                    inputs[get_imputation_name(imputation_mode)] = [
                        sub_df.loc[(sub_df["Drop mode"] == drop_mode) &
                                   np.isclose(sub_df["Drop level"], drop_level),
                                   "Median metrics"].mean()
                        for drop_mode, drop_level in drop_settings
                    ]

                fig_name = 'noisy_drop_diagram_reg_' + str(int(regression)) + \
                        '_dataset_' + str(dataset_number)+ \
                        '_noise_' + str(noise_mode) + '_' + str(noise_level) + '.png'

                make_diagram(inputs, titles, directory_with_graphs+"/"+fig_name)