## step1. import all required modules and packages

In [2]:
from sklearn.impute import KNNImputer
import numpy as np
import pandas as pd
import csv
import random
import matplotlib.pyplot as plt
from scipy.spatial import distance
from sklearn.metrics.pairwise import nan_euclidean_distances
# dist(x, y) = sqrt(weight * squared distance over the coordinates present in both x and y), where weight = total number of coordinates / number of present coordinates
# ref : https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.nan_euclidean_distances.html

##configure precision for floating numbers
np.set_printoptions(precision=3)

### define all functions section

In [3]:
def dominate(v1, v2):
    """Tell whether ``v1`` is at least as good as ``v2`` on the first four attributes.

    Attribute 1 is compared with ``<=`` (smaller is better); attributes 0, 2
    and 3 are compared with ``>=`` (larger is better).  Attributes at index 4
    and beyond are never inspected.

    The comparison is non-strict, so two vectors that are equal on the
    inspected attributes dominate each other.  A comparison involving NaN
    evaluates to false, which makes the result ``False``.  An empty ``v1``
    also yields ``False``.

    Returns:
        ``True`` when every inspected attribute of ``v1`` is at least as good
        as the matching attribute of ``v2``; ``False`` otherwise.
    """
    inspected = min(len(v1), 4)
    if inspected == 0:
        # Nothing to compare: the original contract is "not dominating".
        return False

    for axis in range(inspected):
        if axis == 1:
            at_least_as_good = v1[axis] <= v2[axis]
        else:
            at_least_as_good = v1[axis] >= v2[axis]
        if not at_least_as_good:
            # The first losing attribute settles the answer.
            return False

    return True
    

def findskyline(array):
    """Return the indices of the rows of ``array`` that form its skyline.

    Rows are scanned in order and compared against the current skyline with
    the module-level ``dominate(v1, v2)`` predicate:

    * a skyline member dominated by the new row is discarded (when two rows
      dominate each other, the newer row replaces the older one);
    * the new row joins the skyline only if no surviving member dominates it.

    The previous implementation admitted the new row whenever it was
    incomparable with the *last* member it examined, even if an earlier
    member had already dominated it, so dominated rows could leak into the
    result.  It also returned ``[0]`` for an input without rows.

    Args:
        array: two-dimensional array-like exposing ``.shape`` and row
            indexing (``array[i]``).

    Returns:
        A list of row indices in ascending order; empty when ``array`` has
        no rows.
    """
    # Unpacking keeps the original requirement that the input is 2-D.
    rows, _cols = array.shape

    skylineList = []
    for row in range(rows):
        new = array[row]
        survivors = []
        new_is_dominated = False

        for member in skylineList:
            current = array[member]
            if dominate(new, current):
                # The new row is at least as good everywhere: drop the member.
                continue
            survivors.append(member)
            if dominate(current, new):
                new_is_dominated = True

        if not new_is_dominated:
            survivors.append(row)
        skylineList = survivors

    return skylineList

def eval_missing_weight(dataset):
    """Count the missing (NaN) entries of every column and their ratio.

    Both result lists are printed (counts first, then ratios) before being
    returned, matching the behaviour callers of this function already see.

    Args:
        dataset: two-dimensional numeric array-like; NaN marks a missing
            value.  The input is not modified.

    Returns:
        A ``(miss_count, missing_weight_by_each_col)`` tuple of two lists
        with one entry per column: the number of NaN values in the column
        (``int``) and that number divided by the number of rows (``float``).
        For a dataset without rows every ratio is reported as ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    values = np.asarray(dataset, dtype=float)
    # Unpacking keeps the original requirement that the input is 2-D.
    rows, cols = values.shape

    # One vectorized pass replaces the per-cell Python loop; tolist() yields
    # plain Python ints so the printed output keeps its original format.
    miss_count = np.isnan(values).sum(axis=0).tolist()

    if rows == 0:
        missing_weight_by_each_col = [0.0] * cols
    else:
        missing_weight_by_each_col = [count / rows for count in miss_count]

    print(miss_count)
    print(missing_weight_by_each_col)
    return miss_count, missing_weight_by_each_col

## step2. prepare input data with some missing values

In [4]:
# Incomplete data set: six samples with five attributes each;
# np.nan marks a missing value.
data = np.array(
    [
        [np.nan, 5, 3, np.nan, 6],
        [2, np.nan, 6, np.nan, np.nan],
        [8, 9, np.nan, np.nan, np.nan],
        [np.nan, 8, 6, 7, np.nan],
        [5, 6, np.nan, np.nan, np.nan],
        [9, 5, np.nan, 7, np.nan],
    ]
)
data_row, data_col = data.shape

# Per-column missing counts and missing ratios (the helper prints both).
miss_count, miss_weight = eval_missing_weight(data)

# Row indices of the skyline computed directly on the incomplete data.
original_skyline = findskyline(data)
print(original_skyline)

[2, 1, 3, 4, 5]
[0.3333333333333333, 0.16666666666666666, 0.5, 0.6666666666666666, 0.8333333333333334]
[1, 2, 3, 4, 5, 0]


## step3. evaluate the distance_matrix and weight_matrix

In [8]:
# distance_matrix: pairwise NaN-aware Euclidean distance between all samples.
distance_matrix = nan_euclidean_distances(data, data)
dis_row, dis_col = distance_matrix.shape
# How to read special values in distance_matrix:
#   * 0   -> the two samples agree on every attribute that is present in both
#            of them (this includes a sample compared with itself);
#   * NaN -> per the scikit-learn documentation, the two samples share no
#            attribute that is present in both, so no distance is defined.
print(distance_matrix)

# weight_matrix: inverse-distance weights, same size as distance_matrix.
weight_matrix = [[0 for y in range(dis_col)] for x in range(dis_row)]

for row in range(dis_row):
    for col in range(dis_col):
        # The weight stays 0 in three cases:
        #   1. the diagonal (a sample paired with itself);
        #   2. a zero distance, whose inverse is undefined;
        #   3. a NaN distance (no shared attributes) -- previously this
        #      produced a NaN weight instead of the intended 0.
        if (
            row == col
            or distance_matrix[row][col] == 0
            or np.isnan(distance_matrix[row][col])
        ):
            weight_matrix[row][col] = 0
        else:
            # The guard above guarantees a finite, non-zero divisor here.
            weight_matrix[row][col] = 1. / distance_matrix[row][col]

# Trailing expression so the notebook cell displays the matrix.
weight_matrix

[[ 0.     6.708  8.944  6.708  2.236  0.   ]
 [ 6.708  0.    13.416  0.     6.708 15.652]
 [ 8.944 13.416  0.     2.236  6.708  6.519]
 [ 6.708  0.     2.236  0.     4.472  4.743]
 [ 2.236  6.708  6.708  4.472  0.     6.519]
 [ 0.    15.652  6.519  4.743  6.519  0.   ]]


[[0,
  0.14907119849998599,
  0.11180339887498948,
  0.14907119849998599,
  0.4472135954999579,
  0],
 [0.14907119849998599,
  0,
  0.07453559924999299,
  0,
  0.14907119849998599,
  0.06388765649999399],
 [0.11180339887498948,
  0.07453559924999299,
  0,
  0.4472135954999579,
  0.14907119849998599,
  0.15339299776947407],
 [0.14907119849998599,
  0,
  0.4472135954999579,
  0,
  0.22360679774997896,
  0.21081851067789195],
 [0.4472135954999579,
  0.14907119849998599,
  0.14907119849998599,
  0.22360679774997896,
  0,
  0.15339299776947407],
 [0,
  0.06388765649999399,
  0.15339299776947407,
  0.21081851067789195,
  0.15339299776947407,
  0]]

hello world
