In [1]:
import numpy as np
# Print arrays with two decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=2)

In [2]:
# Sample data shared by the normalisation examples below.
x = np.array([
    5, 27, 100, 59, 28,
    48, 50, 39, 9, 7,
    20, 63, 10, 41, 9,
])

min-max normalisation:
$$
\hat{x} = \frac{x - x_\text{min}}{x_\text{max} - x_\text{min}}
$$

In [3]:
# Rescale linearly so the smallest value maps to 0 and the largest to 1.
(x - x.min()) / (x.max() - x.min())

array([0.  , 0.23, 1.  , 0.57, 0.24, 0.45, 0.47, 0.36, 0.04, 0.02, 0.16,
       0.61, 0.05, 0.38, 0.04])

z-score normalisation:
$$
\hat{x} = \frac{x - \overline{x}}{\sigma}
$$

In [4]:
# Centre on the mean and scale by the standard deviation (NumPy's default, ddof=0).
(x - x.mean()) / x.std()

array([-1.13, -0.28,  2.54,  0.95, -0.25,  0.53,  0.61,  0.18, -0.98,
       -1.06, -0.55,  1.11, -0.94,  0.26, -0.98])

robust normalisation, where $\operatorname{med}(x)$ denotes the median of the vector $x$:
$$
\hat{x} = \frac{x - \operatorname{med}(x)}{\operatorname{med}(|x - \operatorname{med}(x)|)}
$$

In [5]:
f = np.median  # short alias so the code mirrors the median in the formula
x_centered = x - f(x)  # deviation of every value from the median
x_centered / f(np.abs(x_centered))  # scale by the median absolute deviation

array([-1.21, -0.05,  3.79,  1.63,  0.  ,  1.05,  1.16,  0.58, -1.  ,
       -1.11, -0.42,  1.84, -0.95,  0.68, -1.  ])

In [6]:
import pandas as pd

In [7]:
# Use a forward slash: a backslash before 'P' is an invalid string escape
# (a SyntaxWarning on recent Python versions), and '/' works as a path
# separator on Windows, Linux and macOS alike.
df = pd.read_csv('../Poll_Result_raw.csv')

In [8]:
# Preview the loaded frame.
df

Unnamed: 0,SRCID,Q1,Q1_Ordinal,Q2,DUM1,DUM2,Q5Average,Q6,Q7Average,Q8,...,Q18,Q20,Q22,Q22_Ordinal,Q24,p_gender_sdc,p_age_group_sdc,p_education_sdc,p_state_sdc,Random
0,3,4,"""4""","""8""","""1""","""2""",3,"""5""",2,"""9""",...,8,7,1,"""1""","""0""","""1""","""1""","""2""","""2""",
1,6,1,"""1""","""11""","""1""","""2""",2,"""1""",1,"""1""",...,6,6,1,"""1""","""0""","""2""","""4""","""1""","""1""",
2,10,4,"""4""","""12""","""2""","""1""",2,"""2""",1,"""3""",...,5,7,1,"""1""","""0""","""2""","""4""","""2""","""5""",0.0
3,14,4,"""4""","""20""","""1""","""1""",3,"""4""",1,"""6""",...,5,6,1,"""1""","""0""","""2""","""4""","""3""","""2""",
4,15,4,"""4""","""17""","""1""","""2""",3,"""8""",2,"""1""",...,7,6,1,"""1""","""0""","""2""","""4""","""2""","""1""",0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2049,403,4,"""4""","""19""","""1""","""3""",2,"""4""",1,"""9""",...,8,9,3,"""3""","""1""","""1""","""4""","""1""","""5""",
2050,840,2,"""2""","""34""","""2""","""2""",2,"""4""",2,"""2""",...,2,10,3,"""3""","""1""","""1""","""4""","""1""","""3""",
2051,946,2,"""2""","""37""","""1""","""3""",2,"""10""",1,"""3""",...,7,3,3,"""3""","""1""","""2""","""3""","""1""","""2""",
2052,1227,2,"""2""","""34""","""1""","""3""",2,"""12""",2,"""5""",...,6,5,3,"""3""","""1""","""1""","""3""","""1""","""1""",1.0


In [9]:
df.isna().sum()  # number of missing values in each column

SRCID                0
Q1                   0
Q1_Ordinal           0
Q2                   0
DUM1                 0
DUM2                 0
Q5Average            0
Q6                   0
Q7Average            0
Q8                   0
Q9Average            0
Q10                  0
Q11Average           0
Q12                  0
Q12_Ordinal          0
Q18                  0
Q20                  0
Q22                  0
Q22_Ordinal          0
Q24                  0
p_gender_sdc         0
p_age_group_sdc      0
p_education_sdc      0
p_state_sdc          0
Random             462
dtype: int64

In [10]:
# Median of the observed values; missing entries are skipped.
median_val = df['Random'].median(skipna=True)

In [11]:
# Display the median that is used as the fill value for the missing entries.
median_val

421.54770640000004

In [12]:
# fillna returns a new Series; nothing is assigned here, so df itself is left as it was.
df['Random'].fillna(value=median_val)

0       421.547706
1       421.547706
2         0.000000
3       421.547706
4         0.000000
           ...    
2049    421.547706
2050    421.547706
2051    421.547706
2052      1.000000
2053    421.547706
Name: Random, Length: 2054, dtype: float64

In [13]:
df.isna().sum()  # unchanged: fillna returned a copy and did not overwrite the original data

SRCID                0
Q1                   0
Q1_Ordinal           0
Q2                   0
DUM1                 0
DUM2                 0
Q5Average            0
Q6                   0
Q7Average            0
Q8                   0
Q9Average            0
Q10                  0
Q11Average           0
Q12                  0
Q12_Ordinal          0
Q18                  0
Q20                  0
Q22                  0
Q22_Ordinal          0
Q24                  0
p_gender_sdc         0
p_age_group_sdc      0
p_education_sdc      0
p_state_sdc          0
Random             462
dtype: int64

In [14]:
# Keep the filled copy of the column so it can be written back into the frame.
median_rand = df['Random'].fillna(value=median_val)

In [15]:
# Write the filled column back into the frame so the imputation takes effect.
df['Random'] = median_rand

In [16]:
df.isna().sum()  # re-check: the Random column no longer has missing values

SRCID              0
Q1                 0
Q1_Ordinal         0
Q2                 0
DUM1               0
DUM2               0
Q5Average          0
Q6                 0
Q7Average          0
Q8                 0
Q9Average          0
Q10                0
Q11Average         0
Q12                0
Q12_Ordinal        0
Q18                0
Q20                0
Q22                0
Q22_Ordinal        0
Q24                0
p_gender_sdc       0
p_age_group_sdc    0
p_education_sdc    0
p_state_sdc        0
Random             0
dtype: int64