# How to check and handle missing values
Sources about imputing missing values:

- http://scikit-learn.org/stable/modules/preprocessing.html#imputation-of-missing-values
- http://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/

In [136]:
import pandas as pd

In [137]:
# Location of the raw CSV file that the notebook analyses.
rawdata = 'C:/Python34/datasets/nurmijarvi_asunnot_250316.csv'

# Read the file into a DataFrame and report its (rows, columns) shape.
df = pd.read_csv(rawdata)
print("raw data dataframe size:", df.shape)

raw data dataframe size: (326, 12)


In [138]:
# Summary statistics; describe() reports the numeric columns only.
numeric_summary = df.describe()
print(numeric_summary)

# Number of rows per 'Kaupunginosa' value. Missing entries are not counted
# by value_counts(), so the counts add up to fewer rows than the frame has.
district_counts = df['Kaupunginosa'].value_counts()
print(district_counts)

         Huoneet          m2             Vh   Neliohinta           Rv
count  326.00000  326.000000     326.000000   326.000000   326.000000
mean     3.07362   92.549693  198117.846626  2241.496933  1991.128834
std      0.94825   42.465746   87314.850608   726.253123    17.939264
min      1.00000   21.000000   21525.000000   439.000000  1901.000000
25%      2.00000   60.000000  145000.000000  1843.250000  1979.000000
50%      3.00000   80.750000  190250.000000  2197.000000  1990.000000
75%      4.00000  117.000000  244750.000000  2657.000000  2005.750000
max      4.00000  238.000000  495000.000000  5429.000000  2016.000000
Klaukkala     158
Kirkonkylä     78
Rajamäki       42
Röykkä          8
Lepsämä         6
Nurmijärvi      6
Perttula        6
Palojoki        2
Mäntysalo       1
Alppila         1
Raala           1
Name: Kaupunginosa, dtype: int64


Check which columns have missing values

In [139]:
# Count the missing entries in every column.
# isnull() marks each missing cell with True and the column-wise sum() counts
# them; this is vectorised, unlike applying the Python builtin sum() to each
# column, which iterates over the cells one by one.
missValues = df.isnull().sum()
print('Missing values in columns:\n', missValues)

Missing values in columns:
 Kaupunginosa     17
Huoneet           0
Huoneisto         0
Talotiedot        0
m2                0
Vh                0
Neliohinta        0
Rv                0
Krs             138
Hissi             0
Kunto             0
Unnamed: 11     321
dtype: int64


## Replace by a known value

In [140]:
# Replace the missing 'Kaupunginosa' entries with a fixed, known value.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df['Kaupunginosa'] selection: that chained
# in-place form triggers a FutureWarning in pandas 2.x and no longer updates
# df once Copy-on-Write is active.
# NOTE(review): the data already contains the value 'Nurmijärvi' (with 'ä');
# filling with the ASCII spelling 'Nurmijarvi' creates a separate category.
# Confirm that this is intended.
df['Kaupunginosa'] = df['Kaupunginosa'].fillna('Nurmijarvi')

In [141]:
# Re-count the missing entries per column to verify the fill.
# df.isnull().sum() is the vectorised equivalent of applying the builtin
# sum() to each column's null mask and yields the same per-column counts.
missValues = df.isnull().sum()
print('Missing values in columns:\n', missValues)

Missing values in columns:
 Kaupunginosa      0
Huoneet           0
Huoneisto         0
Talotiedot        0
m2                0
Vh                0
Neliohinta        0
Rv                0
Krs             138
Hissi             0
Kunto             0
Unnamed: 11     321
dtype: int64


## Drop the lines with missing values

In [142]:
# Re-read the CSV so that df again holds the raw data (this discards the
# frame modified in the cells above).
rawdata = 'C:/Python34/datasets/nurmijarvi_asunnot_250316.csv'
df = pd.read_csv(rawdata)
print("raw data dataframe size:", df.shape)

# dropna() with default arguments keeps only the rows that have no missing
# value in any column. Because 'Unnamed: 11' is missing in 321 of the 326
# rows, only a handful of rows survive.
df_drop = df.dropna()
print(df_drop.describe())
print("dataframe size when lines with missing values dropped:", df_drop.shape)

raw data dataframe size: (326, 12)
        Huoneet         m2             Vh   Neliohinta           Rv
count  5.000000   5.000000       5.000000     5.000000     5.000000
mean   2.600000  68.700000  105573.200000  1610.800000  1978.000000
std    0.894427  16.407315   27724.200461   512.872499     8.689074
min    1.000000  40.000000   82500.000000  1051.000000  1969.000000
25%    3.000000  70.000000   89000.000000  1141.000000  1973.000000
50%    3.000000  77.000000   90000.000000  1677.000000  1974.000000
75%    3.000000  78.000000  117366.000000  1935.000000  1984.000000
max    3.000000  78.500000  149000.000000  2250.000000  1990.000000
dataframe size when lines with missing values dropped: (5, 12)


## Use a mean value for numerical data
http://scikit-learn.org/stable/modules/preprocessing.html#imputation-of-missing-values


In [143]:
import numpy as np
from sklearn.preprocessing import Imputer

The `strategy` parameter of `Imputer` can be `'mean'`, `'median'` or `'most_frequent'`.

In [144]:
# Missing values are marked with NaN.
# NOTE: sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and
# removed in 0.22; newer versions provide sklearn.impute.SimpleImputer instead.
imp = Imputer(
    missing_values='NaN',
    strategy='mean',  # Mean of the column
    axis=0,
)

# Small 3x2 example with one missing value in each column.
X = [
    [np.nan, 2],
    [6, np.nan],
    [7, 6],
]
print(X)

[[nan, 2], [6, nan], [7, 6]]


In [145]:
# Two-step usage: fit() learns the per-column statistic from X, then
# transform() returns a copy of X with the missing cells filled in.
imp.fit(X)
X_imputed = imp.transform(X)
print(X_imputed)

[[ 6.5  2. ]
 [ 6.   4. ]
 [ 7.   6. ]]


In [146]:
# Same example in a single call: fit_transform() fits the imputer on X and
# returns the filled array in one step.
imp = Imputer(strategy='mean', missing_values='NaN', axis=0)  # Mean of the column
X = [
    [np.nan, 2],
    [6, np.nan],
    [7, 6],
]
print(imp.fit_transform(X))

[[ 6.5  2. ]
 [ 6.   4. ]
 [ 7.   6. ]]


In [147]:
# Here the value 0 (instead of NaN) is treated as the missing-value marker.
imp = Imputer(
    missing_values=0,
    strategy='mean',  # Mean of the column
    axis=0,
)

In [148]:
# Same 3x2 example as before, with 0 standing in for the missing cells.
# A plain ndarray is used instead of np.matrix: NumPy no longer recommends
# the matrix class, and the printed values are identical.
Y = np.array([[0, 2], [6, 0], [7, 6]])
print(Y)

# Fit on Y, then replace every 0 with the mean of the remaining values in
# its column.
imp.fit(Y)
print(imp.transform(Y))

[[0 2]
 [6 0]
 [7 6]]
[[ 6.5  2. ]
 [ 6.   4. ]
 [ 7.   6. ]]


Off-topic

In [149]:
# Off-topic: testing a NumPy array filled with NaN.
# np.full creates the 3x3 float array already filled with NaN, so no
# uninitialised np.empty buffer is needed. np.nan is used because the np.NAN
# alias was removed in NumPy 2.0 and the rest of the notebook spells it np.nan.
Y = np.full((3, 3), np.nan)
Y[0, 1] = 2
print(Y)


[[ nan   2.  nan]
 [ nan  nan  nan]
 [ nan  nan  nan]]


In [150]:
# Small mixed-type example: three complete rows (one string and two integers
# each) followed by a row in which every value is missing.
missing_row = [np.nan] * 3
data = [['a', 1, 2], ['b', 1, 1], ['b', 2, 2], missing_row]
data

[['a', 1, 2], ['b', 1, 1], ['b', 2, 2], [nan, nan, nan]]