# House prices 3: Filling missing data

In [117]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Suppress all warnings for the rest of the session. NOTE: this also hides
# deprecation/behaviour-change warnings emitted by pandas.
warnings.filterwarnings("ignore")
# Apply seaborn's default plotting theme.
sns.set()

In [118]:
# Load the two input files; `dataframes` keeps references to both frames so
# later cells can loop over them.
train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))
dataframes = [train, test]

# Row-wise concatenation of both frames, split into its numeric and
# non-numeric columns. These are snapshots: they are only rebuilt when
# updateJoinedData() is called.
all_data = pd.concat(dataframes)
numerical, categorical = (
    all_data.select_dtypes(include=np.number),
    all_data.select_dtypes(exclude=np.number),
)

def writeCSV(number):
    """Write the module-level ``train`` and ``test`` frames to CSV files.

    The files are named ``train<number>.csv`` and ``test<number>.csv`` and
    are created in the current working directory. ``to_csv`` is called with
    its defaults, so the frame index is written as the first column.

    Args:
        number: value appended (via ``str``) to the base file names.
    """
    suffix = str(number) + ".csv"
    train.to_csv(f"train{suffix}")
    test.to_csv(f"test{suffix}")

def updateJoinedData():
    """Rebuild the module-level joined frame and its two column views.

    Re-concatenates everything in the module-level ``dataframes`` list into
    ``all_data`` and recomputes ``numerical`` (numeric columns) and
    ``categorical`` (all remaining columns) from it.
    """
    global all_data, numerical, categorical
    combined = pd.concat(dataframes)
    all_data = combined
    numerical = combined.select_dtypes(include=np.number)
    categorical = combined.select_dtypes(exclude=np.number)

In [119]:
#DISPLAY METHODS
def showNull(data):
    """Print every column of ``data`` that has missing values, with its count.

    A final line reports the total number of columns that were inspected.
    """
    missing_counts = data.isnull().sum()
    for position, missing in enumerate(missing_counts):
        if missing > 0:
            print(data.columns[position], "   ", missing)
    print("\n", "(Columns: ", str(len(missing_counts)) + ")")
    
def showMean(data):
    """Print the name and mean of every numeric column in ``data``."""
    numeric_names = data.select_dtypes(include=np.number).columns
    for name in numeric_names:
        print(name, data[name].mean())

def showMedian(data):
    """Print the name and median of every numeric column in ``data``."""
    numeric_names = data.select_dtypes(include=np.number).columns
    for name in numeric_names:
        print(name, data[name].median())
    
def showMode(data):
    """Print value counts and the mode(s) of every non-numeric column.

    For each such column three things are printed, separated by blank lines:
    the value counts, the mode as a Series, and the mode as a raw array.
    """
    non_numeric_names = data.select_dtypes(exclude=np.number).columns
    for name in non_numeric_names:
        values = data[name]
        most_frequent = values.mode()
        print(values.value_counts())
        print("")
        print(most_frequent)
        print("")
        print(most_frequent.values)
        print("\n")
        
#FILLING METHODS (the display helpers above are repeated here before the fill helpers)
def showNull(data):
    """Print the columns of ``data`` that contain missing values.

    Each reported column is printed with its number of missing entries; the
    last line gives the total number of columns in ``data``.
    """
    counts = data.isnull().sum()
    # Collect (position, count) pairs first, then report them in column order.
    flagged = [(pos, count) for pos, count in enumerate(counts) if count > 0]
    for pos, count in flagged:
        print(data.columns[pos], "   ", count)
    print("\n", "(Columns: ", str(len(counts)) + ")")

def showMean(data):
    """Print one line per numeric column: the column name and its mean."""
    numeric_part = data.select_dtypes(include=np.number)
    for label in list(numeric_part.columns):
        average = data[label].mean()
        print(label, average)

def showMedian(data):
    """Print one line per numeric column: the column name and its median."""
    numeric_part = data.select_dtypes(include=np.number)
    for label in list(numeric_part.columns):
        middle = data[label].median()
        print(label, middle)

def showMode(data):
    """Print mode information for each non-numeric column of ``data``.

    Per column, the output is: value counts, blank line, the mode Series,
    blank line, the mode values array, and two trailing newlines.
    """
    for label in data.select_dtypes(exclude=np.number).columns:
        series = data[label]
        report = (series.value_counts(), "", series.mode(), "", series.mode().values, "\n")
        for piece in report:
            print(piece)

def fillAsMean(data, column):
    """Fill the missing values of one column with that column's mean.

    ``data`` is modified in place; nothing is returned.

    Args:
        data: DataFrame to modify.
        column: label of the column to fill.
    """
    column_mean = data[column].mean()
    # Assign the filled column back instead of calling
    # data[column].fillna(..., inplace=True): that chained form acts on a
    # temporary object under pandas Copy-on-Write and leaves `data` unfilled.
    data[column] = data[column].fillna(column_mean)

def numericalAsMean(data, include=None, exclude=None):
    """Fill missing values of numeric columns with each column's own mean.

    ``data`` is modified in place; nothing is returned.

    Args:
        data: DataFrame to modify.
        include: optional collection of column labels to process. When it is
            omitted or empty, every numeric column of ``data`` is processed.
        exclude: optional collection of column labels to leave untouched.
    """
    if include:
        columns = include
    else:
        columns = data.select_dtypes(include=np.number).columns
    skipped = () if exclude is None else exclude
    for column in columns:
        if column in skipped:
            continue
        # Assign back rather than using a chained inplace fillna, which does
        # not reach `data` under pandas Copy-on-Write.
        data[column] = data[column].fillna(data[column].mean())

def fillAsMedian(data, column):
    """Fill the missing values of one column with that column's median.

    ``data`` is modified in place; nothing is returned.

    Args:
        data: DataFrame to modify.
        column: label of the column to fill.
    """
    column_median = data[column].median()
    # Assign the filled column back instead of calling
    # data[column].fillna(..., inplace=True): that chained form acts on a
    # temporary object under pandas Copy-on-Write and leaves `data` unfilled.
    data[column] = data[column].fillna(column_median)

def numericalAsMedian(data, include=None, exclude=None):
    """Fill missing values of numeric columns with each column's own median.

    ``data`` is modified in place; nothing is returned.

    Args:
        data: DataFrame to modify.
        include: optional collection of column labels to process. When it is
            omitted or empty, every numeric column of ``data`` is processed.
        exclude: optional collection of column labels to leave untouched.
    """
    if include:
        columns = include
    else:
        columns = data.select_dtypes(include=np.number).columns
    skipped = () if exclude is None else exclude
    for column in columns:
        if column in skipped:
            continue
        # Assign back rather than using a chained inplace fillna, which does
        # not reach `data` under pandas Copy-on-Write.
        data[column] = data[column].fillna(data[column].median())

def fillAsMode(data, column):
    """Fill the missing values of one column with that column's mode.

    When several values tie for most frequent, the first entry returned by
    ``Series.mode()`` is used. A column with no non-missing values has no
    mode and is left unchanged. ``data`` is modified in place; nothing is
    returned.

    Args:
        data: DataFrame to modify.
        column: label of the column to fill.
    """
    modes = data[column].mode()
    if modes.empty:
        # Every value is missing, so there is nothing to fill with.
        return
    # Assign back rather than using a chained inplace fillna, which does not
    # reach `data` under pandas Copy-on-Write.
    data[column] = data[column].fillna(modes.iloc[0])

def categoricalAsMode(data):
    """Fill missing values of every non-numeric column with that column's mode.

    When several values tie for most frequent, the first entry returned by
    ``Series.mode()`` is used. Columns with no non-missing values have no
    mode and are skipped. ``data`` is modified in place; nothing is returned.

    Args:
        data: DataFrame to modify.
    """
    for column in data.select_dtypes(exclude=np.number).columns:
        modes = data[column].mode()
        if modes.empty:
            # Entirely missing column: skip it instead of aborting the loop.
            continue
        # Assign back rather than using a chained inplace fillna, which does
        # not reach `data` under pandas Copy-on-Write.
        data[column] = data[column].fillna(modes.iloc[0])

In [120]:
# Missing-value counts for the numeric columns of the joined train+test data.
showNull(numerical)

LotFrontage     486
MasVnrArea     23
BsmtFinSF1     1
BsmtFinSF2     1
BsmtUnfSF     1
TotalBsmtSF     1
BsmtFullBath     2
BsmtHalfBath     2
GarageYrBlt     159
GarageCars     1
GarageArea     1
SalePrice     1459

 (Columns:  38)


In [121]:
# Missing-value counts for the non-numeric columns of the joined train+test data.
showNull(categorical)

MSZoning     4
Alley     2721
Utilities     2
Exterior1st     1
Exterior2nd     1
MasVnrType     24
BsmtQual     81
BsmtCond     82
BsmtExposure     82
BsmtFinType1     79
BsmtFinType2     80
Electrical     1
KitchenQual     1
Functional     2
FireplaceQu     1420
GarageType     157
GarageFinish     159
GarageQual     159
GarageCond     159
PoolQC     2909
Fence     2348
MiscFeature     2814
SaleType     1

 (Columns:  43)


In [122]:
# Value counts and mode of every non-numeric column in the training frame.
showMode(train)

RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64

0    RL
dtype: object

['RL']


Pave    1454
Grvl       6
Name: Street, dtype: int64

0    Pave
dtype: object

['Pave']


Grvl    50
Pave    41
Name: Alley, dtype: int64

0    Grvl
dtype: object

['Grvl']


Reg    925
IR1    484
IR2     41
IR3     10
Name: LotShape, dtype: int64

0    Reg
dtype: object

['Reg']


Lvl    1311
Bnk      63
HLS      50
Low      36
Name: LandContour, dtype: int64

0    Lvl
dtype: object

['Lvl']


AllPub    1459
NoSeWa       1
Name: Utilities, dtype: int64

0    AllPub
dtype: object

['AllPub']


Inside     1052
Corner      263
CulDSac      94
FR2          47
FR3           4
Name: LotConfig, dtype: int64

0    Inside
dtype: object

['Inside']


Gtl    1382
Mod      65
Sev      13
Name: LandSlope, dtype: int64

0    Gtl
dtype: object

['Gtl']


NAmes      225
CollgCr    150
OldTown    113
Edwards    100
Somerst     86
Gilbert     79
NridgHt     77
Sa

In [123]:
# Fill each frame separately, using that frame's own statistics: numeric
# columns get their mean (SalePrice is left untouched), non-numeric columns
# get their mode.
for frame in dataframes:
    numericalAsMean(frame, exclude=["SalePrice"])
    categoricalAsMode(frame)

In [130]:
# Re-check both frames for columns that still contain missing values.
for frame in dataframes:
    showNull(frame)


 (Columns:  81)

 (Columns:  80)


In [129]:
# Save the current train/test frames as train2.csv and test2.csv.
writeCSV(2)