<a href="https://colab.research.google.com/github/LongNguyen1984/DeepLearning/blob/master/MarkingMissingData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Marking missing values

In [3]:
# Read the diabetes CSV and print summary statistics for each column.
from pandas import read_csv

# header=None: the first line of the file is read as data, not column names
csv_path = 'drive/My Drive/Dataset/diabetes.csv'
dataset = read_csv(csv_path, header=None)

# descriptive statistics (count, mean, std, min, quartiles, max)
summary = dataset.describe()
print(summary)

                0           1           2  ...           6           7           8
count  768.000000  768.000000  768.000000  ...  768.000000  768.000000  768.000000
mean     3.845052  120.894531   69.105469  ...    0.471876   33.240885    0.348958
std      3.369578   31.972618   19.355807  ...    0.331329   11.760232    0.476951
min      0.000000    0.000000    0.000000  ...    0.078000   21.000000    0.000000
25%      1.000000   99.000000   62.000000  ...    0.243750   24.000000    0.000000
50%      3.000000  117.000000   72.000000  ...    0.372500   29.000000    0.000000
75%      6.000000  140.250000   80.000000  ...    0.626250   41.000000    1.000000
max     17.000000  199.000000  122.000000  ...    2.420000   81.000000    1.000000

[8 rows x 9 columns]


In [5]:
# Tally the zero entries in columns 1-5; a zero there is counted as a missing value.
zero_mask = dataset[[1, 2, 3, 4, 5]] == 0
num_missing = zero_mask.sum()
# show the per-column totals
print(num_missing)

1      5
2     35
3    227
4    374
5     11
dtype: int64


In [6]:
# Mark zeros in columns 1-5 as NaN, then count the NaN entries per column.
from numpy import nan
from pandas import read_csv

# reload the raw data so this cell does not depend on earlier cells
dataset = read_csv('drive/My Drive/Dataset/diabetes.csv', header=None)

# replace every 0 in the selected columns with NaN
zero_as_missing = [1, 2, 3, 4, 5]
dataset[zero_as_missing] = dataset[zero_as_missing].replace(0, nan)

# number of NaN values in each column
nan_counts = dataset.isnull().sum()
print(nan_counts)

0      0
1      5
2     35
3    227
4    374
5     11
6      0
7      0
8      0
dtype: int64


In [7]:
# preview the first 20 rows to see where the NaN markers landed
first_rows = dataset.head(20)
print(first_rows)

     0      1     2     3      4     5      6   7  8
0    6  148.0  72.0  35.0    NaN  33.6  0.627  50  1
1    1   85.0  66.0  29.0    NaN  26.6  0.351  31  0
2    8  183.0  64.0   NaN    NaN  23.3  0.672  32  1
3    1   89.0  66.0  23.0   94.0  28.1  0.167  21  0
4    0  137.0  40.0  35.0  168.0  43.1  2.288  33  1
5    5  116.0  74.0   NaN    NaN  25.6  0.201  30  0
6    3   78.0  50.0  32.0   88.0  31.0  0.248  26  1
7   10  115.0   NaN   NaN    NaN  35.3  0.134  29  0
8    2  197.0  70.0  45.0  543.0  30.5  0.158  53  1
9    8  125.0  96.0   NaN    NaN   NaN  0.232  54  1
10   4  110.0  92.0   NaN    NaN  37.6  0.191  30  0
11  10  168.0  74.0   NaN    NaN  38.0  0.537  34  1
12  10  139.0  80.0   NaN    NaN  27.1  1.441  57  0
13   1  189.0  60.0  23.0  846.0  30.1  0.398  59  1
14   5  166.0  72.0  19.0  175.0  25.8  0.587  51  1
15   7  100.0   NaN   NaN    NaN  30.0  0.484  32  1
16   0  118.0  84.0  47.0  230.0  45.8  0.551  31  1
17   7  107.0  74.0   NaN    NaN  29.6  0.254 

# An example of the effect of missing values

In [8]:
# Cross-validate LDA on data that still contains NaN, to show the failure it causes.
from numpy import nan
from pandas import read_csv
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold, cross_val_score

# reload the raw data
dataset = read_csv('drive/My Drive/Dataset/diabetes.csv', header=None)

# zeros in columns 1-5 become NaN; the NaNs are deliberately left in place
zero_as_missing = [1, 2, 3, 4, 5]
dataset[zero_as_missing] = dataset[zero_as_missing].replace(0, nan)

# columns 0-7 are the inputs, column 8 is the output
values = dataset.values
X, y = values[:, 0:8], values[:, 8]

# model under evaluation
model = LinearDiscriminantAnalysis()

# 3-fold cross-validation, shuffled with a fixed seed
cv = KFold(n_splits=3, shuffle=True, random_state=1)

# accuracy score for each fold
result = cross_val_score(model, X, y, cv=cv, scoring='accuracy')

# mean accuracy across the folds
print('Accuarcy: %.3f' % result.mean())


Accuarcy: nan


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').



# Remove Rows With Missing Values

In [11]:
# Drop every row that contains a missing value and summarize what remains.
from numpy import nan
from pandas import read_csv

# reload the raw data
dataset = read_csv('drive/My Drive/Dataset/diabetes.csv', header=None)

# (rows, columns) of the raw data, before anything is removed
raw_shape = dataset.shape
print(raw_shape)

# zeros in columns 1-5 become NaN
zero_as_missing = [1, 2, 3, 4, 5]
dataset[zero_as_missing] = dataset[zero_as_missing].replace(0, nan)

# keep only the rows with no NaN in any column
dataset = dataset.dropna()

# summary statistics of the remaining rows (the count row shows how many are left)
remaining_summary = dataset.describe()
print(remaining_summary)

(768, 9)
                0           1           2  ...           6           7           8
count  392.000000  392.000000  392.000000  ...  392.000000  392.000000  392.000000
mean     3.301020  122.627551   70.663265  ...    0.523046   30.864796    0.331633
std      3.211424   30.860781   12.496092  ...    0.345488   10.200777    0.471401
min      0.000000   56.000000   24.000000  ...    0.085000   21.000000    0.000000
25%      1.000000   99.000000   62.000000  ...    0.269750   23.000000    0.000000
50%      2.000000  119.000000   70.000000  ...    0.449500   27.000000    0.000000
75%      5.000000  143.000000   78.000000  ...    0.687000   36.000000    1.000000
max     17.000000  198.000000  110.000000  ...    2.420000   81.000000    1.000000

[8 rows x 9 columns]


In [13]:
# Evaluate LDA after the rows containing missing values have been removed.
from numpy import nan
from pandas import read_csv
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold, cross_val_score

# reload the raw data
dataset = read_csv('drive/My Drive/Dataset/diabetes.csv', header=None)

# zeros in columns 1-5 become NaN
zero_as_missing = [1, 2, 3, 4, 5]
dataset[zero_as_missing] = dataset[zero_as_missing].replace(0, nan)

# discard every row that has at least one NaN
dataset = dataset.dropna()

# columns 0-7 are the inputs, column 8 is the output
array = dataset.values
X, y = array[:, 0:8], array[:, 8]

# model under evaluation
model = LinearDiscriminantAnalysis()

# 3-fold cross-validation, shuffled with a fixed seed
cv = KFold(n_splits=3, shuffle=True, random_state=1)

# accuracy score for each fold
result = cross_val_score(model, X, y, cv=cv, scoring='accuracy')

# mean accuracy across the folds
print('Accuarcy: %.3f' % result.mean())

Accuarcy: 0.781
