# Data Cleaning and Pre Processing

Here we take in the cleveland.data file from UCI Heart Disease database and format it to final CSV file which we can use for analysis.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Attributes Assigning

The following cell shows all attributes and selected 14 attributes

In [2]:
# Names of all attributes in UCI Heart Disease Dataset
headers = [
    "id", "ccf", "age", "sex",  "painloc", "painexer", "relrest", "pncaden", "cp", "trestbps",  "htn", "chol", "smoke", "cigs", "years", "fbs",  "dm", "famhist", "restecg",
    "ekgmo", "ekgday", "ekgyr", "dig",  "prop", "nitr", "pro", "diuretic", "proto", "thaldur",  "thaltime", "met", "thalach", "thalrest", "tpeakbps", "tpeakbpd",  "dummy", "trestbpd", "exang",
    "xhypo", "oldpeak", "slope", "rldv5",  "rldv5e", "ca", "restckm", "exerckm", "restef", "restwm",  "exeref", "exerwm", "thal", "thalsev", "thalpul", "earlobe",  "cmo", "cday", "cyr",
    "num", "lmt", "ladprox", "laddist",  "diag", "cxmain", "ramus", "om1", "om2", "rcaprox",  "rcadist", "lvx1", "lvx2", "lvx3", "lvx4", "lvf",  "cathef", "junk", "name"
]

# Names of all attributes in short version of UCI dataset
selected_columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"
]

## Dataset Format

The datafile contains contiguous elements seperated by a newline or space. Each row of the required dataset is observed to be 10 lines in the datafile in the following format:
1. The first line contains 7 elements
2. Lines 2-9 contains 8 elements
3. Line 10 contains 5 elements.
4. The last element belongs to attribute Name and contains string 'name' always

This information is used for data cleaning

## Finding Errors in Data / Missing Data

In [3]:
# Opens cleveland.data, iterates lines, takes in data and stores as a list of rows
datafile = open('raw_data/cleveland.data', 'r', errors='ignore')

wrong_data_lines = list() # Stores datalines with incorrect size of elements
wrong_name_lines = list() # Stores datalines with wrong name attribute

i = 1

for line in datafile:
    linecontent = line.split()
    if i % 10 == 1:
        if len(linecontent) != 7:
            wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
    elif i%10 in range(2, 10):
        if len(linecontent) != 8:
            wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
    elif i%10 == 0:
        if len(linecontent) != 5:
            wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
        if linecontent[-1] != 'name':
            wrong_name_lines.append({'line_no': i, 'line_size': len(linecontent)})
    i += 1

print(i)

datafile.close()

2983


In [4]:
print("Lines with wrong number of elements are: ")
wrong_name_lines

Lines with wrong number of elements are: 


[{'line_no': 2830, 'line_size': 7},
 {'line_no': 2840, 'line_size': 16},
 {'line_no': 2850, 'line_size': 3},
 {'line_no': 2860, 'line_size': 11},
 {'line_no': 2870, 'line_size': 8},
 {'line_no': 2880, 'line_size': 16},
 {'line_no': 2890, 'line_size': 5},
 {'line_no': 2900, 'line_size': 4},
 {'line_no': 2910, 'line_size': 3},
 {'line_no': 2930, 'line_size': 4},
 {'line_no': 2940, 'line_size': 1},
 {'line_no': 2950, 'line_size': 8},
 {'line_no': 2960, 'line_size': 4},
 {'line_no': 2970, 'line_size': 2},
 {'line_no': 2980, 'line_size': 7}]

In [5]:
print("Lines with wrong name attribute: ")
wrong_data_lines

Lines with wrong name attribute: 


[{'line_no': 2822, 'line_size': 11},
 {'line_no': 2823, 'line_size': 4},
 {'line_no': 2824, 'line_size': 2},
 {'line_no': 2825, 'line_size': 3},
 {'line_no': 2826, 'line_size': 3},
 {'line_no': 2827, 'line_size': 15},
 {'line_no': 2828, 'line_size': 1},
 {'line_no': 2829, 'line_size': 3},
 {'line_no': 2830, 'line_size': 7},
 {'line_no': 2831, 'line_size': 11},
 {'line_no': 2832, 'line_size': 3},
 {'line_no': 2833, 'line_size': 5},
 {'line_no': 2834, 'line_size': 6},
 {'line_no': 2835, 'line_size': 5},
 {'line_no': 2836, 'line_size': 5},
 {'line_no': 2837, 'line_size': 10},
 {'line_no': 2838, 'line_size': 9},
 {'line_no': 2839, 'line_size': 7},
 {'line_no': 2840, 'line_size': 16},
 {'line_no': 2841, 'line_size': 3},
 {'line_no': 2842, 'line_size': 5},
 {'line_no': 2843, 'line_size': 13},
 {'line_no': 2844, 'line_size': 4},
 {'line_no': 2845, 'line_size': 2},
 {'line_no': 2846, 'line_size': 9},
 {'line_no': 2847, 'line_size': 10},
 {'line_no': 2848, 'line_size': 12},
 {'line_no': 2849, '

## Inference

Here we observe that rows after 2820 (282 data points) have lots of errors. This can also be seen by manually checking the data file as well. So we only consider until line 2820 in the data file

## Creating the Dataset

In [6]:
datafile = open('raw_data/cleveland.data', 'r', errors='ignore')

dataset = list()
datarow = list()

i = 0

for line in datafile:
    i += 1
    if i > 2820:
        break
    linecontent = line.split()
    datarow.extend(linecontent)
    if i % 10 == 0:
        row_clone = datarow.copy()
        if len(row_clone)== 76:
            dataset.append(row_clone)
        else:
            print(row_clone)
        datarow.clear()
        
datafile.close()

In [7]:
print("No of datapoints: ", len(dataset))

No of datapoints:  282


In [8]:
df = pd.DataFrame(dataset, columns=headers)
# The first version of the dataset is a direct raw conversion of the .data file to .csv file
df.to_csv('datasets/cleveland_raw.csv', index=False) 
datafile.close()

In [9]:
del df

In [10]:
data = pd.read_csv('datasets/cleveland_raw.csv')

In [11]:
data.head()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name
0,1,0,63,1,-9,-9,-9,-9,1,145,...,1,1,1,1,1,1,1,-9,-9,name
1,2,0,67,1,-9,-9,-9,-9,4,160,...,1,1,1,1,1,1,1,-9,-9,name
2,3,0,67,1,-9,-9,-9,-9,4,120,...,2,2,1,1,1,7,3,-9,-9,name
3,4,0,37,1,-9,-9,-9,-9,3,130,...,1,1,1,1,1,1,1,-9,-9,name
4,6,0,41,0,-9,-9,-9,-9,2,130,...,1,1,1,1,1,1,1,-9,-9,name


In [12]:
data.describe()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
count,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,...,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0
mean,151.51773,0.0,54.411348,0.677305,-9.0,-9.0,-9.0,-9.0,3.163121,131.56383,...,-9.0,1.173759,1.12766,1.0,1.0,1.134752,1.411348,1.14539,-9.0,-9.0
std,87.131234,0.0,9.053083,0.468338,0.0,0.0,0.0,0.0,0.955405,17.757496,...,0.0,0.379576,0.334304,0.0,0.0,0.766002,1.439508,0.44257,0.0,0.0
min,1.0,0.0,29.0,0.0,-9.0,-9.0,-9.0,-9.0,1.0,94.0,...,-9.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-9.0,-9.0
25%,75.25,0.0,48.0,0.0,-9.0,-9.0,-9.0,-9.0,3.0,120.0,...,-9.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-9.0,-9.0
50%,151.5,0.0,55.0,1.0,-9.0,-9.0,-9.0,-9.0,3.0,130.0,...,-9.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-9.0,-9.0
75%,227.75,0.0,61.0,1.0,-9.0,-9.0,-9.0,-9.0,4.0,140.0,...,-9.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-9.0,-9.0
max,298.0,0.0,77.0,1.0,-9.0,-9.0,-9.0,-9.0,4.0,200.0,...,-9.0,2.0,2.0,1.0,1.0,8.0,8.0,4.0,-9.0,-9.0


In [13]:
data = data.replace(-9, np.nan)
data.describe()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
count,282.0,282.0,282.0,282.0,0.0,0.0,0.0,0.0,282.0,282.0,...,0.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,0.0,0.0
mean,151.51773,0.0,54.411348,0.677305,,,,,3.163121,131.56383,...,,1.173759,1.12766,1.0,1.0,1.134752,1.411348,1.14539,,
std,87.131234,0.0,9.053083,0.468338,,,,,0.955405,17.757496,...,,0.379576,0.334304,0.0,0.0,0.766002,1.439508,0.44257,,
min,1.0,0.0,29.0,0.0,,,,,1.0,94.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,
25%,75.25,0.0,48.0,0.0,,,,,3.0,120.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,
50%,151.5,0.0,55.0,1.0,,,,,3.0,130.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,
75%,227.75,0.0,61.0,1.0,,,,,4.0,140.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,
max,298.0,0.0,77.0,1.0,,,,,4.0,200.0,...,,2.0,2.0,1.0,1.0,8.0,8.0,4.0,,


In [14]:
# After replacing -9 with nan, a cleveland_nan.csv is created
data.to_csv('datasets/cleveland_nan.csv', index=False)

# Removing ID and Name

Now, we have the complete dataset as a dataframe. 

The 'name' field and 'id' field provide no practical use and should be dropped

In [15]:
data_1 = data.drop(['id', 'name'], axis = 1)

In [16]:
data_1.describe()

Unnamed: 0,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,htn,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
count,282.0,282.0,282.0,0.0,0.0,0.0,0.0,282.0,282.0,282.0,...,0.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,0.0,0.0
mean,0.0,54.411348,0.677305,,,,,3.163121,131.56383,0.617021,...,,1.173759,1.12766,1.0,1.0,1.134752,1.411348,1.14539,,
std,0.0,9.053083,0.468338,,,,,0.955405,17.757496,0.486977,...,,0.379576,0.334304,0.0,0.0,0.766002,1.439508,0.44257,,
min,0.0,29.0,0.0,,,,,1.0,94.0,0.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,
25%,0.0,48.0,0.0,,,,,3.0,120.0,0.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,
50%,0.0,55.0,1.0,,,,,3.0,130.0,1.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,
75%,0.0,61.0,1.0,,,,,4.0,140.0,1.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,
max,0.0,77.0,1.0,,,,,4.0,200.0,1.0,...,,2.0,2.0,1.0,1.0,8.0,8.0,4.0,,


In [17]:
data_1.head()

Unnamed: 0,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,htn,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
0,0,63,1,,,,,1,145,1,...,,1,1,1,1,1,1,1,,
1,0,67,1,,,,,4,160,1,...,,1,1,1,1,1,1,1,,
2,0,67,1,,,,,4,120,1,...,,2,2,1,1,1,7,3,,
3,0,37,1,,,,,3,130,0,...,,1,1,1,1,1,1,1,,
4,0,41,0,,,,,2,130,1,...,,1,1,1,1,1,1,1,,


In [18]:
# cleveland_no_id.csv contains all attributes except name and id
data_1.to_csv('datasets/cleveland_no_id.csv', index=False)

The dataset containing no id or name is saved as 'cleveland_no_id.csv'

# Removing attributes with constant value

We can drop attributes which has the same value in all rows. So we find the attributes which have the same value throughout. We also drop columns with all NaN values

In [19]:
drop_cols = list()

for col in data_1.columns.values:
    if data_1[col].std() == 0:
        drop_cols.append(col)
        
print("Columns to be dropped: ", drop_cols)
print("No of rows to be dropped: ", len(drop_cols))

Columns to be dropped:  ['ccf', 'dm', 'proto', 'lvx1', 'lvx2']
No of rows to be dropped:  5


In [20]:
for col in data_1.columns.values:
    if np.isnan(data_1[col].std()):
        drop_cols.append(col)
        
print(drop_cols)
print("No of rows to be dropped: ", len(drop_cols))

['ccf', 'dm', 'proto', 'lvx1', 'lvx2', 'painloc', 'painexer', 'relrest', 'pncaden', 'smoke', 'rldv5', 'restckm', 'exerckm', 'restef', 'restwm', 'exeref', 'exerwm', 'thalsev', 'thalpul', 'earlobe', 'diag', 'ramus', 'om2', 'cathef', 'junk']
No of rows to be dropped:  25


In [21]:
data_prefinal = data_1.drop(drop_cols, axis=1)

In [22]:
data_prefinal.head()

Unnamed: 0,age,sex,cp,trestbps,htn,chol,cigs,years,fbs,famhist,...,lmt,ladprox,laddist,cxmain,om1,rcaprox,rcadist,lvx3,lvx4,lvf
0,63,1,1,145,1,233,50.0,20.0,1,1,...,1,1,1,1,1,1,1,1,1,1
1,67,1,4,160,1,286,40.0,40.0,0,1,...,1,2,2,2,1,1,1,1,1,1
2,67,1,4,120,1,229,20.0,35.0,0,1,...,1,1,1,1,1,2,2,1,7,3
3,37,1,3,130,0,250,0.0,0.0,0,1,...,1,1,1,1,1,1,1,1,1,1
4,41,0,2,130,1,204,0.0,0.0,0,1,...,1,1,1,1,1,1,1,1,1,1


In [23]:
data_prefinal.describe()

Unnamed: 0,age,sex,cp,trestbps,htn,chol,cigs,years,fbs,famhist,...,lmt,ladprox,laddist,cxmain,om1,rcaprox,rcadist,lvx3,lvx4,lvf
count,282.0,282.0,282.0,282.0,282.0,282.0,277.0,277.0,282.0,282.0,...,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0
mean,54.411348,0.677305,3.163121,131.56383,0.617021,249.092199,16.920578,15.259928,0.148936,0.620567,...,1.042553,1.141844,1.205674,1.156028,1.163121,1.173759,1.12766,1.134752,1.411348,1.14539
std,9.053083,0.468338,0.955405,17.757496,0.486977,51.217546,19.451934,15.367867,0.356658,0.486108,...,0.202206,0.34951,0.404912,0.363527,0.370132,0.379576,0.334304,0.766002,1.439508,0.44257
min,29.0,0.0,1.0,94.0,0.0,126.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,48.0,0.0,3.0,120.0,0.0,213.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,55.0,1.0,3.0,130.0,1.0,244.0,10.0,15.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,61.0,1.0,4.0,140.0,1.0,277.0,30.0,30.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,77.0,1.0,4.0,200.0,1.0,564.0,99.0,54.0,1.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,8.0,8.0,4.0


# Creating a dataset of only selected 14 attributes

Usually only 14 attributes are used for data analysis of this particular dataset. For general testing, this is also taken and considered.

In [24]:
data_selected = data_prefinal[selected_columns]

In [25]:
data_selected.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0


In [26]:
data_selected.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,280.0,280.0,282.0
mean,54.411348,0.677305,3.163121,131.56383,249.092199,0.148936,1.014184,149.765957,0.326241,1.02695,1.585106,0.664286,4.678571,0.907801
std,9.053083,0.468338,0.955405,17.757496,51.217546,0.356658,0.998118,22.923869,0.46967,1.138825,0.6097,0.936023,1.939101,1.224894
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,0.0
25%,48.0,0.0,3.0,120.0,213.0,0.0,0.0,133.25,0.0,0.0,1.0,0.0,3.0,0.0
50%,55.0,1.0,3.0,130.0,244.0,0.0,2.0,153.5,0.0,0.8,2.0,0.0,3.0,0.0
75%,61.0,1.0,4.0,140.0,277.0,0.0,2.0,165.75,1.0,1.6,2.0,1.0,7.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,4.0


In [27]:
cols_with_nan = data_selected.columns.values[data_selected.isna().any()]
display(cols_with_nan)

array(['ca', 'thal'], dtype=object)

In [28]:
data_selected[cols_with_nan].describe()

Unnamed: 0,ca,thal
count,280.0,280.0
mean,0.664286,4.678571
std,0.936023,1.939101
min,0.0,3.0
25%,0.0,3.0
50%,0.0,3.0
75%,1.0,7.0
max,3.0,7.0


In [29]:
nan_count = dict()

for col in cols_with_nan:
    nan_count[col] = data_selected[col].isna().sum()
    print('Column Name: ', col)
    print('NaN Count: ', nan_count[col])
    print(data_selected[col].value_counts())
    print("\n")

Column Name:  ca
NaN Count:  2
0.0    166
1.0     61
2.0     34
3.0     19
Name: ca, dtype: int64


Column Name:  thal
NaN Count:  2
3.0    159
7.0    107
6.0     14
Name: thal, dtype: int64




In [30]:
print("No of rows with nan : ", sum([True for index,row in data_selected.iterrows() if any(row.isnull())]))

No of rows with nan :  4


## Filling NaN with Other Values

Here we fill NaN with other values in the column using probability of finding the value in the column as the criteria

In [31]:
data_selected_mean = data_selected.copy()
data_selected_ratio = data_selected.copy()
data_selected_mode = data_selected.copy()
data_selected_median = data_selected.copy()

## Imputing with proportion of each value in dataset

### ca attribute

In [32]:
ca_duplicate = data_selected_ratio['ca'].copy()

for index, val in ca_duplicate.iteritems():
    if np.isnan(val):
        print(index, val)

166 nan
192 nan


In [33]:
ca_probabilities = []

for index, val in data_selected_ratio['ca'].value_counts().iteritems():
    ca_probabilities.append(round(val/data_selected_ratio['ca'].value_counts().sum(), 2))
    
ca_values = data_selected_ratio['ca'].value_counts().index.values.tolist()

ca_duplicate = ca_duplicate.fillna(pd.Series(np.random.choice(ca_values, 
                                                      p=ca_probabilities, size=len(ca_duplicate))))

In [34]:
for index, val in data_selected_ratio['ca'].iteritems():
    if ca_duplicate[index] != val:
        print("Index: ", index, ", ca duplicate value: ", ca_duplicate[index], ", original value: ", val)

Index:  166 , ca duplicate value:  1.0 , original value:  nan
Index:  192 , ca duplicate value:  0.0 , original value:  nan


In [35]:
data_selected_ratio['ca'] = ca_duplicate.copy()

### thal attribute

In [36]:
thal_duplicate = data_selected_ratio['thal'].copy()

for index, val in thal_duplicate.iteritems():
    if np.isnan(val):
        print(index, val)

87 nan
266 nan


In [37]:
thal_probabilities = []

for index, val in data_selected_ratio['thal'].value_counts().iteritems():
    thal_probabilities.append(round(val/data_selected_ratio['thal'].value_counts().sum(), 2))
    
thal_values = data_selected_ratio['thal'].value_counts().index.values.tolist()

thal_duplicate = thal_duplicate.fillna(pd.Series(np.random.choice(thal_values, 
                                                              p=thal_probabilities, size=len(thal_duplicate))))

In [38]:
for index, val in data_selected_ratio['thal'].iteritems():
    if thal_duplicate[index] != val:
        print("Index: ", index, ", thal duplicate value: ", thal_duplicate[index], ", original value: ", val)

Index:  87 , thal duplicate value:  7.0 , original value:  nan
Index:  266 , thal duplicate value:  6.0 , original value:  nan


In [39]:
data_selected_ratio['thal'] = thal_duplicate.copy()

In [40]:
data_selected_ratio.to_csv('datasets/cleveland_short.csv', index=False)

## Replacing with Mean

In [41]:
for col in cols_with_nan:
    col_mean_value = data_selected_mean[col].mean()
    print("Mean of", col, "attribute:", col_mean_value)
    data_selected_mean[col].fillna(value = col_mean_value, inplace = True)
    comparison_array = np.where(data_selected_mean[col] == data_selected[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mean of ca attribute: 0.6642857142857143
No of changes in col: ca :  2
Mean of thal attribute: 4.678571428571429
No of changes in col: thal :  2


In [42]:
# data_selected_mean.to_csv('datasets/cleveland_short.csv', index=False)

## Replacing with Mode

In [43]:
for col in cols_with_nan:
    col_mode_value = data_selected_mode[col].mode()
    print("Mode of", col, "attribute:", col_mode_value)
    data_selected_mode[col].fillna(value = col_mode_value, inplace = True)
    comparison_array = np.where(data_selected_mode[col] == data_selected[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mode of ca attribute: 0    0.0
dtype: float64
No of changes in col: ca :  2
Mode of thal attribute: 0    3.0
dtype: float64
No of changes in col: thal :  2


In [44]:
# data_selected_mode.to_csv('datasets/cleveland_short.csv', index=False)

## Replacing with Median

In [45]:
for col in cols_with_nan:
    col_median_value = data_selected_median[col].median()
    print("Median of", col, "attribute:", col_median_value)
    data_selected_median[col].fillna(value = col_median_value, inplace = True)
    comparison_array = np.where(data_selected_median[col] == data_selected[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Median of ca attribute: 0.0
No of changes in col: ca :  2
Median of thal attribute: 3.0
No of changes in col: thal :  2


In [46]:
# data_selected_median.to_csv('datasets/cleveland_short.csv', index=False)

## Processing High Dimensional Data with NaN values 

Considering for NaN values and figuring out which columns to remove and which to keep

## Examining NaN values in columns to find which ones to keep and which ones to drop

In [47]:
cols_with_nan_large = data_prefinal.columns.values[data_prefinal.isna().any()]
display(cols_with_nan_large)

array(['cigs', 'years', 'dig', 'prop', 'nitr', 'pro', 'diuretic',
       'thaltime', 'ca', 'thal'], dtype=object)

In [48]:
data_prefinal[cols_with_nan_large].describe()

Unnamed: 0,cigs,years,dig,prop,nitr,pro,diuretic,thaltime,ca,thal
count,277.0,277.0,280.0,280.0,280.0,280.0,280.0,213.0,280.0,280.0
mean,16.920578,15.259928,0.032143,0.335714,0.246429,0.1,0.114286,4.882629,0.664286,4.678571
std,19.451934,15.367867,0.176695,0.473085,0.431703,0.300537,0.318728,3.465331,0.936023,1.939101
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0
50%,10.0,15.0,0.0,0.0,0.0,0.0,0.0,5.5,0.0,3.0
75%,30.0,30.0,0.0,1.0,0.0,0.0,0.0,7.0,1.0,7.0
max,99.0,54.0,1.0,1.0,1.0,1.0,1.0,15.0,3.0,7.0


In [49]:
nan_count_large = dict()

for col in cols_with_nan_large:
    nan_count_large[col] = data_prefinal[col].isna().sum()
    print('Column Name: ', col)
    print('NaN Count: ', nan_count_large[col])
    print(data_prefinal[col].value_counts())
    print("\n")

Column Name:  cigs
NaN Count:  5
0.0     115
20.0     51
40.0     30
30.0     20
10.0     12
50.0      9
60.0      9
2.0       6
25.0      6
15.0      4
3.0       2
80.0      2
35.0      2
9.0       1
1.0       1
70.0      1
99.0      1
5.0       1
8.0       1
4.0       1
75.0      1
28.0      1
Name: cigs, dtype: int64


Column Name:  years
NaN Count:  5
0.0     115
20.0     32
30.0     27
25.0     15
40.0     15
35.0     12
15.0      9
50.0      4
6.0       3
18.0      3
10.0      3
4.0       3
24.0      3
1.0       3
5.0       2
34.0      2
29.0      2
23.0      2
32.0      2
22.0      2
38.0      2
31.0      1
41.0      1
14.0      1
28.0      1
17.0      1
48.0      1
33.0      1
37.0      1
36.0      1
7.0       1
47.0      1
45.0      1
54.0      1
19.0      1
8.0       1
27.0      1
Name: years, dtype: int64


Column Name:  dig
NaN Count:  2
0.0    271
1.0      9
Name: dig, dtype: int64


Column Name:  prop
NaN Count:  2
0.0    186
1.0     94
Name: prop, dtype: int64


Column N

### thaltime needs to be dropped

All other attributes have a very small percentage of nan values. thaltime has about 25% nan values and a very uneven distribution of data too. We cannot change that easily and add duplicate values like mean, mode or even random values

In [50]:
data_final = data_prefinal.drop(['thaltime'], axis = 1)
data_final.describe()

Unnamed: 0,age,sex,cp,trestbps,htn,chol,cigs,years,fbs,famhist,...,lmt,ladprox,laddist,cxmain,om1,rcaprox,rcadist,lvx3,lvx4,lvf
count,282.0,282.0,282.0,282.0,282.0,282.0,277.0,277.0,282.0,282.0,...,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0
mean,54.411348,0.677305,3.163121,131.56383,0.617021,249.092199,16.920578,15.259928,0.148936,0.620567,...,1.042553,1.141844,1.205674,1.156028,1.163121,1.173759,1.12766,1.134752,1.411348,1.14539
std,9.053083,0.468338,0.955405,17.757496,0.486977,51.217546,19.451934,15.367867,0.356658,0.486108,...,0.202206,0.34951,0.404912,0.363527,0.370132,0.379576,0.334304,0.766002,1.439508,0.44257
min,29.0,0.0,1.0,94.0,0.0,126.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,48.0,0.0,3.0,120.0,0.0,213.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,55.0,1.0,3.0,130.0,1.0,244.0,10.0,15.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,61.0,1.0,4.0,140.0,1.0,277.0,30.0,30.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,77.0,1.0,4.0,200.0,1.0,564.0,99.0,54.0,1.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,8.0,8.0,4.0


In [51]:
data_final_ratio = data_final.copy()
data_final_mean = data_final.copy()
data_final_mode = data_final.copy()
data_final_median = data_final.copy()

In [52]:
print("No of rows with nan : ", sum([True for index,row in data_final.iterrows() if any(row.isnull())]))

No of rows with nan :  12


## Imputing with proportion of each value in dataset

In [53]:
cols_with_nan_large = data_final_ratio.columns.values[data_final_ratio.isna().any()]

for col in cols_with_nan_large:
    print("Col Name: ", col, "")
    
    col_duplicate = data_final_ratio[col].copy()
    
    for index, val in col_duplicate.iteritems():
        if np.isnan(val):
            print(index, val)
        
    col_probabilities = []
        
    for index, val in col_duplicate.value_counts().iteritems():
        col_probabilities.append(round(val/col_duplicate.value_counts().sum(), 2))
    
    for i in range(len(col_probabilities)):
        if sum(col_probabilities) >= 1:
            break
        col_probabilities[i] += 0.01
    
    col_values = col_duplicate.value_counts().index.values.tolist()
        
    col_duplicate = col_duplicate.fillna(pd.Series(np.random.choice(col_values, 
                                                                      p=col_probabilities, size=len(col_duplicate))))
        
    for index, val in data_final_ratio[col].iteritems():
        if col_duplicate[index] != val:
            print("Index: ", index, ", ", col, " duplicate value: ", col_duplicate[index], ", original value: ", val)
            
    data_final_ratio[col] = col_duplicate.copy()
    
    print(col, "done!!!\n")

Col Name:  cigs 
21 nan
23 nan
182 nan
195 nan
250 nan
Index:  21 ,  cigs  duplicate value:  0.0 , original value:  nan
Index:  23 ,  cigs  duplicate value:  30.0 , original value:  nan
Index:  182 ,  cigs  duplicate value:  40.0 , original value:  nan
Index:  195 ,  cigs  duplicate value:  20.0 , original value:  nan
Index:  250 ,  cigs  duplicate value:  0.0 , original value:  nan
cigs done!!!

Col Name:  years 
21 nan
23 nan
159 nan
195 nan
250 nan
Index:  21 ,  years  duplicate value:  0.0 , original value:  nan
Index:  23 ,  years  duplicate value:  40.0 , original value:  nan
Index:  159 ,  years  duplicate value:  20.0 , original value:  nan
Index:  195 ,  years  duplicate value:  0.0 , original value:  nan
Index:  250 ,  years  duplicate value:  0.0 , original value:  nan
years done!!!

Col Name:  dig 
121 nan
207 nan
Index:  121 ,  dig  duplicate value:  0.0 , original value:  nan
Index:  207 ,  dig  duplicate value:  0.0 , original value:  nan
dig done!!!

Col Name:  prop 
12

In [54]:
data_final_ratio.to_csv('datasets/cleveland_large.csv', index=False)

## Replacing with Mean

In [55]:
for col in cols_with_nan_large:
    col_mean_value = data_final_mean[col].mean()
    print("Mean of", col, "attribute:", col_mean_value)
    data_final_mean[col].fillna(value = col_mean_value, inplace = True)
    comparison_array = np.where(data_final_mean[col] == data_final[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mean of cigs attribute: 16.92057761732852
No of changes in col: cigs :  5
Mean of years attribute: 15.259927797833935
No of changes in col: years :  5
Mean of dig attribute: 0.03214285714285714
No of changes in col: dig :  2
Mean of prop attribute: 0.3357142857142857
No of changes in col: prop :  2
Mean of nitr attribute: 0.24642857142857144
No of changes in col: nitr :  2
Mean of pro attribute: 0.1
No of changes in col: pro :  2
Mean of diuretic attribute: 0.11428571428571428
No of changes in col: diuretic :  2
Mean of ca attribute: 0.6642857142857143
No of changes in col: ca :  2
Mean of thal attribute: 4.678571428571429
No of changes in col: thal :  2


In [56]:
# data_final_mean.to_csv('datasets/cleveland_large.csv', index=False)

## Replacing with Mode

In [57]:
for col in cols_with_nan_large:
    col_mode_value = data_final_mode[col].mode()
    print("Mode of", col, "attribute:", col_mode_value)
    data_final_mode[col].fillna(value = col_mode_value, inplace = True)
    comparison_array = np.where(data_final_mode[col] == data_final[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mode of cigs attribute: 0    0.0
dtype: float64
No of changes in col: cigs :  5
Mode of years attribute: 0    0.0
dtype: float64
No of changes in col: years :  5
Mode of dig attribute: 0    0.0
dtype: float64
No of changes in col: dig :  2
Mode of prop attribute: 0    0.0
dtype: float64
No of changes in col: prop :  2
Mode of nitr attribute: 0    0.0
dtype: float64
No of changes in col: nitr :  2
Mode of pro attribute: 0    0.0
dtype: float64
No of changes in col: pro :  2
Mode of diuretic attribute: 0    0.0
dtype: float64
No of changes in col: diuretic :  2
Mode of ca attribute: 0    0.0
dtype: float64
No of changes in col: ca :  2
Mode of thal attribute: 0    3.0
dtype: float64
No of changes in col: thal :  2


In [58]:
# data_final_mode.to_csv('datasets/cleveland_large.csv', index=False)

## Replacing with Median

In [59]:
for col in cols_with_nan_large:
    col_median_value = data_final_median[col].median()
    print("Mode of", col, "attribute:", col_median_value)
    data_final_median[col].fillna(value = col_median_value, inplace = True)
    comparison_array = np.where(data_final_median[col] == data_final[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mode of cigs attribute: 10.0
No of changes in col: cigs :  5
Mode of years attribute: 15.0
No of changes in col: years :  5
Mode of dig attribute: 0.0
No of changes in col: dig :  2
Mode of prop attribute: 0.0
No of changes in col: prop :  2
Mode of nitr attribute: 0.0
No of changes in col: nitr :  2
Mode of pro attribute: 0.0
No of changes in col: pro :  2
Mode of diuretic attribute: 0.0
No of changes in col: diuretic :  2
Mode of ca attribute: 0.0
No of changes in col: ca :  2
Mode of thal attribute: 3.0
No of changes in col: thal :  2


In [60]:
# data_final_median.to_csv('datasets/cleveland_large.csv', index=False)