# Data Cleaning and Processing

Here we take in all the .data files in Heart Disease UCI database and format them to final CSV file which we can use for analysis.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Attributes Assigning

The following cell shows all attributes and selected 14 attributes

In [2]:
# Names of all attributes in UCI Heart Disease Dataset
headers = [
    "id", "ccf", "age", "sex",  "painloc", "painexer", "relrest", "pncaden", "cp", "trestbps",  "htn", "chol", "smoke", "cigs", "years", "fbs",  "dm", "famhist", "restecg",
    "ekgmo", "ekgday", "ekgyr", "dig",  "prop", "nitr", "pro", "diuretic", "proto", "thaldur",  "thaltime", "met", "thalach", "thalrest", "tpeakbps", "tpeakbpd",  "dummy", "trestbpd", "exang",
    "xhypo", "oldpeak", "slope", "rldv5",  "rldv5e", "ca", "restckm", "exerckm", "restef", "restwm",  "exeref", "exerwm", "thal", "thalsev", "thalpul", "earlobe",  "cmo", "cday", "cyr",
    "num", "lmt", "ladprox", "laddist",  "diag", "cxmain", "ramus", "om1", "om2", "rcaprox",  "rcadist", "lvx1", "lvx2", "lvx3", "lvx4", "lvf",  "cathef", "junk", "name"
]

# Names of all attributes in short version of UCI dataset
selected_columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"
]

## Dataset Format

The datafile contains contiguous elements seperated by a newline or space. Each row of the required dataset is observed to be 10 lines in the datafile in the following format:
1. The first line contains 7 elements
2. Lines 2-9 contains 8 elements
3. Line 10 contains 5 elements.
4. The last element belongs to attribute Name and contains string 'name' always

This information is used for data cleaning

## Finding Errors in Data / Missing Data

### Cleveland Dataset

In [3]:
# Opens cleveland.data, iterates lines, takes in data and stores as a list of rows

cleveland_datafile = open('raw_data/cleveland.data', 'r', errors='ignore')

cleveland_wrong_data_lines = list() # Stores datalines with incorrect size of elements
cleveland_wrong_name_lines = list() # Stores datalines with wrong name attribute

i = 1

for line in cleveland_datafile:
    linecontent = line.split()
    if i % 10 == 1:
        if len(linecontent) != 7:
            cleveland_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
    elif i%10 in range(2, 10):
        if len(linecontent) != 8:
            cleveland_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
    elif i%10 == 0:
        if len(linecontent) != 5:
            cleveland_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
        if linecontent[-1] != 'name':
            cleveland_wrong_name_lines.append({'line_no': i, 'line_size': len(linecontent)})
    i += 1
    
print(i)

cleveland_datafile.close()

2983


In [4]:
print("Lines with wrong number of elements are: ")
cleveland_wrong_data_lines

Lines with wrong number of elements are: 


[{'line_no': 2822, 'line_size': 11},
 {'line_no': 2823, 'line_size': 4},
 {'line_no': 2824, 'line_size': 2},
 {'line_no': 2825, 'line_size': 3},
 {'line_no': 2826, 'line_size': 3},
 {'line_no': 2827, 'line_size': 15},
 {'line_no': 2828, 'line_size': 1},
 {'line_no': 2829, 'line_size': 3},
 {'line_no': 2830, 'line_size': 7},
 {'line_no': 2831, 'line_size': 11},
 {'line_no': 2832, 'line_size': 3},
 {'line_no': 2833, 'line_size': 5},
 {'line_no': 2834, 'line_size': 6},
 {'line_no': 2835, 'line_size': 5},
 {'line_no': 2836, 'line_size': 5},
 {'line_no': 2837, 'line_size': 10},
 {'line_no': 2838, 'line_size': 9},
 {'line_no': 2839, 'line_size': 7},
 {'line_no': 2840, 'line_size': 16},
 {'line_no': 2841, 'line_size': 3},
 {'line_no': 2842, 'line_size': 5},
 {'line_no': 2843, 'line_size': 13},
 {'line_no': 2844, 'line_size': 4},
 {'line_no': 2845, 'line_size': 2},
 {'line_no': 2846, 'line_size': 9},
 {'line_no': 2847, 'line_size': 10},
 {'line_no': 2848, 'line_size': 12},
 {'line_no': 2849, '

In [5]:
print("Lines with wrong name attribute: ")
cleveland_wrong_name_lines

Lines with wrong name attribute: 


[{'line_no': 2830, 'line_size': 7},
 {'line_no': 2840, 'line_size': 16},
 {'line_no': 2850, 'line_size': 3},
 {'line_no': 2860, 'line_size': 11},
 {'line_no': 2870, 'line_size': 8},
 {'line_no': 2880, 'line_size': 16},
 {'line_no': 2890, 'line_size': 5},
 {'line_no': 2900, 'line_size': 4},
 {'line_no': 2910, 'line_size': 3},
 {'line_no': 2930, 'line_size': 4},
 {'line_no': 2940, 'line_size': 1},
 {'line_no': 2950, 'line_size': 8},
 {'line_no': 2960, 'line_size': 4},
 {'line_no': 2970, 'line_size': 2},
 {'line_no': 2980, 'line_size': 7}]

### Inference

Here we observe that rows after 2820 (282 data points) have lots of errors. This can also be seen by manually checking the data file as well. So we only consider until line 2820 in the data file.

### Hungarian Dataset

In [6]:
# Opens hungarian.data, iterates lines, takes in data and stores as a list of rows

hungarian_datafile = open('raw_data/hungarian.data', 'r', errors = 'ignore')

hungarian_wrong_data_lines = list() # Stores datalines with incorrect size of elements
hungarian_wrong_name_lines = list() # Stores datalines with wrong name attribute

i = 1

for line in hungarian_datafile:
    linecontent = line.split()
    if i % 10 == 1:
        if len(linecontent) != 7:
            hungarian_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
    elif i%10 in range(2, 10):
        if len(linecontent) != 8:
            hungarian_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
    elif i%10 == 0:
        if len(linecontent) != 5:
            hungarian_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
        if linecontent[-1] != 'name':
            hungarian_wrong_name_lines.append({'line_no': i, 'line_size': len(linecontent)})
    i += 1

print(i)

hungarian_datafile.close()

2942


In [7]:
print("Lines with wrong number of elements are: ")
hungarian_wrong_data_lines

Lines with wrong number of elements are: 


[{'line_no': 2941, 'line_size': 0}]

In [8]:
print("Lines with wrong name attribute: ")
hungarian_wrong_name_lines

Lines with wrong name attribute: 


[]

### Inference

No errors found. 2940 Lines of Data is present which will account to 294 rows/datapoints

### Switzerland Dataset

In [9]:
# Opens switzerland.data, iterates lines, takes in data and stores as a list of rows

switzerland_datafile = open('raw_data/switzerland.data', 'r', errors = 'ignore')

switzerland_wrong_data_lines = list() # Stores datalines with incorrect size of elements
switzerland_wrong_name_lines = list() # Stores datalines with wrong name attribute

i = 1

for line in switzerland_datafile:
    linecontent = line.split()
    if i % 10 == 1:
        if len(linecontent) != 7:
            switzerland_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
    elif i%10 in range(2, 10):
        if len(linecontent) != 8:
            switzerland_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
    elif i%10 == 0:
        if len(linecontent) != 5:
            switzerland_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
        if linecontent[-1] != 'name':
            switzerland_wrong_name_lines.append({'line_no': i, 'line_size': len(linecontent)})
    i += 1


print(i)

switzerland_datafile.close()

1231


In [10]:
print("Lines with wrong number of elements are: ")
switzerland_wrong_data_lines

Lines with wrong number of elements are: 


[]

In [11]:
print("Lines with wrong name attribute: ")
switzerland_wrong_name_lines

Lines with wrong name attribute: 


[]

### Inference

No errors found. 1230 Lines of Data is present which will account to 123 rows/datapoints

### Long Beach VA Dataset

In [12]:
# Opens long-beach-va.data, iterates lines, takes in data and stores as a list of rows

longbeach_datafile = open('raw_data/long-beach-va.data', 'r', errors = 'ignore')

longbeach_wrong_data_lines = list() # Stores datalines with incorrect size of elements
longbeach_wrong_name_lines = list() # Stores datalines with wrong name attribute

i = 1

for line in longbeach_datafile:
    linecontent = line.split()
    if i % 10 == 1:
        if len(linecontent) != 7:
            longbeach_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
    elif i%10 in range(2, 10):
        if len(linecontent) != 8:
            longbeach_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
    elif i%10 == 0:
        if len(linecontent) != 5:
            longbeach_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
        if linecontent[-1] != 'name':
            longbeach_wrong_name_lines.append({'line_no': i, 'line_size': len(linecontent)})
    i += 1


print(i)

longbeach_datafile.close()

2002


In [13]:
print("Lines with wrong number of elements are: ")
longbeach_wrong_data_lines

Lines with wrong number of elements are: 


[{'line_no': 2001, 'line_size': 0}]

In [14]:
print("Lines with wrong name attribute: ")
longbeach_wrong_name_lines

Lines with wrong name attribute: 


[]

### Inference

No errors found. 2000 Lines of Data is present which will account to 200 rows/datapoints

## Creating the Dataset

In [15]:
cleveland_datafile = open('raw_data/cleveland.data', 'r', errors = 'ignore')
hungarian_datafile = open('raw_data/hungarian.data', 'r', errors = 'ignore')
switzerland_datafile = open('raw_data/switzerland.data', 'r', errors = 'ignore')
longbeach_datafile = open('raw_data/long-beach-va.data', 'r', errors = 'ignore')

dataset = list()
datarow = list()

i = 0

for line in cleveland_datafile:
    i += 1
    if i > 2820:
        break
    linecontent = line.split()
    datarow.extend(linecontent)
    if i % 10 == 0:
        row_clone = datarow.copy()
        if len(row_clone)== 76:
            dataset.append(row_clone)
        else:
            print(row_clone)
        datarow.clear()

i = 0

for line in hungarian_datafile:
    i += 1
    if i > 2940:
        break
    linecontent = line.split()
    datarow.extend(linecontent)
    if i % 10 == 0:
        row_clone = datarow.copy()
        if len(row_clone)== 76:
            dataset.append(row_clone)
        else:
            print(row_clone)
        datarow.clear()

i = 0

for line in switzerland_datafile:
    i += 1
    if i > 1230:
        break
    linecontent = line.split()
    datarow.extend(linecontent)
    if i % 10 == 0:
        row_clone = datarow.copy()
        if len(row_clone)== 76:
            dataset.append(row_clone)
        else:
            print(row_clone)
        datarow.clear()
i = 0

for line in longbeach_datafile:
    i += 1
    if i > 2000:
        break
    linecontent = line.split()
    datarow.extend(linecontent)
    if i % 10 == 0:
        row_clone = datarow.copy()
        if len(row_clone)== 76:
            dataset.append(row_clone)
        else:
            print(row_clone)
        datarow.clear()
        
cleveland_datafile.close()
hungarian_datafile.close()
switzerland_datafile.close()
longbeach_datafile.close()

In [16]:
print("No of datapoints: ", len(dataset))

No of datapoints:  899


In [17]:
df = pd.DataFrame(dataset, columns=headers)
# The first version of the dataset is a direct raw conversion of the .data file to .csv file
df.to_csv('datasets/alldata_raw.csv', index=False)

In [18]:
del df

In [19]:
data = pd.read_csv('datasets/alldata_raw.csv')

In [20]:
data.head()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name
0,1,0,63,1,-9,-9,-9,-9,1,145,...,1,1,1,1,1,1,1,-9.0,-9.0,name
1,2,0,67,1,-9,-9,-9,-9,4,160,...,1,1,1,1,1,1,1,-9.0,-9.0,name
2,3,0,67,1,-9,-9,-9,-9,4,120,...,2,2,1,1,1,7,3,-9.0,-9.0,name
3,4,0,37,1,-9,-9,-9,-9,3,130,...,1,1,1,1,1,1,1,-9.0,-9.0,name
4,6,0,41,0,-9,-9,-9,-9,2,130,...,1,1,1,1,1,1,1,-9.0,-9.0,name


In [21]:
data.describe()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
count,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,...,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0
mean,957.235818,0.0,53.480534,0.790879,-2.191324,-2.416018,-2.404894,-9.0,3.253615,122.840934,...,-5.338154,-1.476085,-1.883204,0.808676,0.820912,0.918799,1.387097,0.997775,3.669399,-7.031702
std,1204.015482,0.0,9.435894,0.406908,4.611051,4.472187,4.523955,0.0,0.928499,39.55893,...,4.848162,4.625416,4.675999,1.467955,1.501361,1.615995,2.288076,1.439189,25.498556,5.077432
min,1.0,0.0,28.0,0.0,-9.0,-9.0,-9.0,-9.0,1.0,-9.0,...,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0
25%,116.0,0.0,47.0,1.0,-9.0,-9.0,-9.0,-9.0,3.0,120.0,...,-9.0,-9.0,-9.0,1.0,1.0,1.0,1.0,1.0,-9.0,-9.0
50%,266.0,0.0,54.0,1.0,1.0,0.0,0.0,-9.0,4.0,130.0,...,-9.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-9.0,-9.0
75%,1207.5,0.0,60.0,1.0,1.0,1.0,1.0,-9.0,4.0,140.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.7,-9.0
max,5002.0,0.0,77.0,1.0,1.0,1.0,1.0,-9.0,4.0,200.0,...,2.0,2.0,2.0,7.0,10.0,8.0,8.0,5.0,86.0,11.3


In [22]:
data = data.replace(-9, np.nan)
data.describe()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
count,899.0,899.0,899.0,899.0,617.0,617.0,613.0,0.0,899.0,840.0,...,327.0,654.0,629.0,880.0,880.0,880.0,880.0,883.0,311.0,119.0
mean,957.235818,0.0,53.480534,0.790879,0.920583,0.593193,0.672104,,3.253615,132.10119,...,1.067278,1.342508,1.171701,1.020455,1.032955,1.132955,1.611364,1.178935,27.623119,5.869748
std,1204.015482,0.0,9.435894,0.406908,0.270607,0.491637,0.46983,,0.928499,19.151127,...,0.250887,0.474912,0.377421,0.277384,0.415902,0.703837,1.722199,0.512572,31.675295,1.650914
min,1.0,0.0,28.0,0.0,0.0,0.0,0.0,,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.22,3.3
25%,116.0,0.0,47.0,1.0,1.0,0.0,0.0,,3.0,120.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.685,4.8
50%,266.0,0.0,54.0,1.0,1.0,1.0,1.0,,4.0,130.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.82,5.6
75%,1207.5,0.0,60.0,1.0,1.0,1.0,1.0,,4.0,140.0,...,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,63.0,6.9
max,5002.0,0.0,77.0,1.0,1.0,1.0,1.0,,4.0,200.0,...,2.0,2.0,2.0,7.0,10.0,8.0,8.0,5.0,86.0,11.3


In [23]:
# After replacing -9 with nan, a alldata_nan.csv is created
data.to_csv('datasets/alldata_nan.csv', index=False)

# Removing ID and Name

Now, we have the complete dataset as a dataframe. 

The 'name' field and 'id' field provide no practical use and should be dropped

In [24]:
data_1 = data.drop(['id', 'name'], axis = 1)

In [25]:
data_1.describe()

Unnamed: 0,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,htn,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
count,899.0,899.0,899.0,617.0,617.0,613.0,0.0,899.0,840.0,865.0,...,327.0,654.0,629.0,880.0,880.0,880.0,880.0,883.0,311.0,119.0
mean,0.0,53.480534,0.790879,0.920583,0.593193,0.672104,,3.253615,132.10119,0.476301,...,1.067278,1.342508,1.171701,1.020455,1.032955,1.132955,1.611364,1.178935,27.623119,5.869748
std,0.0,9.435894,0.406908,0.270607,0.491637,0.46983,,0.928499,19.151127,0.499727,...,0.250887,0.474912,0.377421,0.277384,0.415902,0.703837,1.722199,0.512572,31.675295,1.650914
min,0.0,28.0,0.0,0.0,0.0,0.0,,1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.22,3.3
25%,0.0,47.0,1.0,1.0,0.0,0.0,,3.0,120.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.685,4.8
50%,0.0,54.0,1.0,1.0,1.0,1.0,,4.0,130.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.82,5.6
75%,0.0,60.0,1.0,1.0,1.0,1.0,,4.0,140.0,1.0,...,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,63.0,6.9
max,0.0,77.0,1.0,1.0,1.0,1.0,,4.0,200.0,1.0,...,2.0,2.0,2.0,7.0,10.0,8.0,8.0,5.0,86.0,11.3


In [26]:
data_1.head()

Unnamed: 0,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,htn,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
0,0,63,1,,,,,1,145.0,1.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,
1,0,67,1,,,,,4,160.0,1.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,
2,0,67,1,,,,,4,120.0,1.0,...,,2.0,2.0,1.0,1.0,1.0,7.0,3.0,,
3,0,37,1,,,,,3,130.0,0.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,
4,0,41,0,,,,,2,130.0,1.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,


In [27]:
# alldata_no_id.csv contains all attributes except name and id
data_1.to_csv('datasets/alldata_no_id.csv', index=False)

The dataset containing no id or name is saved as 'alldata_no_id.csv'

# Removing attributes with constant value

We can drop attributes which has the same value in all rows. So we find the attributes which have the same value throughout. We also drop columns with all NaN values

In [28]:
drop_cols = list()

for col in data_1.columns.values:
    if data_1[col].std() == 0:
        drop_cols.append(col)
        
print("Columns to be dropped: ", drop_cols)
print("No of rows to be dropped: ", len(drop_cols))

Columns to be dropped:  ['ccf']
No of rows to be dropped:  1


In [29]:
for col in data_1.columns.values:
    if np.isnan(data_1[col].std()):
        drop_cols.append(col)
        
print(drop_cols)
print("No of rows to be dropped: ", len(drop_cols))

['ccf', 'pncaden', 'restckm', 'exerckm', 'earlobe']
No of rows to be dropped:  5


In [30]:
data_prefinal = data_1.drop(drop_cols, axis=1)

In [31]:
data_prefinal.head()

Unnamed: 0,age,sex,painloc,painexer,relrest,cp,trestbps,htn,chol,smoke,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
0,63,1,,,,1,145.0,1.0,233.0,,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,
1,67,1,,,,4,160.0,1.0,286.0,,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,
2,67,1,,,,4,120.0,1.0,229.0,,...,,2.0,2.0,1.0,1.0,1.0,7.0,3.0,,
3,37,1,,,,3,130.0,0.0,250.0,,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,
4,41,0,,,,2,130.0,1.0,204.0,,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,


In [32]:
data_prefinal.describe()

Unnamed: 0,age,sex,painloc,painexer,relrest,cp,trestbps,htn,chol,smoke,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
count,899.0,899.0,617.0,617.0,613.0,899.0,840.0,865.0,869.0,230.0,...,327.0,654.0,629.0,880.0,880.0,880.0,880.0,883.0,311.0,119.0
mean,53.480534,0.790879,0.920583,0.593193,0.672104,3.253615,132.10119,0.476301,198.759494,0.517391,...,1.067278,1.342508,1.171701,1.020455,1.032955,1.132955,1.611364,1.178935,27.623119,5.869748
std,9.435894,0.406908,0.270607,0.491637,0.46983,0.928499,19.151127,0.499727,111.834415,0.500787,...,0.250887,0.474912,0.377421,0.277384,0.415902,0.703837,1.722199,0.512572,31.675295,1.650914
min,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.22,3.3
25%,47.0,1.0,1.0,0.0,0.0,3.0,120.0,0.0,175.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.685,4.8
50%,54.0,1.0,1.0,1.0,1.0,4.0,130.0,0.0,224.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.82,5.6
75%,60.0,1.0,1.0,1.0,1.0,4.0,140.0,1.0,269.0,1.0,...,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,63.0,6.9
max,77.0,1.0,1.0,1.0,1.0,4.0,200.0,1.0,603.0,1.0,...,2.0,2.0,2.0,7.0,10.0,8.0,8.0,5.0,86.0,11.3


# Creating a dataset of only selected 14 attributes

Usually only 14 attributes are used for data analysis of this particular dataset. For general testing, this is also taken and considered.

In [33]:
data_selected = data_prefinal[selected_columns]

In [34]:
data_selected.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67,1,4,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67,1,4,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37,1,3,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41,0,2,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [35]:
data_selected.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,899.0,899.0,899.0,840.0,869.0,809.0,897.0,844.0,844.0,837.0,591.0,291.0,422.0,899.0
mean,53.480534,0.790879,3.253615,132.10119,198.759494,0.166873,0.603122,137.298578,0.390995,0.87049,1.766497,0.697595,5.018957,1.129032
std,9.435894,0.406908,0.928499,19.151127,111.834415,0.373093,0.803669,25.965959,0.488263,1.080548,0.621339,1.052728,1.949388,1.25972
min,28.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0,1.0,0.0
25%,47.0,1.0,3.0,120.0,175.0,0.0,0.0,120.0,0.0,0.0,1.0,0.0,3.0,0.0
50%,54.0,1.0,4.0,130.0,224.0,0.0,0.0,140.0,0.0,0.5,2.0,0.0,6.0,1.0
75%,60.0,1.0,4.0,140.0,269.0,0.0,1.0,157.0,1.0,1.5,2.0,1.0,7.0,2.0
max,77.0,1.0,4.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,3.0,9.0,7.0,4.0


In [36]:
cols_with_nan = data_selected.columns.values[data_selected.isna().any()]
display(cols_with_nan)

array(['trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang',
       'oldpeak', 'slope', 'ca', 'thal'], dtype=object)

In [37]:
data_selected[cols_with_nan].describe()

Unnamed: 0,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
count,840.0,869.0,809.0,897.0,844.0,844.0,837.0,591.0,291.0,422.0
mean,132.10119,198.759494,0.166873,0.603122,137.298578,0.390995,0.87049,1.766497,0.697595,5.018957
std,19.151127,111.834415,0.373093,0.803669,25.965959,0.488263,1.080548,0.621339,1.052728,1.949388
min,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0,1.0
25%,120.0,175.0,0.0,0.0,120.0,0.0,0.0,1.0,0.0,3.0
50%,130.0,224.0,0.0,0.0,140.0,0.0,0.5,2.0,0.0,6.0
75%,140.0,269.0,0.0,1.0,157.0,1.0,1.5,2.0,1.0,7.0
max,200.0,603.0,1.0,2.0,202.0,1.0,6.2,3.0,9.0,7.0


In [38]:
nan_count = dict()

for col in cols_with_nan:
    nan_count[col] = data_selected[col].isna().sum()
    print('Column Name: ', col)
    print('NaN Count: ', nan_count[col])
    print(data_selected[col].value_counts())
    print("\n")

Column Name:  trestbps
NaN Count:  59
120.0    128
130.0    112
140.0    100
110.0     58
150.0     56
160.0     50
125.0     28
115.0     19
135.0     18
145.0     16
128.0     16
100.0     15
112.0     14
138.0     14
170.0     13
180.0     12
124.0     11
132.0     11
122.0     11
118.0     10
142.0      9
134.0      9
105.0      9
136.0      8
155.0      8
108.0      7
126.0      7
95.0       6
152.0      6
144.0      5
158.0      4
200.0      4
102.0      3
104.0      3
154.0      3
106.0      3
178.0      3
146.0      3
190.0      2
116.0      2
156.0      2
165.0      2
172.0      2
94.0       2
123.0      1
117.0      1
114.0      1
96.0       1
0.0        1
185.0      1
80.0       1
98.0       1
129.0      1
92.0       1
113.0      1
101.0      1
174.0      1
148.0      1
192.0      1
127.0      1
Name: trestbps, dtype: int64


Column Name:  chol
NaN Count:  30
0.0      172
254.0     10
211.0      9
220.0      9
204.0      9
        ... 
365.0      1
518.0      1
468.0      1


In [39]:
print("No of rows with nan : ", sum([True for index,row in data_selected.iterrows() if any(row.isnull())]))

No of rows with nan :  619


## Filling NaN with Other Values

Here we fill NaN with other values in the column using probability of finding the value in the column as the criteria

In [40]:
data_selected_mean = data_selected.copy()
data_selected_ratio = data_selected.copy()
data_selected_mode = data_selected.copy()
data_selected_median = data_selected.copy()

## Imputing with proportion of each value in dataset

In [41]:
cols_with_nan = data_selected_ratio.columns.values[data_selected_ratio.isna().any()]

for col in cols_with_nan:
    print("Col Name: ", col, "")
    
    col_duplicate = data_selected_ratio[col].copy()
    
    for index, val in col_duplicate.iteritems():
        if np.isnan(val):
            print(index, val)
        
    col_probabilities = []
        
    for index, val in col_duplicate.value_counts().iteritems():
        col_probabilities.append(round(val/col_duplicate.value_counts().sum(), 2))
    
    for i in range(len(col_probabilities)):
        if round(sum(col_probabilities), 2) >= 1:
            break
        col_probabilities[i] += 0.01
    
    for i in range(len(col_probabilities)):
        if round(sum(col_probabilities), 2) <= 1:
            break
        col_probabilities[i] -= 0.01
    
    col_values = col_duplicate.value_counts().index.values.tolist()
        
    col_duplicate = col_duplicate.fillna(pd.Series(np.random.choice(col_values, 
                                                                      p=col_probabilities, size=len(col_duplicate))))
        
    for index, val in data_selected_ratio[col].iteritems():
        if col_duplicate[index] != val:
            print("Index: ", index, ", ", col, " duplicate value: ", col_duplicate[index], ", original value: ", val)
            
    data_selected_ratio[col] = col_duplicate.copy()
    
    print(col, "done!!!\n")

Col Name:  trestbps 
571 nan
631 nan
688 nan
712 nan
722 nan
725 nan
728 nan
729 nan
735 nan
737 nan
741 nan
743 nan
747 nan
750 nan
756 nan
757 nan
759 nan
767 nan
768 nan
771 nan
776 nan
786 nan
788 nan
793 nan
795 nan
808 nan
809 nan
810 nan
811 nan
812 nan
813 nan
814 nan
818 nan
819 nan
823 nan
828 nan
832 nan
835 nan
836 nan
840 nan
841 nan
843 nan
844 nan
846 nan
848 nan
854 nan
857 nan
858 nan
859 nan
860 nan
862 nan
863 nan
866 nan
867 nan
873 nan
880 nan
884 nan
895 nan
897 nan
Index:  571 ,  trestbps  duplicate value:  135.0 , original value:  nan
Index:  631 ,  trestbps  duplicate value:  160.0 , original value:  nan
Index:  688 ,  trestbps  duplicate value:  110.0 , original value:  nan
Index:  712 ,  trestbps  duplicate value:  120.0 , original value:  nan
Index:  722 ,  trestbps  duplicate value:  120.0 , original value:  nan
Index:  725 ,  trestbps  duplicate value:  170.0 , original value:  nan
Index:  728 ,  trestbps  duplicate value:  130.0 , original value:  nan
Ind

Index:  591 ,  fbs  duplicate value:  1.0 , original value:  nan
Index:  592 ,  fbs  duplicate value:  0.0 , original value:  nan
Index:  593 ,  fbs  duplicate value:  0.0 , original value:  nan
Index:  594 ,  fbs  duplicate value:  0.0 , original value:  nan
Index:  595 ,  fbs  duplicate value:  1.0 , original value:  nan
Index:  596 ,  fbs  duplicate value:  0.0 , original value:  nan
Index:  597 ,  fbs  duplicate value:  0.0 , original value:  nan
Index:  598 ,  fbs  duplicate value:  0.0 , original value:  nan
Index:  599 ,  fbs  duplicate value:  0.0 , original value:  nan
Index:  600 ,  fbs  duplicate value:  0.0 , original value:  nan
Index:  601 ,  fbs  duplicate value:  0.0 , original value:  nan
Index:  602 ,  fbs  duplicate value:  0.0 , original value:  nan
Index:  603 ,  fbs  duplicate value:  0.0 , original value:  nan
Index:  604 ,  fbs  duplicate value:  0.0 , original value:  nan
Index:  605 ,  fbs  duplicate value:  0.0 , original value:  nan
Index:  606 ,  fbs  dupli

Index:  771 ,  exang  duplicate value:  1.0 , original value:  nan
Index:  776 ,  exang  duplicate value:  0.0 , original value:  nan
Index:  786 ,  exang  duplicate value:  0.0 , original value:  nan
Index:  788 ,  exang  duplicate value:  0.0 , original value:  nan
Index:  793 ,  exang  duplicate value:  0.0 , original value:  nan
Index:  795 ,  exang  duplicate value:  0.0 , original value:  nan
Index:  808 ,  exang  duplicate value:  1.0 , original value:  nan
Index:  809 ,  exang  duplicate value:  0.0 , original value:  nan
Index:  810 ,  exang  duplicate value:  0.0 , original value:  nan
Index:  811 ,  exang  duplicate value:  1.0 , original value:  nan
Index:  812 ,  exang  duplicate value:  0.0 , original value:  nan
Index:  813 ,  exang  duplicate value:  0.0 , original value:  nan
Index:  814 ,  exang  duplicate value:  1.0 , original value:  nan
Index:  818 ,  exang  duplicate value:  0.0 , original value:  nan
Index:  819 ,  exang  duplicate value:  1.0 , original value: 

455 nan
456 nan
460 nan
461 nan
462 nan
464 nan
467 nan
468 nan
469 nan
473 nan
474 nan
475 nan
476 nan
477 nan
478 nan
480 nan
481 nan
483 nan
484 nan
485 nan
486 nan
487 nan
488 nan
489 nan
490 nan
491 nan
492 nan
493 nan
494 nan
496 nan
498 nan
499 nan
500 nan
501 nan
502 nan
503 nan
505 nan
506 nan
507 nan
508 nan
509 nan
511 nan
512 nan
513 nan
514 nan
515 nan
516 nan
517 nan
520 nan
523 nan
526 nan
529 nan
533 nan
535 nan
538 nan
539 nan
540 nan
542 nan
543 nan
545 nan
546 nan
548 nan
550 nan
553 nan
554 nan
556 nan
557 nan
558 nan
562 nan
563 nan
566 nan
567 nan
568 nan
569 nan
570 nan
572 nan
573 nan
575 nan
611 nan
631 nan
660 nan
676 nan
678 nan
679 nan
680 nan
682 nan
683 nan
684 nan
685 nan
687 nan
688 nan
689 nan
691 nan
692 nan
698 nan
700 nan
705 nan
709 nan
712 nan
714 nan
718 nan
721 nan
722 nan
725 nan
728 nan
729 nan
735 nan
737 nan
738 nan
740 nan
742 nan
743 nan
744 nan
746 nan
748 nan
750 nan
753 nan
756 nan
757 nan
758 nan
759 nan
762 nan
767 nan
768 nan
769 nan


Index:  526 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  529 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  533 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  535 ,  slope  duplicate value:  1.0 , original value:  nan
Index:  538 ,  slope  duplicate value:  1.0 , original value:  nan
Index:  539 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  540 ,  slope  duplicate value:  1.0 , original value:  nan
Index:  542 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  543 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  545 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  546 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  548 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  550 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  553 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  554 ,  slope  duplicate value:  2.0 , original value: 

621 nan
622 nan
623 nan
624 nan
625 nan
626 nan
627 nan
628 nan
629 nan
630 nan
631 nan
632 nan
633 nan
634 nan
635 nan
636 nan
637 nan
638 nan
639 nan
640 nan
641 nan
642 nan
643 nan
644 nan
645 nan
646 nan
647 nan
648 nan
649 nan
650 nan
651 nan
652 nan
653 nan
654 nan
655 nan
656 nan
657 nan
658 nan
659 nan
660 nan
661 nan
662 nan
663 nan
664 nan
665 nan
666 nan
667 nan
668 nan
669 nan
670 nan
671 nan
672 nan
673 nan
674 nan
675 nan
676 nan
677 nan
678 nan
679 nan
680 nan
681 nan
682 nan
683 nan
684 nan
685 nan
686 nan
687 nan
688 nan
689 nan
690 nan
691 nan
692 nan
693 nan
694 nan
695 nan
696 nan
697 nan
698 nan
699 nan
700 nan
701 nan
702 nan
703 nan
704 nan
705 nan
706 nan
707 nan
708 nan
709 nan
710 nan
711 nan
712 nan
713 nan
714 nan
715 nan
716 nan
717 nan
718 nan
719 nan
720 nan
721 nan
722 nan
723 nan
724 nan
725 nan
726 nan
728 nan
729 nan
730 nan
731 nan
732 nan
733 nan
734 nan
735 nan
736 nan
737 nan
739 nan
740 nan
741 nan
742 nan
743 nan
744 nan
745 nan
746 nan
747 nan


Index:  423 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  424 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  425 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  426 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  427 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  428 ,  ca  duplicate value:  3.0 , original value:  nan
Index:  429 ,  ca  duplicate value:  1.0 , original value:  nan
Index:  431 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  432 ,  ca  duplicate value:  1.0 , original value:  nan
Index:  433 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  434 ,  ca  duplicate value:  1.0 , original value:  nan
Index:  435 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  436 ,  ca  duplicate value:  1.0 , original value:  nan
Index:  437 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  438 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  439 ,  ca  duplicate value:  0.0

Index:  659 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  660 ,  ca  duplicate value:  1.0 , original value:  nan
Index:  661 ,  ca  duplicate value:  3.0 , original value:  nan
Index:  662 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  663 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  664 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  665 ,  ca  duplicate value:  1.0 , original value:  nan
Index:  666 ,  ca  duplicate value:  1.0 , original value:  nan
Index:  667 ,  ca  duplicate value:  1.0 , original value:  nan
Index:  668 ,  ca  duplicate value:  2.0 , original value:  nan
Index:  669 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  670 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  671 ,  ca  duplicate value:  1.0 , original value:  nan
Index:  672 ,  ca  duplicate value:  2.0 , original value:  nan
Index:  673 ,  ca  duplicate value:  1.0 , original value:  nan
Index:  674 ,  ca  duplicate value:  2.0

Index:  890 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  891 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  892 ,  ca  duplicate value:  1.0 , original value:  nan
Index:  893 ,  ca  duplicate value:  2.0 , original value:  nan
Index:  894 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  895 ,  ca  duplicate value:  3.0 , original value:  nan
Index:  896 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  897 ,  ca  duplicate value:  3.0 , original value:  nan
Index:  898 ,  ca  duplicate value:  0.0 , original value:  nan
ca done!!!

Col Name:  thal 
87 nan
266 nan
282 nan
283 nan
284 nan
285 nan
286 nan
287 nan
288 nan
289 nan
290 nan
291 nan
292 nan
293 nan
294 nan
295 nan
296 nan
297 nan
298 nan
299 nan
300 nan
301 nan
302 nan
303 nan
304 nan
305 nan
306 nan
307 nan
308 nan
309 nan
310 nan
311 nan
312 nan
313 nan
314 nan
315 nan
316 nan
317 nan
318 nan
319 nan
320 nan
321 nan
322 nan
323 nan
324 nan
325 nan
334 nan
335 nan
336 nan
337 

Index:  396 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  397 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  398 ,  thal  duplicate value:  6.0 , original value:  nan
Index:  399 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  400 ,  thal  duplicate value:  1.0 , original value:  nan
Index:  401 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  402 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  403 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  406 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  407 ,  thal  duplicate value:  6.0 , original value:  nan
Index:  408 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  409 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  410 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  411 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  412 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  41

Index:  625 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  669 ,  thal  duplicate value:  6.0 , original value:  nan
Index:  699 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  700 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  701 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  702 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  703 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  705 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  706 ,  thal  duplicate value:  1.0 , original value:  nan
Index:  707 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  708 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  709 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  710 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  712 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  713 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  71

In [42]:
print("No of rows with nan : ", sum([True for index,row in data_selected_ratio.iterrows() if any(row.isnull())]))

No of rows with nan :  0


In [43]:
data_selected_ratio.to_csv('datasets/alldata_short.csv', index=False)

## Replacing with Mean

In [44]:
for col in cols_with_nan:
    col_mean_value = data_selected_mean[col].mean()
    print("Mean of", col, "attribute:", col_mean_value)
    data_selected_mean[col].fillna(value = col_mean_value, inplace = True)
    comparison_array = np.where(data_selected_mean[col] == data_selected[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mean of trestbps attribute: 132.10119047619048
No of changes in col: trestbps :  59
Mean of chol attribute: 198.7594936708861
No of changes in col: chol :  30
Mean of fbs attribute: 0.1668726823238566
No of changes in col: fbs :  90
Mean of restecg attribute: 0.6031215161649944
No of changes in col: restecg :  2
Mean of thalach attribute: 137.29857819905214
No of changes in col: thalach :  55
Mean of exang attribute: 0.3909952606635071
No of changes in col: exang :  55
Mean of oldpeak attribute: 0.8704898446833931
No of changes in col: oldpeak :  62
Mean of slope attribute: 1.766497461928934
No of changes in col: slope :  308
Mean of ca attribute: 0.697594501718213
No of changes in col: ca :  608
Mean of thal attribute: 5.018957345971564
No of changes in col: thal :  477


In [45]:
# data_selected_mean.to_csv('datasets/alldata_short.csv', index=False)

## Replacing with Mode

In [46]:
for col in cols_with_nan:
    col_mode_value = data_selected_mode[col].mode()
    print("Mode of", col, "attribute:", col_mode_value)
    data_selected_mode[col].fillna(value = col_mode_value, inplace = True)
    comparison_array = np.where(data_selected_mode[col] == data_selected[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mode of trestbps attribute: 0    120.0
dtype: float64
No of changes in col: trestbps :  59
Mode of chol attribute: 0    0.0
dtype: float64
No of changes in col: chol :  30
Mode of fbs attribute: 0    0.0
dtype: float64
No of changes in col: fbs :  90
Mode of restecg attribute: 0    0.0
dtype: float64
No of changes in col: restecg :  2
Mode of thalach attribute: 0    150.0
dtype: float64
No of changes in col: thalach :  55
Mode of exang attribute: 0    0.0
dtype: float64
No of changes in col: exang :  55
Mode of oldpeak attribute: 0    0.0
dtype: float64
No of changes in col: oldpeak :  62
Mode of slope attribute: 0    2.0
dtype: float64
No of changes in col: slope :  308
Mode of ca attribute: 0    0.0
dtype: float64
No of changes in col: ca :  608
Mode of thal attribute: 0    3.0
dtype: float64
No of changes in col: thal :  477


In [47]:
# data_selected_mode.to_csv('datasets/alldata_short.csv', index=False)

## Replacing with Median

In [48]:
for col in cols_with_nan:
    col_median_value = data_selected_median[col].median()
    print("Median of", col, "attribute:", col_median_value)
    data_selected_median[col].fillna(value = col_median_value, inplace = True)
    comparison_array = np.where(data_selected_median[col] == data_selected[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Median of trestbps attribute: 130.0
No of changes in col: trestbps :  59
Median of chol attribute: 224.0
No of changes in col: chol :  30
Median of fbs attribute: 0.0
No of changes in col: fbs :  90
Median of restecg attribute: 0.0
No of changes in col: restecg :  2
Median of thalach attribute: 140.0
No of changes in col: thalach :  55
Median of exang attribute: 0.0
No of changes in col: exang :  55
Median of oldpeak attribute: 0.5
No of changes in col: oldpeak :  62
Median of slope attribute: 2.0
No of changes in col: slope :  308
Median of ca attribute: 0.0
No of changes in col: ca :  608
Median of thal attribute: 6.0
No of changes in col: thal :  477


In [49]:
# data_selected_median.to_csv('datasets/alldata_short.csv', index=False)

## Processing High Dimensional Data with NaN values 

Considering for NaN values and figuring out which columns to remove and which to keep

In [50]:
cols_with_nan_large = data_prefinal.columns.values[data_prefinal.isna().any()]
display(cols_with_nan_large)

array(['painloc', 'painexer', 'relrest', 'trestbps', 'htn', 'chol',
       'smoke', 'cigs', 'years', 'fbs', 'dm', 'famhist', 'restecg',
       'ekgmo', 'ekgday', 'ekgyr', 'dig', 'prop', 'nitr', 'pro',
       'diuretic', 'proto', 'thaldur', 'thaltime', 'met', 'thalach',
       'thalrest', 'tpeakbps', 'tpeakbpd', 'dummy', 'trestbpd', 'exang',
       'xhypo', 'oldpeak', 'slope', 'rldv5', 'rldv5e', 'ca', 'restef',
       'restwm', 'exeref', 'exerwm', 'thal', 'thalsev', 'thalpul', 'cmo',
       'cday', 'cyr', 'lmt', 'ladprox', 'laddist', 'diag', 'cxmain',
       'ramus', 'om1', 'om2', 'rcaprox', 'rcadist', 'lvx1', 'lvx2',
       'lvx3', 'lvx4', 'lvf', 'cathef', 'junk'], dtype=object)

In [51]:
data_prefinal[cols_with_nan_large].describe()

Unnamed: 0,painloc,painexer,relrest,trestbps,htn,chol,smoke,cigs,years,fbs,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
count,617.0,617.0,613.0,840.0,865.0,869.0,230.0,479.0,467.0,809.0,...,327.0,654.0,629.0,880.0,880.0,880.0,880.0,883.0,311.0,119.0
mean,0.920583,0.593193,0.672104,132.10119,0.476301,198.759494,0.517391,19.118998,18.796574,0.166873,...,1.067278,1.342508,1.171701,1.020455,1.032955,1.132955,1.611364,1.178935,27.623119,5.869748
std,0.270607,0.491637,0.46983,19.151127,0.499727,111.834415,0.500787,18.296273,16.359145,0.373093,...,0.250887,0.474912,0.377421,0.277384,0.415902,0.703837,1.722199,0.512572,31.675295,1.650914
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.22,3.3
25%,1.0,0.0,0.0,120.0,0.0,175.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.685,4.8
50%,1.0,1.0,1.0,130.0,0.0,224.0,1.0,20.0,20.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.82,5.6
75%,1.0,1.0,1.0,140.0,1.0,269.0,1.0,30.0,30.0,0.0,...,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,63.0,6.9
max,1.0,1.0,1.0,200.0,1.0,603.0,1.0,99.0,60.0,1.0,...,2.0,2.0,2.0,7.0,10.0,8.0,8.0,5.0,86.0,11.3


In [52]:
nan_count_large = dict()

for col in cols_with_nan_large:
    nan_count_large[col] = data_prefinal[col].isna().sum()
    print('Column Name: ', col)
    print('NaN Count: ', nan_count_large[col])
    print(data_prefinal[col].value_counts())
    print("\n")

Column Name:  painloc
NaN Count:  282
1.0    568
0.0     49
Name: painloc, dtype: int64


Column Name:  painexer
NaN Count:  282
1.0    366
0.0    251
Name: painexer, dtype: int64


Column Name:  relrest
NaN Count:  286
1.0    412
0.0    201
Name: relrest, dtype: int64


Column Name:  trestbps
NaN Count:  59
120.0    128
130.0    112
140.0    100
110.0     58
150.0     56
160.0     50
125.0     28
115.0     19
135.0     18
145.0     16
128.0     16
100.0     15
112.0     14
138.0     14
170.0     13
180.0     12
124.0     11
132.0     11
122.0     11
118.0     10
142.0      9
134.0      9
105.0      9
136.0      8
155.0      8
108.0      7
126.0      7
95.0       6
152.0      6
144.0      5
158.0      4
200.0      4
102.0      3
104.0      3
154.0      3
106.0      3
178.0      3
146.0      3
190.0      2
116.0      2
156.0      2
165.0      2
172.0      2
94.0       2
123.0      1
117.0      1
114.0      1
96.0       1
0.0        1
185.0      1
80.0       1
98.0       1
129.0      1
9

In [53]:
cols_to_be_removed = list()

for col in cols_with_nan_large:
    if data_prefinal[col].isna().sum() > 0.25*data_prefinal.shape[0] and col not in selected_columns:
        cols_to_be_removed.append(col)

print(cols_to_be_removed)
print(len(cols_to_be_removed))

['painloc', 'painexer', 'relrest', 'smoke', 'cigs', 'years', 'dm', 'famhist', 'thaltime', 'rldv5', 'restef', 'restwm', 'exeref', 'exerwm', 'thalsev', 'thalpul', 'lmt', 'ladprox', 'laddist', 'diag', 'cxmain', 'ramus', 'om1', 'om2', 'rcaprox', 'rcadist', 'cathef', 'junk']
28


In [54]:
data_final = data_prefinal.drop(cols_to_be_removed, axis = 1)

In [55]:
data_final.describe()

Unnamed: 0,age,sex,cp,trestbps,htn,chol,fbs,restecg,ekgmo,ekgday,...,thal,cmo,cday,cyr,num,lvx1,lvx2,lvx3,lvx4,lvf
count,899.0,899.0,899.0,840.0,865.0,869.0,809.0,897.0,846.0,845.0,...,422.0,888.0,890.0,890.0,899.0,880.0,880.0,880.0,880.0,883.0
mean,53.480534,0.790879,3.253615,132.10119,0.476301,198.759494,0.166873,0.603122,5.973995,15.493491,...,5.018957,6.122748,15.988764,83.839326,1.129032,1.020455,1.032955,1.132955,1.611364,1.178935
std,9.435894,0.406908,0.928499,19.151127,0.499727,111.834415,0.373093,0.803669,3.486479,8.761939,...,1.949388,3.474114,8.860872,4.407533,1.25972,0.277384,0.415902,0.703837,1.722199,0.512572
min,28.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
25%,47.0,1.0,3.0,120.0,0.0,175.0,0.0,0.0,3.0,8.0,...,3.0,3.0,8.0,83.0,0.0,1.0,1.0,1.0,1.0,1.0
50%,54.0,1.0,4.0,130.0,0.0,224.0,0.0,0.0,6.0,16.0,...,6.0,6.0,16.0,84.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,60.0,1.0,4.0,140.0,1.0,269.0,0.0,1.0,9.0,23.0,...,7.0,9.0,23.75,85.0,2.0,1.0,1.0,1.0,1.0,1.0
max,77.0,1.0,4.0,200.0,1.0,603.0,1.0,2.0,12.0,31.0,...,7.0,12.0,31.0,87.0,4.0,7.0,10.0,8.0,8.0,5.0


In [56]:
print("No of rows with nan : ", sum([True for index,row in data_final.iterrows() if any(row.isnull())]))

No of rows with nan :  621


In [57]:
data_final_ratio = data_final.copy()
data_final_mean = data_final.copy()
data_final_mode = data_final.copy()
data_final_median = data_final.copy()

## Imputing with proportion of each value in dataset

In [58]:
cols_with_nan_large = data_final_ratio.columns.values[data_final_ratio.isna().any()]

for col in cols_with_nan_large:
    print("Col Name: ", col, "")
    
    col_duplicate = data_final_ratio[col].copy()
    
    for index, val in col_duplicate.iteritems():
        if np.isnan(val):
            print(index, val)
        
    col_probabilities = []
        
    for index, val in col_duplicate.value_counts().iteritems():
        col_probabilities.append(round(val/col_duplicate.value_counts().sum(), 2))
    
    for i in range(len(col_probabilities)):
        if round(sum(col_probabilities), 2) >= 1:
            break
        col_probabilities[i] += 0.01
        
    for i in range(len(col_probabilities)):
        if round(sum(col_probabilities), 2) <= 1:
            break
        col_probabilities[i] -= 0.01 
    
    col_values = col_duplicate.value_counts().index.values.tolist()
        
    col_duplicate = col_duplicate.fillna(pd.Series(np.random.choice(col_values, 
                                                                      p=col_probabilities, size=len(col_duplicate))))
        
    for index, val in data_final_ratio[col].iteritems():
        if col_duplicate[index] != val:
            print("Index: ", index, ", ", col, " duplicate value: ", col_duplicate[index], ", original value: ", val)
            
    data_final_ratio[col] = col_duplicate.copy()
    
    print(col, "done!!!\n")

Col Name:  trestbps 
571 nan
631 nan
688 nan
712 nan
722 nan
725 nan
728 nan
729 nan
735 nan
737 nan
741 nan
743 nan
747 nan
750 nan
756 nan
757 nan
759 nan
767 nan
768 nan
771 nan
776 nan
786 nan
788 nan
793 nan
795 nan
808 nan
809 nan
810 nan
811 nan
812 nan
813 nan
814 nan
818 nan
819 nan
823 nan
828 nan
832 nan
835 nan
836 nan
840 nan
841 nan
843 nan
844 nan
846 nan
848 nan
854 nan
857 nan
858 nan
859 nan
860 nan
862 nan
863 nan
866 nan
867 nan
873 nan
880 nan
884 nan
895 nan
897 nan
Index:  571 ,  trestbps  duplicate value:  128.0 , original value:  nan
Index:  631 ,  trestbps  duplicate value:  110.0 , original value:  nan
Index:  688 ,  trestbps  duplicate value:  140.0 , original value:  nan
Index:  712 ,  trestbps  duplicate value:  130.0 , original value:  nan
Index:  722 ,  trestbps  duplicate value:  170.0 , original value:  nan
Index:  725 ,  trestbps  duplicate value:  120.0 , original value:  nan
Index:  728 ,  trestbps  duplicate value:  135.0 , original value:  nan
Ind

Index:  818 ,  ekgmo  duplicate value:  3.0 , original value:  nan
Index:  819 ,  ekgmo  duplicate value:  3.0 , original value:  nan
Index:  823 ,  ekgmo  duplicate value:  12.0 , original value:  nan
Index:  828 ,  ekgmo  duplicate value:  11.0 , original value:  nan
Index:  832 ,  ekgmo  duplicate value:  12.0 , original value:  nan
Index:  835 ,  ekgmo  duplicate value:  10.0 , original value:  nan
Index:  836 ,  ekgmo  duplicate value:  12.0 , original value:  nan
Index:  840 ,  ekgmo  duplicate value:  4.0 , original value:  nan
Index:  841 ,  ekgmo  duplicate value:  3.0 , original value:  nan
Index:  843 ,  ekgmo  duplicate value:  3.0 , original value:  nan
Index:  844 ,  ekgmo  duplicate value:  4.0 , original value:  nan
Index:  846 ,  ekgmo  duplicate value:  4.0 , original value:  nan
Index:  848 ,  ekgmo  duplicate value:  5.0 , original value:  nan
Index:  854 ,  ekgmo  duplicate value:  8.0 , original value:  nan
Index:  857 ,  ekgmo  duplicate value:  5.0 , original va

Index:  725 ,  dig  duplicate value:  0.0 , original value:  nan
Index:  728 ,  dig  duplicate value:  0.0 , original value:  nan
Index:  729 ,  dig  duplicate value:  0.0 , original value:  nan
Index:  735 ,  dig  duplicate value:  0.0 , original value:  nan
Index:  737 ,  dig  duplicate value:  0.0 , original value:  nan
Index:  741 ,  dig  duplicate value:  0.0 , original value:  nan
Index:  743 ,  dig  duplicate value:  0.0 , original value:  nan
Index:  750 ,  dig  duplicate value:  0.0 , original value:  nan
Index:  756 ,  dig  duplicate value:  0.0 , original value:  nan
Index:  757 ,  dig  duplicate value:  1.0 , original value:  nan
Index:  759 ,  dig  duplicate value:  0.0 , original value:  nan
Index:  764 ,  dig  duplicate value:  0.0 , original value:  nan
Index:  767 ,  dig  duplicate value:  0.0 , original value:  nan
Index:  768 ,  dig  duplicate value:  1.0 , original value:  nan
Index:  770 ,  dig  duplicate value:  0.0 , original value:  nan
Index:  771 ,  dig  dupli

Index:  844 ,  nitr  duplicate value:  1.0 , original value:  nan
Index:  846 ,  nitr  duplicate value:  0.0 , original value:  nan
Index:  848 ,  nitr  duplicate value:  0.0 , original value:  nan
Index:  849 ,  nitr  duplicate value:  0.0 , original value:  nan
Index:  854 ,  nitr  duplicate value:  0.0 , original value:  nan
Index:  856 ,  nitr  duplicate value:  1.0 , original value:  nan
Index:  857 ,  nitr  duplicate value:  0.0 , original value:  nan
Index:  858 ,  nitr  duplicate value:  0.0 , original value:  nan
Index:  859 ,  nitr  duplicate value:  0.0 , original value:  nan
Index:  860 ,  nitr  duplicate value:  0.0 , original value:  nan
Index:  862 ,  nitr  duplicate value:  0.0 , original value:  nan
Index:  863 ,  nitr  duplicate value:  0.0 , original value:  nan
Index:  866 ,  nitr  duplicate value:  0.0 , original value:  nan
Index:  867 ,  nitr  duplicate value:  0.0 , original value:  nan
Index:  873 ,  nitr  duplicate value:  0.0 , original value:  nan
Index:  88

848 nan
854 nan
857 nan
858 nan
860 nan
862 nan
863 nan
866 nan
867 nan
873 nan
880 nan
884 nan
895 nan
897 nan
Index:  283 ,  proto  duplicate value:  1.0 , original value:  nan
Index:  342 ,  proto  duplicate value:  1.0 , original value:  nan
Index:  362 ,  proto  duplicate value:  5.0 , original value:  nan
Index:  414 ,  proto  duplicate value:  125.0 , original value:  nan
Index:  436 ,  proto  duplicate value:  100.0 , original value:  nan
Index:  465 ,  proto  duplicate value:  1.0 , original value:  nan
Index:  468 ,  proto  duplicate value:  1.0 , original value:  nan
Index:  507 ,  proto  duplicate value:  50.0 , original value:  nan
Index:  546 ,  proto  duplicate value:  1.0 , original value:  nan
Index:  577 ,  proto  duplicate value:  1.0 , original value:  nan
Index:  578 ,  proto  duplicate value:  1.0 , original value:  nan
Index:  579 ,  proto  duplicate value:  1.0 , original value:  nan
Index:  580 ,  proto  duplicate value:  50.0 , original value:  nan
Index:  581

Index:  832 ,  thaldur  duplicate value:  10.5 , original value:  nan
Index:  835 ,  thaldur  duplicate value:  10.0 , original value:  nan
Index:  836 ,  thaldur  duplicate value:  13.0 , original value:  nan
Index:  840 ,  thaldur  duplicate value:  12.0 , original value:  nan
Index:  841 ,  thaldur  duplicate value:  6.0 , original value:  nan
Index:  843 ,  thaldur  duplicate value:  20.0 , original value:  nan
Index:  844 ,  thaldur  duplicate value:  9.0 , original value:  nan
Index:  846 ,  thaldur  duplicate value:  14.0 , original value:  nan
Index:  848 ,  thaldur  duplicate value:  4.5 , original value:  nan
Index:  854 ,  thaldur  duplicate value:  20.0 , original value:  nan
Index:  857 ,  thaldur  duplicate value:  6.0 , original value:  nan
Index:  858 ,  thaldur  duplicate value:  9.0 , original value:  nan
Index:  860 ,  thaldur  duplicate value:  12.0 , original value:  nan
Index:  862 ,  thaldur  duplicate value:  14.0 , original value:  nan
Index:  863 ,  thaldur  d

Index:  571 ,  thalach  duplicate value:  156.0 , original value:  nan
Index:  631 ,  thalach  duplicate value:  145.0 , original value:  nan
Index:  712 ,  thalach  duplicate value:  100.0 , original value:  nan
Index:  722 ,  thalach  duplicate value:  111.0 , original value:  nan
Index:  725 ,  thalach  duplicate value:  125.0 , original value:  nan
Index:  728 ,  thalach  duplicate value:  120.0 , original value:  nan
Index:  729 ,  thalach  duplicate value:  182.0 , original value:  nan
Index:  735 ,  thalach  duplicate value:  114.0 , original value:  nan
Index:  737 ,  thalach  duplicate value:  175.0 , original value:  nan
Index:  743 ,  thalach  duplicate value:  126.0 , original value:  nan
Index:  750 ,  thalach  duplicate value:  173.0 , original value:  nan
Index:  756 ,  thalach  duplicate value:  120.0 , original value:  nan
Index:  757 ,  thalach  duplicate value:  130.0 , original value:  nan
Index:  759 ,  thalach  duplicate value:  130.0 , original value:  nan
Index:

Index:  589 ,  tpeakbps  duplicate value:  165.0 , original value:  nan
Index:  631 ,  tpeakbps  duplicate value:  180.0 , original value:  nan
Index:  688 ,  tpeakbps  duplicate value:  190.0 , original value:  nan
Index:  712 ,  tpeakbps  duplicate value:  180.0 , original value:  nan
Index:  722 ,  tpeakbps  duplicate value:  145.0 , original value:  nan
Index:  725 ,  tpeakbps  duplicate value:  165.0 , original value:  nan
Index:  728 ,  tpeakbps  duplicate value:  160.0 , original value:  nan
Index:  729 ,  tpeakbps  duplicate value:  140.0 , original value:  nan
Index:  735 ,  tpeakbps  duplicate value:  220.0 , original value:  nan
Index:  737 ,  tpeakbps  duplicate value:  180.0 , original value:  nan
Index:  741 ,  tpeakbps  duplicate value:  152.0 , original value:  nan
Index:  743 ,  tpeakbps  duplicate value:  190.0 , original value:  nan
Index:  750 ,  tpeakbps  duplicate value:  215.0 , original value:  nan
Index:  756 ,  tpeakbps  duplicate value:  150.0 , original valu

Index:  741 ,  dummy  duplicate value:  150.0 , original value:  nan
Index:  743 ,  dummy  duplicate value:  112.0 , original value:  nan
Index:  747 ,  dummy  duplicate value:  110.0 , original value:  nan
Index:  750 ,  dummy  duplicate value:  130.0 , original value:  nan
Index:  756 ,  dummy  duplicate value:  140.0 , original value:  nan
Index:  757 ,  dummy  duplicate value:  122.0 , original value:  nan
Index:  759 ,  dummy  duplicate value:  132.0 , original value:  nan
Index:  767 ,  dummy  duplicate value:  150.0 , original value:  nan
Index:  768 ,  dummy  duplicate value:  140.0 , original value:  nan
Index:  771 ,  dummy  duplicate value:  160.0 , original value:  nan
Index:  776 ,  dummy  duplicate value:  95.0 , original value:  nan
Index:  786 ,  dummy  duplicate value:  160.0 , original value:  nan
Index:  788 ,  dummy  duplicate value:  130.0 , original value:  nan
Index:  793 ,  dummy  duplicate value:  110.0 , original value:  nan
Index:  795 ,  dummy  duplicate val

Index:  880 ,  exang  duplicate value:  0.0 , original value:  nan
Index:  884 ,  exang  duplicate value:  0.0 , original value:  nan
Index:  895 ,  exang  duplicate value:  1.0 , original value:  nan
Index:  897 ,  exang  duplicate value:  1.0 , original value:  nan
exang done!!!

Col Name:  xhypo 
495 nan
571 nan
590 nan
623 nan
631 nan
712 nan
722 nan
725 nan
728 nan
729 nan
735 nan
737 nan
743 nan
750 nan
756 nan
757 nan
759 nan
767 nan
768 nan
771 nan
776 nan
786 nan
788 nan
793 nan
795 nan
808 nan
809 nan
810 nan
811 nan
812 nan
813 nan
814 nan
818 nan
819 nan
823 nan
828 nan
832 nan
835 nan
836 nan
840 nan
841 nan
843 nan
844 nan
846 nan
848 nan
854 nan
857 nan
858 nan
860 nan
862 nan
863 nan
866 nan
867 nan
873 nan
880 nan
884 nan
895 nan
897 nan
Index:  495 ,  xhypo  duplicate value:  0.0 , original value:  nan
Index:  571 ,  xhypo  duplicate value:  0.0 , original value:  nan
Index:  590 ,  xhypo  duplicate value:  0.0 , original value:  nan
Index:  623 ,  xhypo  duplicate va

398 nan
400 nan
401 nan
402 nan
403 nan
404 nan
406 nan
407 nan
408 nan
410 nan
412 nan
413 nan
417 nan
418 nan
420 nan
422 nan
425 nan
427 nan
428 nan
429 nan
430 nan
432 nan
433 nan
434 nan
435 nan
436 nan
439 nan
442 nan
443 nan
444 nan
445 nan
446 nan
447 nan
450 nan
451 nan
452 nan
453 nan
454 nan
455 nan
456 nan
460 nan
461 nan
462 nan
464 nan
467 nan
468 nan
469 nan
473 nan
474 nan
475 nan
476 nan
477 nan
478 nan
480 nan
481 nan
483 nan
484 nan
485 nan
486 nan
487 nan
488 nan
489 nan
490 nan
491 nan
492 nan
493 nan
494 nan
496 nan
498 nan
499 nan
500 nan
501 nan
502 nan
503 nan
505 nan
506 nan
507 nan
508 nan
509 nan
511 nan
512 nan
513 nan
514 nan
515 nan
516 nan
517 nan
520 nan
523 nan
526 nan
529 nan
533 nan
535 nan
538 nan
539 nan
540 nan
542 nan
543 nan
545 nan
546 nan
548 nan
550 nan
553 nan
554 nan
556 nan
557 nan
558 nan
562 nan
563 nan
566 nan
567 nan
568 nan
569 nan
570 nan
572 nan
573 nan
575 nan
611 nan
631 nan
660 nan
676 nan
678 nan
679 nan
680 nan
682 nan
683 nan


Index:  539 ,  slope  duplicate value:  1.0 , original value:  nan
Index:  540 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  542 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  543 ,  slope  duplicate value:  3.0 , original value:  nan
Index:  545 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  546 ,  slope  duplicate value:  1.0 , original value:  nan
Index:  548 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  550 ,  slope  duplicate value:  1.0 , original value:  nan
Index:  553 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  554 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  556 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  557 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  558 ,  slope  duplicate value:  1.0 , original value:  nan
Index:  562 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  563 ,  slope  duplicate value:  2.0 , original value: 

Index:  576 ,  rldv5e  duplicate value:  30.0 , original value:  nan
Index:  610 ,  rldv5e  duplicate value:  15.0 , original value:  nan
Index:  611 ,  rldv5e  duplicate value:  6.0 , original value:  nan
Index:  616 ,  rldv5e  duplicate value:  14.0 , original value:  nan
Index:  626 ,  rldv5e  duplicate value:  115.0 , original value:  nan
Index:  627 ,  rldv5e  duplicate value:  10.0 , original value:  nan
Index:  628 ,  rldv5e  duplicate value:  14.0 , original value:  nan
Index:  629 ,  rldv5e  duplicate value:  16.0 , original value:  nan
Index:  630 ,  rldv5e  duplicate value:  120.0 , original value:  nan
Index:  631 ,  rldv5e  duplicate value:  190.0 , original value:  nan
Index:  632 ,  rldv5e  duplicate value:  9.0 , original value:  nan
Index:  633 ,  rldv5e  duplicate value:  176.0 , original value:  nan
Index:  634 ,  rldv5e  duplicate value:  20.0 , original value:  nan
Index:  635 ,  rldv5e  duplicate value:  18.0 , original value:  nan
Index:  636 ,  rldv5e  duplicate

432 nan
433 nan
434 nan
435 nan
436 nan
437 nan
438 nan
439 nan
440 nan
441 nan
442 nan
443 nan
444 nan
445 nan
446 nan
447 nan
448 nan
449 nan
450 nan
451 nan
452 nan
453 nan
454 nan
455 nan
456 nan
457 nan
458 nan
459 nan
460 nan
461 nan
462 nan
463 nan
464 nan
465 nan
466 nan
467 nan
468 nan
469 nan
470 nan
471 nan
472 nan
473 nan
474 nan
475 nan
476 nan
477 nan
478 nan
479 nan
480 nan
481 nan
482 nan
483 nan
484 nan
485 nan
486 nan
487 nan
488 nan
489 nan
490 nan
491 nan
492 nan
493 nan
494 nan
495 nan
496 nan
498 nan
499 nan
500 nan
501 nan
502 nan
503 nan
504 nan
505 nan
506 nan
507 nan
508 nan
509 nan
510 nan
511 nan
512 nan
513 nan
514 nan
515 nan
516 nan
517 nan
518 nan
519 nan
520 nan
521 nan
522 nan
523 nan
524 nan
525 nan
526 nan
527 nan
528 nan
529 nan
530 nan
531 nan
532 nan
533 nan
534 nan
535 nan
536 nan
537 nan
538 nan
539 nan
540 nan
541 nan
542 nan
543 nan
544 nan
545 nan
546 nan
547 nan
548 nan
549 nan
550 nan
551 nan
552 nan
553 nan
554 nan
555 nan
556 nan
557 nan


Index:  520 ,  ca  duplicate value:  1.0 , original value:  nan
Index:  521 ,  ca  duplicate value:  1.0 , original value:  nan
Index:  522 ,  ca  duplicate value:  1.0 , original value:  nan
Index:  523 ,  ca  duplicate value:  2.0 , original value:  nan
Index:  524 ,  ca  duplicate value:  2.0 , original value:  nan
Index:  525 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  526 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  527 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  528 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  529 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  530 ,  ca  duplicate value:  3.0 , original value:  nan
Index:  531 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  532 ,  ca  duplicate value:  3.0 , original value:  nan
Index:  533 ,  ca  duplicate value:  2.0 , original value:  nan
Index:  534 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  535 ,  ca  duplicate value:  0.0

Index:  722 ,  ca  duplicate value:  1.0 , original value:  nan
Index:  723 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  724 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  725 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  726 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  728 ,  ca  duplicate value:  3.0 , original value:  nan
Index:  729 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  730 ,  ca  duplicate value:  1.0 , original value:  nan
Index:  731 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  732 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  733 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  734 ,  ca  duplicate value:  2.0 , original value:  nan
Index:  735 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  736 ,  ca  duplicate value:  0.0 , original value:  nan
Index:  737 ,  ca  duplicate value:  1.0 , original value:  nan
Index:  739 ,  ca  duplicate value:  1.0

397 nan
398 nan
399 nan
400 nan
401 nan
402 nan
403 nan
406 nan
407 nan
408 nan
409 nan
410 nan
411 nan
412 nan
413 nan
414 nan
415 nan
416 nan
417 nan
418 nan
419 nan
420 nan
421 nan
422 nan
423 nan
424 nan
425 nan
426 nan
427 nan
428 nan
429 nan
430 nan
431 nan
432 nan
433 nan
434 nan
435 nan
436 nan
437 nan
438 nan
439 nan
440 nan
441 nan
442 nan
443 nan
444 nan
445 nan
446 nan
447 nan
448 nan
449 nan
450 nan
451 nan
452 nan
453 nan
454 nan
455 nan
456 nan
457 nan
458 nan
459 nan
460 nan
461 nan
462 nan
463 nan
464 nan
465 nan
467 nan
468 nan
469 nan
470 nan
471 nan
472 nan
473 nan
474 nan
475 nan
476 nan
477 nan
478 nan
479 nan
480 nan
481 nan
482 nan
483 nan
484 nan
485 nan
486 nan
487 nan
488 nan
489 nan
490 nan
491 nan
492 nan
493 nan
500 nan
502 nan
503 nan
504 nan
505 nan
506 nan
508 nan
509 nan
510 nan
511 nan
512 nan
513 nan
514 nan
515 nan
516 nan
517 nan
518 nan
519 nan
520 nan
521 nan
522 nan
523 nan
524 nan
525 nan
526 nan
527 nan
528 nan
530 nan
531 nan
532 nan
533 nan


Index:  416 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  417 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  418 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  419 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  420 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  421 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  422 ,  thal  duplicate value:  6.0 , original value:  nan
Index:  423 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  424 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  425 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  426 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  427 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  428 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  429 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  430 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  43

Index:  719 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  720 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  721 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  722 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  725 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  728 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  729 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  730 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  731 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  732 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  734 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  735 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  737 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  738 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  739 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  74

Index:  624 ,  cyr  duplicate value:  85.0 , original value:  nan
Index:  625 ,  cyr  duplicate value:  85.0 , original value:  nan
Index:  645 ,  cyr  duplicate value:  87.0 , original value:  nan
cyr done!!!

Col Name:  lvx1 
583 nan
589 nan
590 nan
593 nan
598 nan
612 nan
616 nan
617 nan
618 nan
620 nan
623 nan
624 nan
625 nan
636 nan
640 nan
665 nan
669 nan
823 nan
875 nan
Index:  583 ,  lvx1  duplicate value:  1.0 , original value:  nan
Index:  589 ,  lvx1  duplicate value:  1.0 , original value:  nan
Index:  590 ,  lvx1  duplicate value:  1.0 , original value:  nan
Index:  593 ,  lvx1  duplicate value:  1.0 , original value:  nan
Index:  598 ,  lvx1  duplicate value:  1.0 , original value:  nan
Index:  612 ,  lvx1  duplicate value:  1.0 , original value:  nan
Index:  616 ,  lvx1  duplicate value:  1.0 , original value:  nan
Index:  617 ,  lvx1  duplicate value:  1.0 , original value:  nan
Index:  618 ,  lvx1  duplicate value:  1.0 , original value:  nan
Index:  620 ,  lvx1  dupli

In [59]:
print("No of rows with nan : ", sum([True for index,row in data_final_ratio.iterrows() if any(row.isnull())]))

No of rows with nan :  0


In [60]:
data_final_ratio.to_csv('datasets/alldata_large.csv', index=False)

## Replacing with Mean

In [61]:
for col in cols_with_nan_large:
    col_mean_value = data_final_mean[col].mean()
    print("Mean of", col, "attribute:", col_mean_value)
    data_final_mean[col].fillna(value = col_mean_value, inplace = True)
    comparison_array = np.where(data_final_mean[col] == data_final[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mean of trestbps attribute: 132.10119047619048
No of changes in col: trestbps :  59
Mean of htn attribute: 0.47630057803468207
No of changes in col: htn :  34
Mean of chol attribute: 198.7594936708861
No of changes in col: chol :  30
Mean of fbs attribute: 0.1668726823238566
No of changes in col: fbs :  90
Mean of restecg attribute: 0.6031215161649944
No of changes in col: restecg :  2
Mean of ekgmo attribute: 5.973995271867612
No of changes in col: ekgmo :  53
Mean of ekgday attribute: 15.493491124260355
No of changes in col: ekgday :  54
Mean of ekgyr attribute: 84.05673758865248
No of changes in col: ekgyr :  53
Mean of dig attribute: 0.03489771359807461
No of changes in col: dig :  68
Mean of prop attribute: 0.28331332533013204
No of changes in col: prop :  66
Mean of nitr attribute: 0.26618705035971224
No of changes in col: nitr :  65
Mean of pro attribute: 0.1722488038277512
No of changes in col: pro :  63
Mean of diuretic attribute: 0.11260709914320685
No of changes in col: diur

In [62]:
# data_final_mean.to_csv('datasets/cleveland_large.csv', index=False)

## Replacing with Mode

In [63]:
for col in cols_with_nan_large:
    col_mode_value = data_final_mode[col].mode()
    print("Mode of", col, "attribute:", col_mode_value)
    data_final_mode[col].fillna(value = col_mode_value, inplace = True)
    comparison_array = np.where(data_final_mode[col] == data_final[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mode of trestbps attribute: 0    120.0
dtype: float64
No of changes in col: trestbps :  59
Mode of htn attribute: 0    0.0
dtype: float64
No of changes in col: htn :  34
Mode of chol attribute: 0    0.0
dtype: float64
No of changes in col: chol :  30
Mode of fbs attribute: 0    0.0
dtype: float64
No of changes in col: fbs :  90
Mode of restecg attribute: 0    0.0
dtype: float64
No of changes in col: restecg :  2
Mode of ekgmo attribute: 0    3.0
dtype: float64
No of changes in col: ekgmo :  53
Mode of ekgday attribute: 0    16.0
dtype: float64
No of changes in col: ekgday :  54
Mode of ekgyr attribute: 0    85.0
dtype: float64
No of changes in col: ekgyr :  53
Mode of dig attribute: 0    0.0
dtype: float64
No of changes in col: dig :  68
Mode of prop attribute: 0    0.0
dtype: float64
No of changes in col: prop :  66
Mode of nitr attribute: 0    0.0
dtype: float64
No of changes in col: nitr :  65
Mode of pro attribute: 0    0.0
dtype: float64
No of changes in col: pro :  63
Mode of diu

In [64]:
# data_final_mode.to_csv('datasets/cleveland_large.csv', index=False)

## Replacing with Median

In [65]:
for col in cols_with_nan_large:
    col_median_value = data_final_median[col].median()
    print("Mode of", col, "attribute:", col_median_value)
    data_final_median[col].fillna(value = col_median_value, inplace = True)
    comparison_array = np.where(data_final_median[col] == data_final[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mode of trestbps attribute: 130.0
No of changes in col: trestbps :  59
Mode of htn attribute: 0.0
No of changes in col: htn :  34
Mode of chol attribute: 224.0
No of changes in col: chol :  30
Mode of fbs attribute: 0.0
No of changes in col: fbs :  90
Mode of restecg attribute: 0.0
No of changes in col: restecg :  2
Mode of ekgmo attribute: 6.0
No of changes in col: ekgmo :  53
Mode of ekgday attribute: 16.0
No of changes in col: ekgday :  54
Mode of ekgyr attribute: 84.0
No of changes in col: ekgyr :  53
Mode of dig attribute: 0.0
No of changes in col: dig :  68
Mode of prop attribute: 0.0
No of changes in col: prop :  66
Mode of nitr attribute: 0.0
No of changes in col: nitr :  65
Mode of pro attribute: 0.0
No of changes in col: pro :  63
Mode of diuretic attribute: 0.0
No of changes in col: diuretic :  82
Mode of proto attribute: 5.0
No of changes in col: proto :  112
Mode of thaldur attribute: 8.1
No of changes in col: thaldur :  56
Mode of met attribute: 7.0
No of changes in col: 

In [66]:
# data_final_median.to_csv('datasets/cleveland_large.csv', index=False)

# Separating into separate cleveland, switzerland, hungarian and long beach datasets

In [67]:
data_final_ratio.shape

(899, 41)

# Data row split

* Cleveland - 282
* Hungarian - 294
* Switzerland - 123
* Long Beach VA - 200

In [68]:
cleveland_data = data_final_ratio[0:282].copy()
hungarian_data = data_final_ratio[282:576].copy()
switzerland_data = data_final_ratio[576:699].copy()
longbeach_data = data_final_ratio[699:899].copy()

In [69]:
cleveland_data

Unnamed: 0,age,sex,cp,trestbps,htn,chol,fbs,restecg,ekgmo,ekgday,...,thal,cmo,cday,cyr,num,lvx1,lvx2,lvx3,lvx4,lvf
0,63,1,1,145.0,1.0,233.0,1.0,2.0,2.0,3.0,...,6.0,2.0,16.0,81.0,0,1.0,1.0,1.0,1.0,1.0
1,67,1,4,160.0,1.0,286.0,0.0,2.0,3.0,5.0,...,3.0,2.0,5.0,81.0,2,1.0,1.0,1.0,1.0,1.0
2,67,1,4,120.0,1.0,229.0,0.0,2.0,2.0,19.0,...,7.0,2.0,20.0,81.0,1,1.0,1.0,1.0,7.0,3.0
3,37,1,3,130.0,0.0,250.0,0.0,0.0,2.0,13.0,...,3.0,2.0,4.0,81.0,0,1.0,1.0,1.0,1.0,1.0
4,41,0,2,130.0,1.0,204.0,0.0,2.0,2.0,7.0,...,3.0,2.0,18.0,81.0,0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,39,0,3,138.0,1.0,220.0,0.0,0.0,5.0,8.0,...,3.0,5.0,9.0,84.0,0,1.0,1.0,1.0,1.0,2.0
278,57,1,2,154.0,0.0,232.0,0.0,2.0,6.0,4.0,...,3.0,6.0,5.0,84.0,1,1.0,1.0,1.0,1.0,1.0
279,58,0,4,130.0,1.0,197.0,0.0,0.0,3.0,28.0,...,3.0,3.0,29.0,84.0,0,1.0,1.0,1.0,1.0,1.0
280,57,1,4,110.0,1.0,335.0,0.0,0.0,5.0,3.0,...,7.0,5.0,4.0,84.0,2,1.0,1.0,1.0,1.0,1.0


In [70]:
hungarian_data

Unnamed: 0,age,sex,cp,trestbps,htn,chol,fbs,restecg,ekgmo,ekgday,...,thal,cmo,cday,cyr,num,lvx1,lvx2,lvx3,lvx4,lvf
282,40,1,2,140.0,0.0,289.0,0.0,0.0,12.0,16.0,...,7.0,12.0,20.0,84.0,0,1.0,1.0,1.0,1.0,1.0
283,49,0,3,160.0,1.0,180.0,0.0,0.0,11.0,16.0,...,7.0,11.0,20.0,84.0,1,1.0,1.0,1.0,1.0,1.0
284,37,1,2,130.0,0.0,283.0,0.0,1.0,11.0,21.0,...,6.0,11.0,26.0,84.0,0,1.0,1.0,1.0,1.0,1.0
285,48,0,4,138.0,0.0,214.0,0.0,0.0,9.0,21.0,...,7.0,9.0,30.0,84.0,3,1.0,1.0,1.0,1.0,1.0
286,54,1,3,150.0,0.0,270.0,0.0,0.0,7.0,25.0,...,7.0,7.0,30.0,84.0,0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
571,48,0,2,128.0,0.0,308.0,0.0,1.0,3.0,7.0,...,3.0,3.0,16.0,85.0,0,1.0,1.0,1.0,1.0,1.0
572,36,1,2,120.0,0.0,166.0,0.0,0.0,3.0,15.0,...,3.0,3.0,17.0,85.0,0,1.0,1.0,1.0,1.0,1.0
573,48,1,3,110.0,0.0,211.0,0.0,0.0,3.0,26.0,...,6.0,3.0,28.0,85.0,0,1.0,1.0,1.0,1.0,1.0
574,47,0,2,140.0,1.0,257.0,0.0,0.0,3.0,3.0,...,7.0,3.0,6.0,86.0,0,1.0,1.0,1.0,1.0,1.0


In [71]:
switzerland_data

Unnamed: 0,age,sex,cp,trestbps,htn,chol,fbs,restecg,ekgmo,ekgday,...,thal,cmo,cday,cyr,num,lvx1,lvx2,lvx3,lvx4,lvf
576,65,1,4,115.0,0.0,0.0,0.0,0.0,1.0,9.0,...,7.0,1.0,11.0,85.0,1,1.0,1.0,1.0,1.0,1.0
577,32,1,1,95.0,1.0,0.0,0.0,0.0,2.0,22.0,...,7.0,2.0,25.0,85.0,1,1.0,1.0,1.0,5.0,1.0
578,61,1,4,105.0,0.0,0.0,0.0,0.0,2.0,25.0,...,7.0,2.0,26.0,85.0,1,1.0,1.0,1.0,1.0,1.0
579,50,1,4,145.0,0.0,0.0,0.0,0.0,2.0,26.0,...,3.0,11.0,30.0,84.0,1,1.0,1.0,1.0,5.0,4.0
580,57,1,4,110.0,0.0,0.0,0.0,1.0,2.0,26.0,...,3.0,2.0,27.0,85.0,3,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
694,54,1,4,180.0,0.0,0.0,0.0,0.0,5.0,6.0,...,7.0,5.0,7.0,85.0,1,1.0,1.0,1.0,1.0,1.0
695,56,1,4,125.0,0.0,0.0,1.0,0.0,6.0,19.0,...,7.0,6.0,20.0,85.0,3,5.0,1.0,1.0,1.0,2.0
696,56,1,3,125.0,0.0,0.0,0.0,0.0,6.0,24.0,...,7.0,6.0,25.0,85.0,2,1.0,1.0,5.0,3.0,2.0
697,54,1,4,130.0,0.0,0.0,0.0,0.0,7.0,3.0,...,7.0,7.0,4.0,85.0,3,1.0,1.0,1.0,1.0,1.0


In [72]:
longbeach_data

Unnamed: 0,age,sex,cp,trestbps,htn,chol,fbs,restecg,ekgmo,ekgday,...,thal,cmo,cday,cyr,num,lvx1,lvx2,lvx3,lvx4,lvf
699,63,1,4,140.0,0.0,260.0,0.0,1.0,1.0,22.0,...,7.0,2.0,27.0,85.0,2,1.0,1.0,1.0,1.0,1.0
700,44,1,4,130.0,0.0,209.0,0.0,1.0,7.0,23.0,...,3.0,7.0,31.0,84.0,0,1.0,1.0,1.0,1.0,1.0
701,60,1,4,132.0,1.0,218.0,0.0,1.0,6.0,12.0,...,3.0,7.0,30.0,84.0,2,1.0,1.0,1.0,7.0,2.0
702,55,1,4,142.0,0.0,228.0,0.0,1.0,9.0,7.0,...,7.0,9.0,14.0,84.0,1,1.0,1.0,1.0,1.0,1.0
703,66,1,3,110.0,1.0,213.0,1.0,2.0,4.0,25.0,...,6.0,5.0,3.0,86.0,0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,54,0,4,127.0,0.0,333.0,1.0,1.0,6.0,6.0,...,3.0,6.0,29.0,83.0,1,1.0,1.0,1.0,1.0,1.0
895,62,1,1,120.0,0.0,139.0,0.0,1.0,10.0,5.0,...,3.0,5.0,26.0,86.0,0,1.0,1.0,1.0,1.0,2.0
896,55,1,4,122.0,1.0,223.0,1.0,1.0,5.0,2.0,...,6.0,4.0,17.0,86.0,2,1.0,1.0,1.0,1.0,1.0
897,58,1,4,110.0,0.0,385.0,1.0,2.0,11.0,4.0,...,3.0,2.0,16.0,83.0,0,1.0,1.0,1.0,1.0,1.0


In [73]:
cleveland_data.to_csv('datasets/cleveland_01.csv', index=False)
hungarian_data.to_csv('datasets/hungarian_01.csv', index=False)
switzerland_data.to_csv('datasets/switzerland_01.csv', index=False)
longbeach_data.to_csv('datasets/longbeach_01.csv', index=False)