# Data Cleaning and Processing

Here we take all the `.data` files in the UCI Heart Disease database and format them into a final CSV file that we can use for analysis.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Attributes Assigning

The following cell lists all 76 attributes and the 14 selected attributes.

In [2]:
# Names of all 76 fields of one record. The list is used positionally as the
# column names of the parsed records, so the order must not be changed.
headers = """
    id ccf age sex painloc painexer relrest pncaden cp trestbps htn chol smoke cigs years fbs dm famhist restecg
    ekgmo ekgday ekgyr dig prop nitr pro diuretic proto thaldur thaltime met thalach thalrest tpeakbps tpeakbpd dummy trestbpd exang
    xhypo oldpeak slope rldv5 rldv5e ca restckm exerckm restef restwm exeref exerwm thal thalsev thalpul earlobe cmo cday cyr
    num lmt ladprox laddist diag cxmain ramus om1 om2 rcaprox rcadist lvx1 lvx2 lvx3 lvx4 lvf cathef junk name
""".split()

# The 14 attributes kept for the reduced dataset.
selected_columns = (
    "age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal num"
).split()

# Dataset Format

The datafile contains contiguous elements separated by a newline or space. Each row of the required dataset is observed to span 10 lines in the datafile, in the following format:
1. The first line contains 7 elements
2. Lines 2-9 each contain 8 elements
3. Line 10 contains 5 elements
4. The last element belongs to the attribute `name` and always contains the string 'name'

In total, one row therefore has 7 + 8 × 8 + 5 = 76 elements.

This information is used for data cleaning

## Finding Errors in Data / Missing Data

### Cleveland Dataset

In [3]:
# Check the Cleveland file against the expected 10-lines-per-record layout:
# line 1 of a record has 7 tokens, lines 2-9 have 8, and line 10 has 5 and
# must end with the literal token 'name'.
cleveland_wrong_data_lines = list()  # lines with an unexpected token count
cleveland_wrong_name_lines = list()  # record-closing lines not ending in 'name'

i = 1  # 1-based number of the line currently being examined

# `with` guarantees the handle is closed even if the loop raises.
with open('raw_data/cleveland.data', 'r', errors='ignore') as cleveland_datafile:
    for line in cleveland_datafile:
        linecontent = line.split()
        position = i % 10  # 1..9 inside a record, 0 on its closing line
        expected_size = 7 if position == 1 else 5 if position == 0 else 8

        if len(linecontent) != expected_size:
            cleveland_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})

        # A blank closing line has no last token; report it as a wrong name
        # instead of failing with IndexError on linecontent[-1].
        if position == 0 and (not linecontent or linecontent[-1] != 'name'):
            cleveland_wrong_name_lines.append({'line_no': i, 'line_size': len(linecontent)})

        i += 1

# i is one past the last line read, i.e. (number of lines in the file) + 1.
print(i)

2983


In [4]:
# Every Cleveland line whose token count did not match the expected layout.
cleveland_wrong_data_lines

[{'line_no': 2822, 'line_size': 11},
 {'line_no': 2823, 'line_size': 4},
 {'line_no': 2824, 'line_size': 2},
 {'line_no': 2825, 'line_size': 3},
 {'line_no': 2826, 'line_size': 3},
 {'line_no': 2827, 'line_size': 15},
 {'line_no': 2828, 'line_size': 1},
 {'line_no': 2829, 'line_size': 3},
 {'line_no': 2830, 'line_size': 7},
 {'line_no': 2831, 'line_size': 11},
 {'line_no': 2832, 'line_size': 3},
 {'line_no': 2833, 'line_size': 5},
 {'line_no': 2834, 'line_size': 6},
 {'line_no': 2835, 'line_size': 5},
 {'line_no': 2836, 'line_size': 5},
 {'line_no': 2837, 'line_size': 10},
 {'line_no': 2838, 'line_size': 9},
 {'line_no': 2839, 'line_size': 7},
 {'line_no': 2840, 'line_size': 16},
 {'line_no': 2841, 'line_size': 3},
 {'line_no': 2842, 'line_size': 5},
 {'line_no': 2843, 'line_size': 13},
 {'line_no': 2844, 'line_size': 4},
 {'line_no': 2845, 'line_size': 2},
 {'line_no': 2846, 'line_size': 9},
 {'line_no': 2847, 'line_size': 10},
 {'line_no': 2848, 'line_size': 12},
 {'line_no': 2849, '

In [5]:
# Every Cleveland record-closing line (line number divisible by 10) whose last token is not 'name'.
cleveland_wrong_name_lines

[{'line_no': 2830, 'line_size': 7},
 {'line_no': 2840, 'line_size': 16},
 {'line_no': 2850, 'line_size': 3},
 {'line_no': 2860, 'line_size': 11},
 {'line_no': 2870, 'line_size': 8},
 {'line_no': 2880, 'line_size': 16},
 {'line_no': 2890, 'line_size': 5},
 {'line_no': 2900, 'line_size': 4},
 {'line_no': 2910, 'line_size': 3},
 {'line_no': 2930, 'line_size': 4},
 {'line_no': 2940, 'line_size': 1},
 {'line_no': 2950, 'line_size': 8},
 {'line_no': 2960, 'line_size': 4},
 {'line_no': 2970, 'line_size': 2},
 {'line_no': 2980, 'line_size': 7}]

### Inference

Here we observe that the lines after line 2820 (i.e. after the first 282 data points) contain many errors. This can also be confirmed by manually inspecting the data file. So we only consider the data file up to line 2820.

### Hungarian Dataset

In [6]:
# Check the Hungarian file against the expected 10-lines-per-record layout:
# line 1 of a record has 7 tokens, lines 2-9 have 8, and line 10 has 5 and
# must end with the literal token 'name'.
hungarian_wrong_data_lines = list()  # lines with an unexpected token count
hungarian_wrong_name_lines = list()  # record-closing lines not ending in 'name'

i = 1  # 1-based number of the line currently being examined

# `with` guarantees the handle is closed even if the loop raises.
with open('raw_data/hungarian.data', 'r', errors='ignore') as hungarian_datafile:
    for line in hungarian_datafile:
        linecontent = line.split()
        position = i % 10  # 1..9 inside a record, 0 on its closing line
        expected_size = 7 if position == 1 else 5 if position == 0 else 8

        if len(linecontent) != expected_size:
            hungarian_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})

        # A blank closing line has no last token; report it as a wrong name
        # instead of failing with IndexError on linecontent[-1].
        if position == 0 and (not linecontent or linecontent[-1] != 'name'):
            hungarian_wrong_name_lines.append({'line_no': i, 'line_size': len(linecontent)})

        i += 1

# i is one past the last line read, i.e. (number of lines in the file) + 1.
print(i)

2942


In [7]:
# Every Hungarian line whose token count did not match the expected layout.
hungarian_wrong_data_lines

[{'line_no': 2941, 'line_size': 0}]

In [8]:
# Every Hungarian record-closing line (line number divisible by 10) whose last token is not 'name'.
hungarian_wrong_name_lines

[]

### Inference

No errors found in the data itself: the only flagged line (2941) is an empty line at the end of the file. 2940 lines of data are present, which account for 294 rows/datapoints.

### Switzerland Dataset

In [9]:
# Check the Switzerland file against the expected 10-lines-per-record layout:
# line 1 of a record has 7 tokens, lines 2-9 have 8, and line 10 has 5 and
# must end with the literal token 'name'.
switzerland_wrong_data_lines = list()  # lines with an unexpected token count
switzerland_wrong_name_lines = list()  # record-closing lines not ending in 'name'

i = 1  # 1-based number of the line currently being examined

# `with` guarantees the handle is closed even if the loop raises.
with open('raw_data/switzerland.data', 'r', errors='ignore') as switzerland_datafile:
    for line in switzerland_datafile:
        linecontent = line.split()
        position = i % 10  # 1..9 inside a record, 0 on its closing line
        expected_size = 7 if position == 1 else 5 if position == 0 else 8

        if len(linecontent) != expected_size:
            switzerland_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})

        # A blank closing line has no last token; report it as a wrong name
        # instead of failing with IndexError on linecontent[-1].
        if position == 0 and (not linecontent or linecontent[-1] != 'name'):
            switzerland_wrong_name_lines.append({'line_no': i, 'line_size': len(linecontent)})

        i += 1

# i is one past the last line read, i.e. (number of lines in the file) + 1.
print(i)

1231


In [10]:
# Every Switzerland line whose token count did not match the expected layout.
switzerland_wrong_data_lines

[]

In [11]:
# Every Switzerland record-closing line (line number divisible by 10) whose last token is not 'name'.
switzerland_wrong_name_lines

[]

### Inference

No errors found. 1230 lines of data are present, which account for 123 rows/datapoints.

### Long Beach VA Dataset

In [12]:
# Check the Long Beach VA file against the expected 10-lines-per-record layout:
# line 1 of a record has 7 tokens, lines 2-9 have 8, and line 10 has 5 and
# must end with the literal token 'name'.
longbeach_wrong_data_lines = list()  # lines with an unexpected token count
longbeach_wrong_name_lines = list()  # record-closing lines not ending in 'name'

i = 1  # 1-based number of the line currently being examined

# `with` guarantees the handle is closed even if the loop raises.
with open('raw_data/long-beach-va.data', 'r', errors='ignore') as longbeach_datafile:
    for line in longbeach_datafile:
        linecontent = line.split()
        position = i % 10  # 1..9 inside a record, 0 on its closing line
        expected_size = 7 if position == 1 else 5 if position == 0 else 8

        if len(linecontent) != expected_size:
            longbeach_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})

        # A blank closing line has no last token; report it as a wrong name
        # instead of failing with IndexError on linecontent[-1].
        if position == 0 and (not linecontent or linecontent[-1] != 'name'):
            longbeach_wrong_name_lines.append({'line_no': i, 'line_size': len(linecontent)})

        i += 1

# i is one past the last line read, i.e. (number of lines in the file) + 1.
print(i)

2002


In [13]:
# Every Long Beach VA line whose token count did not match the expected layout.
longbeach_wrong_data_lines

[{'line_no': 2001, 'line_size': 0}]

In [14]:
# Every Long Beach VA record-closing line (line number divisible by 10) whose last token is not 'name'.
longbeach_wrong_name_lines

[]

### Inference

No errors found in the data itself: the only flagged line (2001) is an empty line at the end of the file. 2000 lines of data are present, which account for 200 rows/datapoints.

## Creating the Dataset

In [15]:
def _read_records(path, max_lines, lines_per_record=10, record_size=76):
    """Parse whitespace-separated records from one raw data file.

    Every `lines_per_record` consecutive lines are concatenated into one
    record. A record with exactly `record_size` tokens is kept; any other
    record is printed and skipped.

    Parameters
    ----------
    path : str
        Location of the raw data file.
    max_lines : int
        Number of leading lines to read; everything after it is ignored.
    lines_per_record : int
        Number of file lines that make up one record.
    record_size : int
        Number of tokens a well-formed record must contain.

    Returns
    -------
    list of list of str
        The well-formed records, in file order.
    """
    records = list()
    datarow = list()  # local buffer, so a partial record can never leak into another file
    # `with` guarantees the handle is closed even if parsing raises.
    with open(path, 'r', errors='ignore') as datafile:
        for line_no, line in enumerate(datafile, start=1):
            if line_no > max_lines:
                break
            datarow.extend(line.split())
            if line_no % lines_per_record == 0:
                if len(datarow) == record_size:
                    records.append(datarow)
                else:
                    print(datarow)
                datarow = list()
    return records


dataset = list()

# Cleveland is only read up to line 2820; the lines after it are malformed.
dataset.extend(_read_records('raw_data/cleveland.data', 2820))
print(len(dataset))

dataset.extend(_read_records('raw_data/hungarian.data', 2940))
print(len(dataset))

dataset.extend(_read_records('raw_data/switzerland.data', 1230))
dataset.extend(_read_records('raw_data/long-beach-va.data', 2000))

282
576


In [16]:
# Total number of well-formed (76-token) records collected from the four files.
len(dataset)

899

In [17]:
# Build a frame from the parsed records; `headers` supplies the column names positionally.
# Every value is still a string token at this point.
df = pd.DataFrame(dataset, columns=headers)
# index=False keeps the row index out of the CSV file.
df.to_csv('datasets/alldata.csv', index=False)

In [18]:
# Discard the string-valued frame; the data is reloaded from the CSV file afterwards.
del df

In [19]:
# Reload from CSV so that pandas infers the column dtypes
# (the frame written to this file was built from string tokens).
data = pd.read_csv('datasets/alldata.csv')

In [20]:
# Preview the first rows of the reloaded frame.
data.head()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name
0,1,0,63,1,-9,-9,-9,-9,1,145,...,1,1,1,1,1,1,1,-9.0,-9.0,name
1,2,0,67,1,-9,-9,-9,-9,4,160,...,1,1,1,1,1,1,1,-9.0,-9.0,name
2,3,0,67,1,-9,-9,-9,-9,4,120,...,2,2,1,1,1,7,3,-9.0,-9.0,name
3,4,0,37,1,-9,-9,-9,-9,3,130,...,1,1,1,1,1,1,1,-9.0,-9.0,name
4,6,0,41,0,-9,-9,-9,-9,2,130,...,1,1,1,1,1,1,1,-9.0,-9.0,name


In [21]:
# Summary statistics of the numeric columns.
# NOTE(review): -9 looks like a missing-value marker in the raw files; if so,
# the mean/std/quantiles below include it and are skewed — confirm.
data.describe()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
count,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,...,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0
mean,957.235818,0.0,53.480534,0.790879,-2.191324,-2.416018,-2.404894,-9.0,3.253615,122.840934,...,-5.338154,-1.476085,-1.883204,0.808676,0.820912,0.918799,1.387097,0.997775,3.669399,-7.031702
std,1204.015482,0.0,9.435894,0.406908,4.611051,4.472187,4.523955,0.0,0.928499,39.55893,...,4.848162,4.625416,4.675999,1.467955,1.501361,1.615995,2.288076,1.439189,25.498556,5.077432
min,1.0,0.0,28.0,0.0,-9.0,-9.0,-9.0,-9.0,1.0,-9.0,...,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0
25%,116.0,0.0,47.0,1.0,-9.0,-9.0,-9.0,-9.0,3.0,120.0,...,-9.0,-9.0,-9.0,1.0,1.0,1.0,1.0,1.0,-9.0,-9.0
50%,266.0,0.0,54.0,1.0,1.0,0.0,0.0,-9.0,4.0,130.0,...,-9.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-9.0,-9.0
75%,1207.5,0.0,60.0,1.0,1.0,1.0,1.0,-9.0,4.0,140.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.7,-9.0
max,5002.0,0.0,77.0,1.0,1.0,1.0,1.0,-9.0,4.0,200.0,...,2.0,2.0,2.0,7.0,10.0,8.0,8.0,5.0,86.0,11.3


# Removing ID and Name

Now, we have the complete dataset as a dataframe. 

The 'name' field and 'id' field provide no practical use and should be dropped

In [22]:
# Remove the two identifier-like columns; `data` itself is left unchanged.
data_01 = data.drop(columns=['id', 'name'])

In [23]:
# Summary statistics after removing 'id' and 'name'.
data_01.describe()

Unnamed: 0,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,htn,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
count,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,...,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0
mean,0.0,53.480534,0.790879,-2.191324,-2.416018,-2.404894,-9.0,3.253615,122.840934,0.117909,...,-5.338154,-1.476085,-1.883204,0.808676,0.820912,0.918799,1.387097,0.997775,3.669399,-7.031702
std,0.0,9.435894,0.406908,4.611051,4.472187,4.523955,0.0,0.928499,39.55893,1.873952,...,4.848162,4.625416,4.675999,1.467955,1.501361,1.615995,2.288076,1.439189,25.498556,5.077432
min,0.0,28.0,0.0,-9.0,-9.0,-9.0,-9.0,1.0,-9.0,-9.0,...,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0
25%,0.0,47.0,1.0,-9.0,-9.0,-9.0,-9.0,3.0,120.0,0.0,...,-9.0,-9.0,-9.0,1.0,1.0,1.0,1.0,1.0,-9.0,-9.0
50%,0.0,54.0,1.0,1.0,0.0,0.0,-9.0,4.0,130.0,0.0,...,-9.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-9.0,-9.0
75%,0.0,60.0,1.0,1.0,1.0,1.0,-9.0,4.0,140.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.7,-9.0
max,0.0,77.0,1.0,1.0,1.0,1.0,-9.0,4.0,200.0,1.0,...,2.0,2.0,2.0,7.0,10.0,8.0,8.0,5.0,86.0,11.3


In [24]:
# Preview the first rows after removing 'id' and 'name'.
data_01.head()

Unnamed: 0,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,htn,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
0,0,63,1,-9,-9,-9,-9,1,145,1,...,-9,1,1,1,1,1,1,1,-9.0,-9.0
1,0,67,1,-9,-9,-9,-9,4,160,1,...,-9,1,1,1,1,1,1,1,-9.0,-9.0
2,0,67,1,-9,-9,-9,-9,4,120,1,...,-9,2,2,1,1,1,7,3,-9.0,-9.0
3,0,37,1,-9,-9,-9,-9,3,130,0,...,-9,1,1,1,1,1,1,1,-9.0,-9.0
4,0,41,0,-9,-9,-9,-9,2,130,1,...,-9,1,1,1,1,1,1,1,-9.0,-9.0


In [25]:
# Persist the frame without 'id' and 'name'; index=False keeps the row index out of the file.
data_01.to_csv('datasets/alldata_no_id.csv', index=False)

The dataset containing no id or name is saved as 'alldata_no_id.csv'

# Removing attributes with constant value

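We can drop attributes which have the same value in all rows. The first cell below drops a hand-picked list of attributes; the following cell then looks for any remaining attribute whose value is the same throughout. Note that the hand-picked list also contains attributes that are not constant (for example `painloc`, whose standard deviation in the summary above is non-zero).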

In [26]:
# Hand-picked attributes removed before the constant-column check.
drop_cols_01 = [
    'ccf', 'painloc', 'painexer', 'relrest', 'pncaden', 'smoke',
    'proto', 'rldv5', 'restckm', 'exerckm', 'restef', 'restwm',
    'exeref', 'exerwm', 'thalsev', 'thalpul', 'earlobe', 'diag',
    'ramus', 'om2', 'lvx1', 'lvx2', 'cathef', 'junk',
]

# drop() returns a new frame; data_01 is left unchanged.
data_final_01 = data_01.drop(columns=drop_cols_01)

In [27]:
# Collect every column that holds one single value in all rows.
# Counting distinct values is used instead of `std() == 0` because the
# standard deviation of a constant float column is not guaranteed to be
# exactly 0.0, std() fails on non-numeric columns, and an all-NaN column
# yields NaN (which never compares equal to 0).
drop_cols = [
    col
    for col in data_final_01.columns
    if data_final_01[col].nunique(dropna=False) <= 1
]

print(drop_cols)

[]


In [28]:
# Number of constant columns found (0 means nothing further is dropped).
len(drop_cols)

0

In [29]:
# Remove the constant columns collected in drop_cols (a no-op when the list is empty).
data_final = data_final_01.drop(columns=drop_cols)

In [30]:
# Preview the first rows of the reduced frame.
data_final.head()

Unnamed: 0,age,sex,cp,trestbps,htn,chol,cigs,years,fbs,dm,...,lmt,ladprox,laddist,cxmain,om1,rcaprox,rcadist,lvx3,lvx4,lvf
0,63,1,1,145,1,233,50,20,1,-9,...,1,1,1,1,1,1,1,1,1,1
1,67,1,4,160,1,286,40,40,0,-9,...,1,2,2,2,1,1,1,1,1,1
2,67,1,4,120,1,229,20,35,0,-9,...,1,1,1,1,1,2,2,1,7,3
3,37,1,3,130,0,250,0,0,0,-9,...,1,1,1,1,1,1,1,1,1,1
4,41,0,2,130,1,204,0,0,0,-9,...,1,1,1,1,1,1,1,1,1,1


In [31]:
# Summary statistics of the reduced frame.
data_final.describe()

Unnamed: 0,age,sex,cp,trestbps,htn,chol,cigs,years,fbs,dm,...,lmt,ladprox,laddist,cxmain,om1,rcaprox,rcadist,lvx3,lvx4,lvf
count,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,...,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0
mean,53.480534,0.790879,3.253615,122.840934,0.117909,191.826474,5.982202,5.439377,-0.750834,-7.94772,...,-1.83426,-1.38376,-1.556174,-1.394883,-1.89099,-1.476085,-1.883204,0.918799,1.387097,0.997775
std,9.435894,0.406908,0.928499,39.55893,1.873952,116.116297,19.370716,18.219811,2.775607,3.063642,...,7.175986,4.564388,4.586209,4.543874,4.683444,4.625416,4.675999,1.615995,2.288076,1.439189
min,28.0,0.0,1.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,...,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0
25%,47.0,1.0,3.0,120.0,0.0,162.0,-9.0,-9.0,0.0,-9.0,...,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,1.0,1.0
50%,54.0,1.0,4.0,130.0,0.0,222.0,0.0,0.0,0.0,-9.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,60.0,1.0,4.0,140.0,1.0,267.0,20.0,20.0,0.0,-9.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,77.0,1.0,4.0,200.0,1.0,603.0,99.0,60.0,1.0,1.0,...,162.0,2.0,2.0,2.0,2.0,2.0,2.0,8.0,8.0,5.0


In [32]:
# Persist the reduced frame; index=False keeps the row index out of the file.
data_final.to_csv('datasets/alldata_final.csv', index=False)

# Creating a dataset of only selected 14 attributes

Usually only 14 attributes are used when analysing this particular dataset, so a dataset restricted to those 14 attributes is also created for general testing.

In [33]:
# Keep all rows, but only the columns named in selected_columns (in that order).
data_selected = data_final.loc[:, selected_columns]

In [34]:
# Preview the first rows of the 14-attribute frame.
data_selected.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0


In [35]:
# Summary statistics of the 14-attribute frame.
# NOTE(review): several minima are -9, which looks like a missing-value marker
# rather than a measurement — confirm before using these statistics.
data_selected.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0
mean,53.480534,0.790879,3.253615,122.840934,191.826474,-0.750834,0.581758,128.348165,-0.183537,0.189766,-1.922136,-5.860957,-2.419355,1.129032
std,9.435894,0.406908,0.928499,39.55893,116.116297,2.775607,0.921617,43.169759,2.30104,2.71102,5.137156,4.579126,7.126357,1.25972
min,28.0,0.0,1.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,0.0
25%,47.0,1.0,3.0,120.0,162.0,0.0,0.0,115.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
50%,54.0,1.0,4.0,130.0,222.0,0.0,0.0,137.0,0.0,0.2,1.0,-9.0,-9.0,1.0
75%,60.0,1.0,4.0,140.0,267.0,0.0,1.0,155.0,1.0,1.5,2.0,0.0,5.0,2.0
max,77.0,1.0,4.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,3.0,9.0,7.0,4.0


In [36]:
# Persist the 14-attribute frame; index=False keeps the row index out of the file.
data_selected.to_csv('datasets/alldata_short.csv', index=False)