# Data Cleaning and Pre Processing

Here we take in the cleveland.data file from UCI Heart Disease database and format it to final CSV file which we can use for analysis.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Attributes Assigning

The following cell shows all attributes and selected 14 attributes

In [2]:
# Names of all attributes in UCI Heart Disease Dataset
headers = [
    "id", "ccf", "age", "sex",  "painloc", "painexer", "relrest", "pncaden", "cp", "trestbps",  "htn", "chol", "smoke", "cigs", "years", "fbs",  "dm", "famhist", "restecg",
    "ekgmo", "ekgday", "ekgyr", "dig",  "prop", "nitr", "pro", "diuretic", "proto", "thaldur",  "thaltime", "met", "thalach", "thalrest", "tpeakbps", "tpeakbpd",  "dummy", "trestbpd", "exang",
    "xhypo", "oldpeak", "slope", "rldv5",  "rldv5e", "ca", "restckm", "exerckm", "restef", "restwm",  "exeref", "exerwm", "thal", "thalsev", "thalpul", "earlobe",  "cmo", "cday", "cyr",
    "num", "lmt", "ladprox", "laddist",  "diag", "cxmain", "ramus", "om1", "om2", "rcaprox",  "rcadist", "lvx1", "lvx2", "lvx3", "lvx4", "lvf",  "cathef", "junk", "name"
]

# Names of all attributes in short version of UCI dataset
selected_columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"
]

## Dataset Format

The datafile contains contiguous elements seperated by a newline or space. Each row of the required dataset is observed to be 10 lines in the datafile in the following format:
1. The first line contains 7 elements
2. Lines 2-9 contains 8 elements
3. Line 10 contains 5 elements.
4. The last element belongs to attribute Name and contains string 'name' always

This information is used for data cleaning

# Switzerland Datafile

## Finding Errors in Data / Missing Data

In [3]:
# Opens switzerland.data, iterates lines, takes in data and stores as a list of rows
switzerland_datafile = open('raw_data/switzerland.data', 'r', errors='ignore')

switzerland_wrong_data_lines = list() # Stores datalines with incorrect size of elements
switzerland_wrong_name_lines = list() # Stores datalines with wrong name attribute

i = 1

for line in switzerland_datafile:
    linecontent = line.split()
    if i % 10 == 1:
        if len(linecontent) != 7:
            switzerland_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
    elif i%10 in range(2, 10):
        if len(linecontent) != 8:
            switzerland_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
    elif i%10 == 0:
        if len(linecontent) != 5:
            switzerland_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
        if linecontent[-1] != 'name':
            switzerland_wrong_name_lines.append({'line_no': i, 'line_size': len(linecontent)})
    i += 1

print("Current i value (no of lines): ", i)

switzerland_datafile.close()

print("Lines with wrong number of elements are: ")
print(switzerland_wrong_name_lines)

print("Lines with wrong name attribute: ")
print(switzerland_wrong_data_lines)

Current i value (no of lines):  1231
Lines with wrong number of elements are: 
[]
Lines with wrong name attribute: 
[]


## Inference

Here, we see that 1230 lines of data exist amounting to around 123 data points.

## Creating the Dataset

In [4]:
switzerland_datafile = open('raw_data/switzerland.data', 'r', errors='ignore')

switzerland_dataset = list()
switzerland_datarow = list()

i = 0

for line in switzerland_datafile:
    i += 1
    if i > 1230:
        break
    linecontent = line.split()
    switzerland_datarow.extend(linecontent)
    if i % 10 == 0:
        row_clone = switzerland_datarow.copy()
        if len(row_clone)== 76:
            switzerland_dataset.append(row_clone)
        else:
            print(row_clone)
        switzerland_datarow.clear()
        
switzerland_datafile.close()

print("No of datapoints: ", len(switzerland_dataset))

df = pd.DataFrame(switzerland_dataset, columns=headers)
# The first version of the dataset is a direct raw conversion of the .data file to .csv file
df.to_csv('datasets/switzerland_raw.csv', index=False) 

del df

No of datapoints:  123


In [5]:
switzerland_data = pd.read_csv('datasets/switzerland_raw.csv')
switzerland_data.head()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name
0,3001,0,65,1,1,1,1,-9,4,115,...,1,1,1,1,1,1,1,75.0,-9.0,name
1,3002,0,32,1,0,0,0,-9,1,95,...,1,1,1,1,1,5,1,63.0,-9.0,name
2,3003,0,61,1,1,1,1,-9,4,105,...,2,1,1,1,1,1,1,67.0,-9.0,name
3,3004,0,50,1,1,1,1,-9,4,145,...,1,1,1,1,1,5,4,36.0,-9.0,name
4,3005,0,57,1,1,1,1,-9,4,110,...,2,1,1,1,1,1,1,60.0,-9.0,name


In [6]:
switzerland_data.describe()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
count,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,...,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0
mean,3625.886179,0.0,55.317073,0.918699,0.910569,0.821138,0.780488,-9.0,3.699187,127.943089,...,1.056911,1.422764,1.235772,-0.235772,-0.276423,-0.170732,0.065041,0.382114,53.845528,-9.0
std,499.271374,0.0,9.032108,0.274414,0.286532,0.384804,0.415609,0.0,0.688726,28.514679,...,0.232619,0.496019,0.426217,3.59891,3.576565,3.676927,3.916631,3.310491,26.631989,0.0
min,3001.0,0.0,32.0,0.0,0.0,0.0,0.0,-9.0,1.0,-9.0,...,1.0,1.0,1.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0
25%,3031.5,0.0,51.0,1.0,1.0,1.0,1.0,-9.0,4.0,115.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,52.5,-9.0
50%,4012.0,0.0,56.0,1.0,1.0,1.0,1.0,-9.0,4.0,125.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,63.0,-9.0
75%,4042.5,0.0,61.5,1.0,1.0,1.0,1.0,-9.0,4.0,145.0,...,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,69.0,-9.0
max,4074.0,0.0,74.0,1.0,1.0,1.0,1.0,-9.0,4.0,200.0,...,2.0,2.0,2.0,7.0,7.0,7.0,8.0,4.0,86.0,-9.0


In [7]:
switzerland_data = switzerland_data.replace(-9, np.nan)

In [8]:
# Removing ID and Name columns as they are useless

switzerland_data = switzerland_data.drop(['id', 'name'], axis = 1)

In [9]:
# Removing columns with constant data

drop_cols = list()

for col in switzerland_data.columns.values:
    if switzerland_data[col].std() == 0:
        drop_cols.append(col)
        
print("Columns to be dropped: ", drop_cols)
print("No of rows to be dropped: ", len(drop_cols))

Columns to be dropped:  ['ccf', 'chol', 'dm', 'famhist', 'proto']
No of rows to be dropped:  5


In [10]:
# Removing columns with all nan values

for col in switzerland_data.columns.values:
    if np.isnan(switzerland_data[col].std()):
        drop_cols.append(col)
        
print(drop_cols)
print("No of rows to be dropped: ", len(drop_cols))

['ccf', 'chol', 'dm', 'famhist', 'proto', 'pncaden', 'restckm', 'exerckm', 'restef', 'restwm', 'exeref', 'exerwm', 'earlobe', 'junk']
No of rows to be dropped:  14


In [11]:
# Dropping all unnecessary columns

switzerland_data = switzerland_data.drop(drop_cols, axis=1)

In [12]:
# Identifying columns with NaN values

cols_with_nan = switzerland_data.columns.values[switzerland_data.isna().any()]
display(cols_with_nan)

array(['trestbps', 'htn', 'smoke', 'cigs', 'years', 'fbs', 'restecg',
       'ekgmo', 'ekgday', 'ekgyr', 'dig', 'prop', 'nitr', 'pro',
       'diuretic', 'thaldur', 'thaltime', 'met', 'thalach', 'thalrest',
       'tpeakbps', 'tpeakbpd', 'dummy', 'trestbpd', 'exang', 'xhypo',
       'oldpeak', 'slope', 'rldv5', 'rldv5e', 'ca', 'thal', 'thalsev',
       'thalpul', 'cmo', 'cday', 'cyr', 'lvx1', 'lvx2', 'lvx3', 'lvx4',
       'lvf', 'cathef'], dtype=object)

In [13]:
nan_count = dict()

for col in cols_with_nan:
    nan_count[col] = switzerland_data[col].isna().sum()
    print('Column Name: ', col)
    print('NaN Count: ', nan_count[col])
    print(switzerland_data[col].value_counts())
    print("\n")

Column Name:  trestbps
NaN Count:  2
115.0    14
120.0    13
160.0    11
110.0    10
140.0    10
130.0    10
125.0     8
150.0     7
145.0     6
95.0      6
135.0     6
105.0     5
155.0     4
100.0     4
200.0     2
80.0      1
185.0     1
170.0     1
165.0     1
180.0     1
Name: trestbps, dtype: int64


Column Name:  htn
NaN Count:  30
0.0    60
1.0    33
Name: htn, dtype: int64


Column Name:  smoke
NaN Count:  100
1.0    18
0.0     5
Name: smoke, dtype: int64


Column Name:  cigs
NaN Count:  112
20.0    8
40.0    3
Name: cigs, dtype: int64


Column Name:  years
NaN Count:  121
50.0    1
40.0    1
Name: years, dtype: int64


Column Name:  fbs
NaN Count:  75
0.0    43
1.0     5
Name: fbs, dtype: int64


Column Name:  restecg
NaN Count:  1
0.0    85
1.0    30
2.0     7
Name: restecg, dtype: int64


Column Name:  ekgmo
NaN Count:  1
1.0     22
3.0     21
2.0     18
4.0     15
5.0      8
8.0      7
9.0      7
7.0      7
6.0      6
12.0     6
10.0     3
11.0     2
Name: ekgmo, dtype: in

In [14]:
print("No of rows with nan : ", sum([True for index,row in switzerland_data.iterrows() if any(row.isnull())]))

No of rows with nan :  123


## Filling NaN with Other Values

Here we fill NaN with other values in the column using probability of finding the value in the column as the criteria

In [15]:
cols_to_be_removed = list()

for col in cols_with_nan:
    if switzerland_data[col].isna().sum() > 0.25*switzerland_data.shape[0] and col not in selected_columns:
        cols_to_be_removed.append(col)
        
print(cols_to_be_removed)

['smoke', 'cigs', 'years', 'thaltime', 'met', 'rldv5', 'rldv5e', 'thalsev', 'thalpul']


In [16]:
# Dropping these columns

switzerland_data = switzerland_data.drop(cols_to_be_removed, axis = 1)

In [17]:
switzerland_ratio = switzerland_data.copy()
switzerland_mean = switzerland_data.copy()
switzerland_mode = switzerland_data.copy()
switzerland_median = switzerland_data.copy()

In [18]:
print("No of rows with nan : ", sum([True for index,row in switzerland_data.iterrows() if any(row.isnull())]))

No of rows with nan :  123


## Imputing with proportion of each value in dataset

In [19]:
cols_with_nan = switzerland_ratio.columns.values[switzerland_ratio.isna().any()]

for col in cols_with_nan:
    print("Col Name: ", col, "")
    
    col_duplicate = switzerland_ratio[col].copy()
    
    for index, val in col_duplicate.iteritems():
        if np.isnan(val):
            print(index, val)
        
    col_probabilities = []
        
    for index, val in col_duplicate.value_counts().iteritems():
        col_probabilities.append(round(val/col_duplicate.value_counts().sum(), 2))
    
    for i in range(len(col_probabilities)):
        if round(sum(col_probabilities), 2) >= 1:
            break
        col_probabilities[i] += 0.01
    
    for i in range(len(col_probabilities)):
        if round(sum(col_probabilities), 2) <= 1:
            break
        col_probabilities[i] -= 0.01
    
    col_values = col_duplicate.value_counts().index.values.tolist()
        
    col_duplicate = col_duplicate.fillna(pd.Series(np.random.choice(col_values, 
                                                                      p=col_probabilities, size=len(col_duplicate))))
        
    for index, val in switzerland_ratio[col].iteritems():
        if col_duplicate[index] != val:
            print("Index: ", index, ", ", col, " duplicate value: ", col_duplicate[index], ", original value: ", val)
            
    switzerland_ratio[col] = col_duplicate.copy()
    
    print(col, "done!!!\n")

Col Name:  trestbps 
55 nan
112 nan
Index:  55 ,  trestbps  duplicate value:  115.0 , original value:  nan
Index:  112 ,  trestbps  duplicate value:  145.0 , original value:  nan
trestbps done!!!

Col Name:  htn 
5 nan
9 nan
11 nan
12 nan
18 nan
19 nan
21 nan
23 nan
25 nan
28 nan
29 nan
31 nan
32 nan
33 nan
34 nan
35 nan
36 nan
38 nan
39 nan
40 nan
41 nan
42 nan
43 nan
46 nan
47 nan
48 nan
49 nan
57 nan
58 nan
101 nan
Index:  5 ,  htn  duplicate value:  0.0 , original value:  nan
Index:  9 ,  htn  duplicate value:  1.0 , original value:  nan
Index:  11 ,  htn  duplicate value:  0.0 , original value:  nan
Index:  12 ,  htn  duplicate value:  0.0 , original value:  nan
Index:  18 ,  htn  duplicate value:  0.0 , original value:  nan
Index:  19 ,  htn  duplicate value:  0.0 , original value:  nan
Index:  21 ,  htn  duplicate value:  0.0 , original value:  nan
Index:  23 ,  htn  duplicate value:  1.0 , original value:  nan
Index:  25 ,  htn  duplicate value:  0.0 , original value:  nan
Inde

55 nan
Index:  55 ,  pro  duplicate value:  0.0 , original value:  nan
pro done!!!

Col Name:  diuretic 
1 nan
3 nan
4 nan
41 nan
43 nan
55 nan
Index:  1 ,  diuretic  duplicate value:  0.0 , original value:  nan
Index:  3 ,  diuretic  duplicate value:  1.0 , original value:  nan
Index:  4 ,  diuretic  duplicate value:  0.0 , original value:  nan
Index:  41 ,  diuretic  duplicate value:  1.0 , original value:  nan
Index:  43 ,  diuretic  duplicate value:  1.0 , original value:  nan
Index:  55 ,  diuretic  duplicate value:  0.0 , original value:  nan
diuretic done!!!

Col Name:  thaldur 
55 nan
Index:  55 ,  thaldur  duplicate value:  6.0 , original value:  nan
thaldur done!!!

Col Name:  thalach 
55 nan
Index:  55 ,  thalach  duplicate value:  97.0 , original value:  nan
thalach done!!!

Col Name:  thalrest 
55 nan
Index:  55 ,  thalrest  duplicate value:  86.0 , original value:  nan
thalrest done!!!

Col Name:  tpeakbps 
13 nan
55 nan
112 nan
Index:  13 ,  tpeakbps  duplicate value:  1

49 nan
93 nan
Index:  1 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  2 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  3 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  4 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  5 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  6 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  7 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  8 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  9 ,  thal  duplicate value:  6.0 , original value:  nan
Index:  10 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  11 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  12 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  13 ,  thal  duplicate value:  6.0 , original value:  nan
Index:  14 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  15 ,  thal  duplicate value:  6.0 , original value:  nan
Index:  16 ,  thal  

Index:  7 ,  lvx4  duplicate value:  1.0 , original value:  nan
Index:  13 ,  lvx4  duplicate value:  1.0 , original value:  nan
Index:  14 ,  lvx4  duplicate value:  1.0 , original value:  nan
Index:  17 ,  lvx4  duplicate value:  1.0 , original value:  nan
Index:  22 ,  lvx4  duplicate value:  1.0 , original value:  nan
Index:  36 ,  lvx4  duplicate value:  1.0 , original value:  nan
Index:  40 ,  lvx4  duplicate value:  1.0 , original value:  nan
Index:  41 ,  lvx4  duplicate value:  4.0 , original value:  nan
Index:  42 ,  lvx4  duplicate value:  4.0 , original value:  nan
Index:  44 ,  lvx4  duplicate value:  1.0 , original value:  nan
Index:  47 ,  lvx4  duplicate value:  3.0 , original value:  nan
Index:  48 ,  lvx4  duplicate value:  1.0 , original value:  nan
Index:  49 ,  lvx4  duplicate value:  1.0 , original value:  nan
Index:  60 ,  lvx4  duplicate value:  8.0 , original value:  nan
Index:  64 ,  lvx4  duplicate value:  3.0 , original value:  nan
Index:  89 ,  lvx4  duplic

In [20]:
print("No of rows with nan : ", sum([True for index,row in switzerland_ratio.iterrows() if any(row.isnull())]))

No of rows with nan :  0


In [21]:
switzerland_ratio.to_csv('datasets/switzerland_large.csv', index=False)

## Replacing with Mean

In [22]:
for col in cols_with_nan:
    col_mean_value = switzerland_mean[col].mean()
    print("Mean of", col, "attribute:", col_mean_value)
    switzerland_mean[col].fillna(value = col_mean_value, inplace = True)
    comparison_array = np.where(switzerland_mean[col] == switzerland_data[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mean of trestbps attribute: 130.20661157024793
No of changes in col: trestbps :  2
Mean of htn attribute: 0.3548387096774194
No of changes in col: htn :  30
Mean of fbs attribute: 0.10416666666666667
No of changes in col: fbs :  75
Mean of restecg attribute: 0.36065573770491804
No of changes in col: restecg :  1
Mean of ekgmo attribute: 4.5
No of changes in col: ekgmo :  1
Mean of ekgday attribute: 15.721311475409836
No of changes in col: ekgday :  1
Mean of ekgyr attribute: 84.6311475409836
No of changes in col: ekgyr :  1
Mean of dig attribute: 0.01694915254237288
No of changes in col: dig :  5
Mean of prop attribute: 0.4214876033057851
No of changes in col: prop :  2
Mean of nitr attribute: 0.375
No of changes in col: nitr :  3
Mean of pro attribute: 0.45901639344262296
No of changes in col: pro :  1
Mean of diuretic attribute: 0.20512820512820512
No of changes in col: diuretic :  6
Mean of thaldur attribute: 7.427049180327868
No of changes in col: thaldur :  1
Mean of thalach attri

In [23]:
# switzerland_mean.to_csv('datasets/switzerland_large.csv', index=False)

## Replacing with Mode

In [24]:
for col in cols_with_nan:
    col_mode_value = switzerland_mode[col].mode()
    print("Mode of", col, "attribute:", col_mode_value)
    switzerland_mode[col].fillna(value = col_mode_value, inplace = True)
    comparison_array = np.where(switzerland_mode[col] == switzerland_data[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mode of trestbps attribute: 0    115.0
dtype: float64
No of changes in col: trestbps :  2
Mode of htn attribute: 0    0.0
dtype: float64
No of changes in col: htn :  30
Mode of fbs attribute: 0    0.0
dtype: float64
No of changes in col: fbs :  75
Mode of restecg attribute: 0    0.0
dtype: float64
No of changes in col: restecg :  1
Mode of ekgmo attribute: 0    1.0
dtype: float64
No of changes in col: ekgmo :  1
Mode of ekgday attribute: 0    6.0
dtype: float64
No of changes in col: ekgday :  1
Mode of ekgyr attribute: 0    85.0
dtype: float64
No of changes in col: ekgyr :  1
Mode of dig attribute: 0    0.0
dtype: float64
No of changes in col: dig :  5
Mode of prop attribute: 0    0.0
dtype: float64
No of changes in col: prop :  2
Mode of nitr attribute: 0    0.0
dtype: float64
No of changes in col: nitr :  3
Mode of pro attribute: 0    0.0
dtype: float64
No of changes in col: pro :  1
Mode of diuretic attribute: 0    0.0
dtype: float64
No of changes in col: diuretic :  6
Mode of thald

In [25]:
# switzerland_mode.to_csv('datasets/switzerland_large.csv', index=False)

## Replacing with Median

In [26]:
for col in cols_with_nan:
    col_median_value = switzerland_median[col].median()
    print("Mode of", col, "attribute:", col_median_value)
    switzerland_median[col].fillna(value = col_median_value, inplace = True)
    comparison_array = np.where(switzerland_median[col] == switzerland_data[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mode of trestbps attribute: 125.0
No of changes in col: trestbps :  2
Mode of htn attribute: 0.0
No of changes in col: htn :  30
Mode of fbs attribute: 0.0
No of changes in col: fbs :  75
Mode of restecg attribute: 0.0
No of changes in col: restecg :  1
Mode of ekgmo attribute: 3.5
No of changes in col: ekgmo :  1
Mode of ekgday attribute: 17.0
No of changes in col: ekgday :  1
Mode of ekgyr attribute: 85.0
No of changes in col: ekgyr :  1
Mode of dig attribute: 0.0
No of changes in col: dig :  5
Mode of prop attribute: 0.0
No of changes in col: prop :  2
Mode of nitr attribute: 0.0
No of changes in col: nitr :  3
Mode of pro attribute: 0.0
No of changes in col: pro :  1
Mode of diuretic attribute: 0.0
No of changes in col: diuretic :  6
Mode of thaldur attribute: 7.0
No of changes in col: thaldur :  1
Mode of thalach attribute: 121.0
No of changes in col: thalach :  1
Mode of thalrest attribute: 70.0
No of changes in col: thalrest :  1
Mode of tpeakbps attribute: 162.5
No of changes i

In [27]:
# switzerland_median.to_csv('datasets/switzerland_large.csv', index=False)

# Hungarian Datafile

## Finding Errors in Data / Missing Data

In [28]:
# Opens hungarian.data, iterates lines, takes in data and stores as a list of rows
hungarian_datafile = open('raw_data/hungarian.data', 'r', errors='ignore')

hungarian_wrong_data_lines = list() # Stores datalines with incorrect size of elements
hungarian_wrong_name_lines = list() # Stores datalines with wrong name attribute

i = 1

for line in hungarian_datafile:
    linecontent = line.split()
    if i % 10 == 1:
        if len(linecontent) != 7:
            hungarian_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
    elif i%10 in range(2, 10):
        if len(linecontent) != 8:
            hungarian_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
    elif i%10 == 0:
        if len(linecontent) != 5:
            hungarian_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
        if linecontent[-1] != 'name':
            hungarian_wrong_name_lines.append({'line_no': i, 'line_size': len(linecontent)})
    i += 1

print("Current i value (no of lines): ", i)

hungarian_datafile.close()

print("Lines with wrong number of elements are: ")
print(hungarian_wrong_data_lines)

print("Lines with wrong name attribute: ")
print(hungarian_wrong_data_lines)

Current i value (no of lines):  2942
Lines with wrong number of elements are: 
[{'line_no': 2941, 'line_size': 0}]
Lines with wrong name attribute: 
[{'line_no': 2941, 'line_size': 0}]


## Inference

Here, we see that 1230 lines of data exist amounting to around 123 data points.

## Creating the Dataset

In [29]:
hungarian_datafile = open('raw_data/hungarian.data', 'r', errors='ignore')

hungarian_dataset = list()
hungarian_datarow = list()

i = 0

for line in hungarian_datafile:
    i += 1
    if i > 1230:
        break
    linecontent = line.split()
    hungarian_datarow.extend(linecontent)
    if i % 10 == 0:
        row_clone = hungarian_datarow.copy()
        if len(row_clone)== 76:
            hungarian_dataset.append(row_clone)
        else:
            print(row_clone)
        hungarian_datarow.clear()
        
hungarian_datafile.close()

print("No of datapoints: ", len(hungarian_dataset))

df = pd.DataFrame(hungarian_dataset, columns=headers)
# The first version of the dataset is a direct raw conversion of the .data file to .csv file
df.to_csv('datasets/hungarian_raw.csv', index=False) 

del df

No of datapoints:  123


In [30]:
hungarian_data = pd.read_csv('datasets/hungarian_raw.csv')
hungarian_data.head()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name
0,1254,0,40,1,1,0,0,-9,2,140,...,-9,-9,1,1,1,1,1,-9.0,-9.0,name
1,1255,0,49,0,1,0,0,-9,3,160,...,-9,-9,1,1,1,1,1,-9.0,-9.0,name
2,1256,0,37,1,1,0,0,-9,2,130,...,-9,-9,1,1,1,1,1,-9.0,-9.0,name
3,1257,0,48,0,1,1,1,-9,4,138,...,2,-9,1,1,1,1,1,-9.0,-9.0,name
4,1258,0,54,1,1,0,1,-9,3,150,...,1,-9,1,1,1,1,1,-9.0,-9.0,name


In [31]:
hungarian_data.describe()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
count,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,...,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0
mean,1246.317073,0.0,47.105691,0.674797,0.95935,0.455285,0.544715,-9.0,2.99187,131.04878,...,-8.731707,-7.333333,-8.113821,1.0,1.01626,1.056911,1.447154,1.113821,-6.162602,-9.0
std,37.834479,0.0,8.176148,0.470367,0.198287,0.500033,0.500033,0.0,0.962374,15.865652,...,1.703772,3.91857,2.992349,0.0,0.180334,0.449058,1.537527,0.388424,13.090088,0.0
min,1183.0,0.0,31.0,0.0,0.0,0.0,0.0,-9.0,1.0,100.0,...,-9.0,-9.0,-9.0,1.0,1.0,1.0,1.0,1.0,-9.0,-9.0
25%,1213.5,0.0,40.5,0.0,1.0,0.0,0.0,-9.0,2.0,120.0,...,-9.0,-9.0,-9.0,1.0,1.0,1.0,1.0,1.0,-9.0,-9.0
50%,1244.0,0.0,48.0,1.0,1.0,0.0,1.0,-9.0,3.0,130.0,...,-9.0,-9.0,-9.0,1.0,1.0,1.0,1.0,1.0,-9.0,-9.0
75%,1279.5,0.0,53.0,1.0,1.0,1.0,1.0,-9.0,4.0,140.0,...,-9.0,-9.0,-9.0,1.0,1.0,1.0,1.0,1.0,-9.0,-9.0
max,1310.0,0.0,66.0,1.0,1.0,1.0,1.0,-9.0,4.0,190.0,...,2.0,2.0,2.0,1.0,3.0,5.0,8.0,4.0,67.0,-9.0


In [32]:
hungarian_data = hungarian_data.replace(-9, np.nan)

In [33]:
# Removing ID and Name columns as they are useless

hungarian_data = hungarian_data.drop(['id', 'name'], axis = 1)

In [34]:
# Removing columns with constant data

drop_cols = list()

for col in hungarian_data.columns.values:
    if hungarian_data[col].std() == 0:
        drop_cols.append(col)
        
print("Columns to be dropped: ", drop_cols)
print("No of rows to be dropped: ", len(drop_cols))

Columns to be dropped:  ['ccf', 'dm', 'dig', 'diuretic', 'om2', 'lvx1']
No of rows to be dropped:  6


In [35]:
# Removing columns with all nan values

for col in hungarian_data.columns.values:
    if np.isnan(hungarian_data[col].std()):
        drop_cols.append(col)
        
print(drop_cols)
print("No of rows to be dropped: ", len(drop_cols))

['ccf', 'dm', 'dig', 'diuretic', 'om2', 'lvx1', 'pncaden', 'cigs', 'years', 'famhist', 'ca', 'restckm', 'exerckm', 'restef', 'restwm', 'exeref', 'exerwm', 'earlobe', 'junk']
No of rows to be dropped:  19


In [36]:
# Dropping all unnecessary columns

hungarian_data = hungarian_data.drop(drop_cols, axis=1)

In [37]:
# Identifying columns with NaN values

cols_with_nan = hungarian_data.columns.values[hungarian_data.isna().any()]
display(cols_with_nan)

array(['chol', 'smoke', 'fbs', 'prop', 'nitr', 'pro', 'proto', 'thaldur',
       'thaltime', 'slope', 'thal', 'thalsev', 'thalpul', 'lmt',
       'ladprox', 'laddist', 'diag', 'cxmain', 'ramus', 'om1', 'rcaprox',
       'rcadist', 'cathef'], dtype=object)

In [38]:
nan_count = dict()

for col in cols_with_nan:
    nan_count[col] = hungarian_data[col].isna().sum()
    print('Column Name: ', col)
    print('NaN Count: ', nan_count[col])
    print(hungarian_data[col].value_counts())
    print("\n")

Column Name:  chol
NaN Count:  6
224.0    3
207.0    3
260.0    3
223.0    3
248.0    2
        ..
259.0    1
264.0    1
294.0    1
230.0    1
240.0    1
Name: chol, Length: 90, dtype: int64


Column Name:  smoke
NaN Count:  115
0.0    7
1.0    1
Name: smoke, dtype: int64


Column Name:  fbs
NaN Count:  3
0.0    110
1.0     10
Name: fbs, dtype: int64


Column Name:  prop
NaN Count:  2
0.0    117
1.0      4
Name: prop, dtype: int64


Column Name:  nitr
NaN Count:  1
0.0    109
1.0     13
Name: nitr, dtype: int64


Column Name:  pro
NaN Count:  1
0.0    111
1.0     11
Name: pro, dtype: int64


Column Name:  proto
NaN Count:  3
75.0     34
100.0    26
125.0    19
50.0     14
150.0    12
25.0      9
175.0     4
200.0     2
Name: proto, dtype: int64


Column Name:  thaldur
NaN Count:  1
10.0    14
7.0     12
9.0     12
12.0     9
13.0     8
8.0      7
6.0      7
14.0     6
11.0     6
4.0      5
3.0      4
18.0     4
15.0     4
19.0     4
2.0      4
17.0     3
16.0     3
5.0      2
20.0     

In [39]:
print("No of rows with nan : ", sum([True for index,row in hungarian_data.iterrows() if any(row.isnull())]))

No of rows with nan :  123


## Filling NaN with Other Values

Here we fill NaN with other values in the column using probability of finding the value in the column as the criteria

In [40]:
cols_to_be_removed = list()

for col in cols_with_nan:
    if hungarian_data[col].isna().sum() > 0.25*hungarian_data.shape[0] and col not in selected_columns:
        cols_to_be_removed.append(col)
        
print(cols_to_be_removed)

['smoke', 'thaltime', 'thalsev', 'thalpul', 'lmt', 'ladprox', 'laddist', 'diag', 'cxmain', 'ramus', 'om1', 'rcaprox', 'rcadist', 'cathef']


In [41]:
# Dropping these columns

hungarian_data = hungarian_data.drop(cols_to_be_removed, axis = 1)

In [42]:
hungarian_ratio = hungarian_data.copy()
hungarian_mean = hungarian_data.copy()
hungarian_mode = hungarian_data.copy()
hungarian_median = hungarian_data.copy()

In [43]:
print("No of rows with nan : ", sum([True for index,row in hungarian_data.iterrows() if any(row.isnull())]))

No of rows with nan :  117


## Imputing with proportion of each value in dataset

In [44]:
cols_with_nan = hungarian_ratio.columns.values[hungarian_ratio.isna().any()]

for col in cols_with_nan:
    print("Col Name: ", col, "")
    
    col_duplicate = hungarian_ratio[col].copy()
    
    for index, val in col_duplicate.iteritems():
        if np.isnan(val):
            print(index, val)
        
    col_probabilities = []
        
    for index, val in col_duplicate.value_counts().iteritems():
        col_probabilities.append(round(val/col_duplicate.value_counts().sum(), 2))
    
    for i in range(len(col_probabilities)):
        if round(sum(col_probabilities), 2) >= 1:
            break
        col_probabilities[i] += 0.01
    
    for i in range(len(col_probabilities)):
        if round(sum(col_probabilities), 2) <= 1:
            break
        col_probabilities[i] -= 0.01
    
    col_values = col_duplicate.value_counts().index.values.tolist()
        
    col_duplicate = col_duplicate.fillna(pd.Series(np.random.choice(col_values, 
                                                                      p=col_probabilities, size=len(col_duplicate))))
        
    for index, val in hungarian_ratio[col].iteritems():
        if col_duplicate[index] != val:
            print("Index: ", index, ", ", col, " duplicate value: ", col_duplicate[index], ", original value: ", val)
            
    hungarian_ratio[col] = col_duplicate.copy()
    
    print(col, "done!!!\n")

Col Name:  chol 
4 nan
31 nan
38 nan
61 nan
85 nan
90 nan
Index:  4 ,  chol  duplicate value:  297.0 , original value:  nan
Index:  31 ,  chol  duplicate value:  201.0 , original value:  nan
Index:  38 ,  chol  duplicate value:  206.0 , original value:  nan
Index:  61 ,  chol  duplicate value:  291.0 , original value:  nan
Index:  85 ,  chol  duplicate value:  207.0 , original value:  nan
Index:  90 ,  chol  duplicate value:  248.0 , original value:  nan
chol done!!!

Col Name:  fbs 
28 nan
75 nan
103 nan
Index:  28 ,  fbs  duplicate value:  0.0 , original value:  nan
Index:  75 ,  fbs  duplicate value:  0.0 , original value:  nan
Index:  103 ,  fbs  duplicate value:  0.0 , original value:  nan
fbs done!!!

Col Name:  prop 
30 nan
40 nan
Index:  30 ,  prop  duplicate value:  0.0 , original value:  nan
Index:  40 ,  prop  duplicate value:  0.0 , original value:  nan
prop done!!!

Col Name:  nitr 
40 nan
Index:  40 ,  nitr  duplicate value:  0.0 , original value:  nan
nitr done!!!

Col N

Index:  25 ,  thal  duplicate value:  6.0 , original value:  nan
Index:  26 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  27 ,  thal  duplicate value:  6.0 , original value:  nan
Index:  28 ,  thal  duplicate value:  6.0 , original value:  nan
Index:  29 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  30 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  31 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  32 ,  thal  duplicate value:  6.0 , original value:  nan
Index:  33 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  34 ,  thal  duplicate value:  7.0 , original value:  nan
Index:  35 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  36 ,  thal  duplicate value:  6.0 , original value:  nan
Index:  37 ,  thal  duplicate value:  6.0 , original value:  nan
Index:  38 ,  thal  duplicate value:  6.0 , original value:  nan
Index:  39 ,  thal  duplicate value:  3.0 , original value:  nan
Index:  40 ,  thal  dupli

In [45]:
print("No of rows with nan : ", sum([True for index,row in hungarian_ratio.iterrows() if any(row.isnull())]))

No of rows with nan :  0


In [46]:
hungarian_ratio.to_csv('datasets/hungarian_large.csv', index=False)

## Replacing with Mean

In [47]:
for col in cols_with_nan:
    col_mean_value = hungarian_mean[col].mean()
    print("Mean of", col, "attribute:", col_mean_value)
    hungarian_mean[col].fillna(value = col_mean_value, inplace = True)
    comparison_array = np.where(hungarian_mean[col] == hungarian_data[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mean of chol attribute: 246.16239316239316
No of changes in col: chol :  6
Mean of fbs attribute: 0.08333333333333333
No of changes in col: fbs :  3
Mean of prop attribute: 0.03305785123966942
No of changes in col: prop :  2
Mean of nitr attribute: 0.10655737704918032
No of changes in col: nitr :  1
Mean of pro attribute: 0.09016393442622951
No of changes in col: pro :  1
Mean of proto attribute: 94.58333333333333
No of changes in col: proto :  3
Mean of thaldur attribute: 10.37704918032787
No of changes in col: thaldur :  1
Mean of slope attribute: 1.9767441860465116
No of changes in col: slope :  80
Mean of thal attribute: 5.125
No of changes in col: thal :  107


In [48]:
# hungarian_mean.to_csv('datasets/hungarian_large.csv', index=False)

## Replacing with Mode

In [49]:
for col in cols_with_nan:
    col_mode_value = hungarian_mode[col].mode()
    print("Mode of", col, "attribute:", col_mode_value)
    hungarian_mode[col].fillna(value = col_mode_value, inplace = True)
    comparison_array = np.where(hungarian_mode[col] == hungarian_data[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mode of chol attribute: 0    207.0
1    223.0
2    224.0
3    260.0
dtype: float64
No of changes in col: chol :  6
Mode of fbs attribute: 0    0.0
dtype: float64
No of changes in col: fbs :  3
Mode of prop attribute: 0    0.0
dtype: float64
No of changes in col: prop :  2
Mode of nitr attribute: 0    0.0
dtype: float64
No of changes in col: nitr :  1
Mode of pro attribute: 0    0.0
dtype: float64
No of changes in col: pro :  1
Mode of proto attribute: 0    75.0
dtype: float64
No of changes in col: proto :  3
Mode of thaldur attribute: 0    10.0
dtype: float64
No of changes in col: thaldur :  1
Mode of slope attribute: 0    2.0
dtype: float64
No of changes in col: slope :  80
Mode of thal attribute: 0    3.0
1    6.0
dtype: float64
No of changes in col: thal :  107


In [50]:
# hungarian_mode.to_csv('datasets/hungarian_large.csv', index=False)

## Replacing with Median

In [51]:
for col in cols_with_nan:
    col_median_value = hungarian_median[col].median()
    print("Mode of", col, "attribute:", col_median_value)
    hungarian_median[col].fillna(value = col_median_value, inplace = True)
    comparison_array = np.where(hungarian_median[col] == hungarian_data[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mode of chol attribute: 234.0
No of changes in col: chol :  6
Mode of fbs attribute: 0.0
No of changes in col: fbs :  3
Mode of prop attribute: 0.0
No of changes in col: prop :  2
Mode of nitr attribute: 0.0
No of changes in col: nitr :  1
Mode of pro attribute: 0.0
No of changes in col: pro :  1
Mode of proto attribute: 100.0
No of changes in col: proto :  3
Mode of thaldur attribute: 10.0
No of changes in col: thaldur :  1
Mode of slope attribute: 2.0
No of changes in col: slope :  80
Mode of thal attribute: 6.0
No of changes in col: thal :  107


In [52]:
# hungarian_median.to_csv('datasets/hungarian_large.csv', index=False)

# Long Beach Datafile

## Finding Errors in Data / Missing Data

In [53]:
# Opens long-beach-va.data, iterates lines, takes in data and stores as a list of rows
longbeach_datafile = open('raw_data/long-beach-va.data', 'r', errors='ignore')

longbeach_wrong_data_lines = list() # Stores datalines with incorrect size of elements
longbeach_wrong_name_lines = list() # Stores datalines with wrong name attribute

i = 1

for line in longbeach_datafile:
    linecontent = line.split()
    if i % 10 == 1:
        if len(linecontent) != 7:
            longbeach_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
    elif i%10 in range(2, 10):
        if len(linecontent) != 8:
            longbeach_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
    elif i%10 == 0:
        if len(linecontent) != 5:
            longbeach_wrong_data_lines.append({'line_no': i, 'line_size': len(linecontent)})
        if linecontent[-1] != 'name':
            longbeach_wrong_name_lines.append({'line_no': i, 'line_size': len(linecontent)})
    i += 1

print("Current i value (no of lines): ", i)

longbeach_datafile.close()

print("Lines with wrong number of elements are: ")
print(longbeach_wrong_name_lines)

print("Lines with wrong name attribute: ")
print(longbeach_wrong_data_lines)

Current i value (no of lines):  2002
Lines with wrong number of elements are: 
[]
Lines with wrong name attribute: 
[{'line_no': 2001, 'line_size': 0}]


## Inference

Here, we see that 1230 lines of data exist amounting to around 123 data points.

## Creating the Dataset

In [54]:
longbeach_datafile = open('raw_data/long-beach-va.data', 'r', errors='ignore')

longbeach_dataset = list()
longbeach_datarow = list()

i = 0

for line in longbeach_datafile:
    i += 1
    if i > 1230:
        break
    linecontent = line.split()
    longbeach_datarow.extend(linecontent)
    if i % 10 == 0:
        row_clone = longbeach_datarow.copy()
        if len(row_clone)== 76:
            longbeach_dataset.append(row_clone)
        else:
            print(row_clone)
        longbeach_datarow.clear()
        
longbeach_datafile.close()

print("No of datapoints: ", len(longbeach_dataset))

df = pd.DataFrame(longbeach_dataset, columns=headers)
# The first version of the dataset is a direct raw conversion of the .data file to .csv file
df.to_csv('datasets/longbeach_raw.csv', index=False) 

del df

No of datapoints:  123


In [55]:
longbeach_data = pd.read_csv('datasets/longbeach_raw.csv')
longbeach_data.head()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name
0,1,0,63,1,1,1,1,-9,4,140,...,2,1,1,1,1,1,1,0.7,5.5,name
1,2,0,44,1,1,1,1,-9,4,130,...,1,1,1,1,1,1,1,0.5,-9.0,name
2,3,0,60,1,1,1,1,-9,4,132,...,2,1,1,1,1,7,2,0.52,4.1,name
3,4,0,55,1,1,1,1,-9,4,142,...,1,1,1,1,1,1,1,0.73,6.5,name
4,5,0,66,1,1,0,0,-9,3,110,...,1,1,1,1,1,1,1,0.73,8.0,name


In [56]:
longbeach_data.describe()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
count,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,...,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0
mean,62.146341,0.0,59.081301,0.98374,0.943089,0.666667,0.626016,-9.0,3.520325,95.96748,...,1.056911,1.422764,1.154472,1.0,1.097561,1.178862,2.195122,1.235772,0.275528,0.560163
std,35.895496,0.0,7.600252,0.126992,0.232619,0.473333,1.564977,0.0,0.771942,64.134456,...,0.232619,0.496019,0.362878,0.0,0.823971,0.713253,2.260344,0.702236,1.922179,7.29738
min,1.0,0.0,35.0,0.0,0.0,0.0,-9.0,-9.0,1.0,-9.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,-9.0,-9.0
25%,31.5,0.0,55.0,1.0,1.0,0.0,1.0,-9.0,3.0,-4.5,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.57,-9.0
50%,62.0,0.0,60.0,1.0,1.0,1.0,1.0,-9.0,4.0,124.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.7,4.4
75%,92.5,0.0,63.0,1.0,1.0,1.0,1.0,-9.0,4.0,139.0,...,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.775,6.0
max,126.0,0.0,77.0,1.0,1.0,1.0,1.0,-9.0,4.0,180.0,...,2.0,2.0,2.0,1.0,10.0,5.0,7.0,5.0,0.89,11.3


In [57]:
longbeach_data = longbeach_data.replace(-9, np.nan)

In [58]:
# Removing ID and Name columns as they are useless

longbeach_data = longbeach_data.drop(['id', 'name'], axis = 1)

In [59]:
# Removing columns with constant data

drop_cols = list()

for col in longbeach_data.columns.values:
    if longbeach_data[col].std() == 0:
        drop_cols.append(col)
        
print("Columns to be dropped: ", drop_cols)
print("No of rows to be dropped: ", len(drop_cols))

Columns to be dropped:  ['ccf', 'ca', 'lvx1']
No of rows to be dropped:  3


In [60]:
# Removing columns with all nan values

for col in longbeach_data.columns.values:
    if np.isnan(longbeach_data[col].std()):
        drop_cols.append(col)
        
print(drop_cols)
print("No of rows to be dropped: ", len(drop_cols))

['ccf', 'ca', 'lvx1', 'pncaden', 'restckm', 'exerckm', 'thalpul', 'earlobe']
No of rows to be dropped:  8


In [61]:
# Dropping all unnecessary columns

longbeach_data = longbeach_data.drop(drop_cols, axis=1)

In [62]:
# Identifying columns with NaN values

cols_with_nan = longbeach_data.columns.values[longbeach_data.isna().any()]
display(cols_with_nan)

array(['relrest', 'trestbps', 'htn', 'chol', 'smoke', 'cigs', 'years',
       'fbs', 'dm', 'famhist', 'ekgmo', 'ekgday', 'ekgyr', 'dig', 'prop',
       'nitr', 'pro', 'diuretic', 'proto', 'thaldur', 'thaltime', 'met',
       'thalach', 'thalrest', 'tpeakbps', 'tpeakbpd', 'dummy', 'trestbpd',
       'exang', 'xhypo', 'oldpeak', 'slope', 'rldv5', 'rldv5e', 'restef',
       'restwm', 'exeref', 'exerwm', 'thal', 'thalsev', 'cathef', 'junk'],
      dtype=object)

In [63]:
nan_count = dict()

for col in cols_with_nan:
    nan_count[col] = longbeach_data[col].isna().sum()
    print('Column Name: ', col)
    print('NaN Count: ', nan_count[col])
    print(longbeach_data[col].value_counts())
    print("\n")

Column Name:  relrest
NaN Count:  3
1.0    104
0.0     16
Name: relrest, dtype: int64


Column Name:  trestbps
NaN Count:  31
120.0    12
130.0    10
110.0     7
140.0     5
150.0     5
136.0     4
142.0     4
160.0     4
126.0     4
170.0     3
124.0     3
122.0     3
128.0     3
152.0     2
144.0     2
154.0     2
125.0     2
132.0     2
116.0     2
146.0     1
102.0     1
112.0     1
106.0     1
158.0     1
135.0     1
134.0     1
178.0     1
138.0     1
104.0     1
180.0     1
0.0       1
96.0      1
Name: trestbps, dtype: int64


Column Name:  htn
NaN Count:  3
0.0    61
1.0    59
Name: htn, dtype: int64


Column Name:  chol
NaN Count:  1
0.0      49
218.0     3
236.0     3
214.0     3
186.0     2
252.0     2
240.0     2
195.0     2
203.0     2
258.0     2
220.0     2
267.0     2
228.0     2
210.0     1
235.0     1
245.0     1
225.0     1
198.0     1
227.0     1
208.0     1
161.0     1
260.0     1
349.0     1
305.0     1
223.0     1
282.0     1
248.0     1
160.0     1
312.0     1


In [64]:
print("No of rows with nan : ", sum([True for index,row in longbeach_data.iterrows() if any(row.isnull())]))

No of rows with nan :  123


## Filling NaN with Other Values

Here we fill NaN with other values in the column using probability of finding the value in the column as the criteria

In [65]:
cols_to_be_removed = list()

for col in cols_with_nan:
    if longbeach_data[col].isna().sum() > 0.25*longbeach_data.shape[0] and col not in selected_columns:
        cols_to_be_removed.append(col)
        
print(cols_to_be_removed)

['dm', 'dig', 'prop', 'nitr', 'pro', 'diuretic', 'thaltime', 'tpeakbps', 'tpeakbpd', 'dummy', 'trestbpd', 'rldv5', 'rldv5e', 'restef', 'restwm', 'exeref', 'exerwm', 'thalsev', 'junk']


In [66]:
# Dropping these columns

longbeach_data = longbeach_data.drop(cols_to_be_removed, axis = 1)

In [67]:
longbeach_ratio = longbeach_data.copy()
longbeach_mean = longbeach_data.copy()
longbeach_mode = longbeach_data.copy()
longbeach_median = longbeach_data.copy()

In [68]:
print("No of rows with nan : ", sum([True for index,row in longbeach_data.iterrows() if any(row.isnull())]))

No of rows with nan :  111


## Imputing with proportion of each value in dataset

In [69]:
cols_with_nan = longbeach_ratio.columns.values[longbeach_ratio.isna().any()]

for col in cols_with_nan:
    print("Col Name: ", col, "")
    
    col_duplicate = longbeach_ratio[col].copy()
    
    for index, val in col_duplicate.iteritems():
        if np.isnan(val):
            print(index, val)
        
    col_probabilities = []
        
    for index, val in col_duplicate.value_counts().iteritems():
        col_probabilities.append(round(val/col_duplicate.value_counts().sum(), 2))
    
    for i in range(len(col_probabilities)):
        if round(sum(col_probabilities), 2) >= 1:
            break
        col_probabilities[i] += 0.01
    
    for i in range(len(col_probabilities)):
        if round(sum(col_probabilities), 2) <= 1:
            break
        col_probabilities[i] -= 0.01
    
    col_values = col_duplicate.value_counts().index.values.tolist()
        
    col_duplicate = col_duplicate.fillna(pd.Series(np.random.choice(col_values, 
                                                                      p=col_probabilities, size=len(col_duplicate))))
        
    for index, val in longbeach_ratio[col].iteritems():
        if col_duplicate[index] != val:
            print("Index: ", index, ", ", col, " duplicate value: ", col_duplicate[index], ", original value: ", val)
            
    longbeach_ratio[col] = col_duplicate.copy()
    
    print(col, "done!!!\n")

Col Name:  relrest 
77 nan
89 nan
99 nan
Index:  77 ,  relrest  duplicate value:  1.0 , original value:  nan
Index:  89 ,  relrest  duplicate value:  0.0 , original value:  nan
Index:  99 ,  relrest  duplicate value:  1.0 , original value:  nan
relrest done!!!

Col Name:  trestbps 
13 nan
23 nan
26 nan
29 nan
30 nan
36 nan
38 nan
42 nan
44 nan
48 nan
51 nan
57 nan
58 nan
60 nan
68 nan
69 nan
72 nan
77 nan
87 nan
89 nan
94 nan
96 nan
109 nan
110 nan
111 nan
112 nan
113 nan
114 nan
115 nan
119 nan
120 nan
Index:  13 ,  trestbps  duplicate value:  154.0 , original value:  nan
Index:  23 ,  trestbps  duplicate value:  120.0 , original value:  nan
Index:  26 ,  trestbps  duplicate value:  116.0 , original value:  nan
Index:  29 ,  trestbps  duplicate value:  120.0 , original value:  nan
Index:  30 ,  trestbps  duplicate value:  120.0 , original value:  nan
Index:  36 ,  trestbps  duplicate value:  124.0 , original value:  nan
Index:  38 ,  trestbps  duplicate value:  124.0 , original value:

Index:  13 ,  proto  duplicate value:  5.0 , original value:  nan
Index:  23 ,  proto  duplicate value:  5.0 , original value:  nan
Index:  26 ,  proto  duplicate value:  5.0 , original value:  nan
Index:  29 ,  proto  duplicate value:  5.0 , original value:  nan
Index:  30 ,  proto  duplicate value:  5.0 , original value:  nan
Index:  36 ,  proto  duplicate value:  5.0 , original value:  nan
Index:  38 ,  proto  duplicate value:  5.0 , original value:  nan
Index:  44 ,  proto  duplicate value:  5.0 , original value:  nan
Index:  51 ,  proto  duplicate value:  5.0 , original value:  nan
Index:  57 ,  proto  duplicate value:  5.0 , original value:  nan
Index:  58 ,  proto  duplicate value:  1.0 , original value:  nan
Index:  60 ,  proto  duplicate value:  5.0 , original value:  nan
Index:  68 ,  proto  duplicate value:  5.0 , original value:  nan
Index:  69 ,  proto  duplicate value:  5.0 , original value:  nan
Index:  72 ,  proto  duplicate value:  5.0 , original value:  nan
Index:  77

Index:  13 ,  thalrest  duplicate value:  84.0 , original value:  nan
Index:  23 ,  thalrest  duplicate value:  68.0 , original value:  nan
Index:  26 ,  thalrest  duplicate value:  72.0 , original value:  nan
Index:  29 ,  thalrest  duplicate value:  65.0 , original value:  nan
Index:  30 ,  thalrest  duplicate value:  65.0 , original value:  nan
Index:  36 ,  thalrest  duplicate value:  62.0 , original value:  nan
Index:  38 ,  thalrest  duplicate value:  66.0 , original value:  nan
Index:  44 ,  thalrest  duplicate value:  95.0 , original value:  nan
Index:  51 ,  thalrest  duplicate value:  75.0 , original value:  nan
Index:  57 ,  thalrest  duplicate value:  77.0 , original value:  nan
Index:  58 ,  thalrest  duplicate value:  70.0 , original value:  nan
Index:  60 ,  thalrest  duplicate value:  75.0 , original value:  nan
Index:  68 ,  thalrest  duplicate value:  84.0 , original value:  nan
Index:  69 ,  thalrest  duplicate value:  73.0 , original value:  nan
Index:  72 ,  thalre

Index:  116 ,  slope  duplicate value:  3.0 , original value:  nan
Index:  119 ,  slope  duplicate value:  2.0 , original value:  nan
Index:  120 ,  slope  duplicate value:  1.0 , original value:  nan
slope done!!!

Col Name:  thal 
0 nan
1 nan
2 nan
3 nan
4 nan
6 nan
7 nan
8 nan
9 nan
10 nan
11 nan
13 nan
14 nan
15 nan
16 nan
17 nan
18 nan
20 nan
21 nan
22 nan
23 nan
26 nan
29 nan
30 nan
31 nan
32 nan
33 nan
35 nan
36 nan
38 nan
39 nan
40 nan
44 nan
46 nan
47 nan
48 nan
49 nan
51 nan
53 nan
54 nan
55 nan
56 nan
57 nan
58 nan
60 nan
61 nan
62 nan
63 nan
64 nan
65 nan
66 nan
68 nan
69 nan
70 nan
72 nan
73 nan
74 nan
76 nan
77 nan
78 nan
80 nan
81 nan
82 nan
83 nan
84 nan
85 nan
86 nan
87 nan
88 nan
89 nan
90 nan
92 nan
93 nan
94 nan
95 nan
96 nan
97 nan
98 nan
99 nan
101 nan
102 nan
103 nan
104 nan
107 nan
108 nan
109 nan
110 nan
111 nan
112 nan
113 nan
114 nan
115 nan
116 nan
117 nan
118 nan
119 nan
120 nan
121 nan
122 nan
Index:  0 ,  thal  duplicate value:  5.0 , original value:  nan

In [70]:
print("No of rows with nan : ", sum([True for index,row in longbeach_ratio.iterrows() if any(row.isnull())]))

No of rows with nan :  0


In [71]:
longbeach_ratio.to_csv('datasets/longbeach_large.csv', index=False)

## Replacing with Mean

In [72]:
for col in cols_with_nan:
    col_mean_value = longbeach_mean[col].mean()
    print("Mean of", col, "attribute:", col_mean_value)
    longbeach_mean[col].fillna(value = col_mean_value, inplace = True)
    comparison_array = np.where(longbeach_mean[col] == longbeach_data[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mean of relrest attribute: 0.8666666666666667
No of changes in col: relrest :  3
Mean of trestbps attribute: 131.33695652173913
No of changes in col: trestbps :  31
Mean of htn attribute: 0.49166666666666664
No of changes in col: htn :  3
Mean of chol attribute: 142.48360655737704
No of changes in col: chol :  1
Mean of smoke attribute: 0.48739495798319327
No of changes in col: smoke :  4
Mean of cigs attribute: 23.74137931034483
No of changes in col: cigs :  7
Mean of years attribute: 22.035087719298247
No of changes in col: years :  9
Mean of fbs attribute: 0.3103448275862069
No of changes in col: fbs :  7
Mean of famhist attribute: 0.47413793103448276
No of changes in col: famhist :  7
Mean of ekgmo attribute: 6.294736842105263
No of changes in col: ekgmo :  28
Mean of ekgday attribute: 16.382978723404257
No of changes in col: ekgday :  29
Mean of ekgyr attribute: 85.06315789473685
No of changes in col: ekgyr :  28
Mean of proto attribute: 4.648936170212766
No of changes in col: pro

In [73]:
# longbeach_mean.to_csv('datasets/longbeach_large.csv', index=False)

## Replacing with Mode

In [74]:
for col in cols_with_nan:
    col_mode_value = longbeach_mode[col].mode()
    print("Mode of", col, "attribute:", col_mode_value)
    longbeach_mode[col].fillna(value = col_mode_value, inplace = True)
    comparison_array = np.where(longbeach_mode[col] == longbeach_data[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mode of relrest attribute: 0    1.0
dtype: float64
No of changes in col: relrest :  3
Mode of trestbps attribute: 0    120.0
dtype: float64
No of changes in col: trestbps :  31
Mode of htn attribute: 0    0.0
dtype: float64
No of changes in col: htn :  3
Mode of chol attribute: 0    0.0
dtype: float64
No of changes in col: chol :  1
Mode of smoke attribute: 0    0.0
dtype: float64
No of changes in col: smoke :  4
Mode of cigs attribute: 0    20.0
dtype: float64
No of changes in col: cigs :  7
Mode of years attribute: 0    0.0
dtype: float64
No of changes in col: years :  9
Mode of fbs attribute: 0    0.0
dtype: float64
No of changes in col: fbs :  7
Mode of famhist attribute: 0    0.0
dtype: float64
No of changes in col: famhist :  7
Mode of ekgmo attribute: 0     4.0
1    10.0
dtype: float64
No of changes in col: ekgmo :  28
Mode of ekgday attribute: 0    20.0
1    23.0
dtype: float64
No of changes in col: ekgday :  29
Mode of ekgyr attribute: 0    86.0
dtype: float64
No of changes in

In [75]:
# longbeach_mode.to_csv('datasets/longbeach_large.csv', index=False)

## Replacing with Median

In [76]:
for col in cols_with_nan:
    col_median_value = longbeach_median[col].median()
    print("Mode of", col, "attribute:", col_median_value)
    longbeach_median[col].fillna(value = col_median_value, inplace = True)
    comparison_array = np.where(longbeach_median[col] == longbeach_data[col], 0, 1)
    print("No of changes in col:", col, ": ", np.sum(comparison_array))

Mode of relrest attribute: 1.0
No of changes in col: relrest :  3
Mode of trestbps attribute: 130.0
No of changes in col: trestbps :  31
Mode of htn attribute: 0.0
No of changes in col: htn :  3
Mode of chol attribute: 196.5
No of changes in col: chol :  1
Mode of smoke attribute: 0.0
No of changes in col: smoke :  4
Mode of cigs attribute: 20.0
No of changes in col: cigs :  7
Mode of years attribute: 24.0
No of changes in col: years :  9
Mode of fbs attribute: 0.0
No of changes in col: fbs :  7
Mode of famhist attribute: 0.0
No of changes in col: famhist :  7
Mode of ekgmo attribute: 6.0
No of changes in col: ekgmo :  28
Mode of ekgday attribute: 17.5
No of changes in col: ekgday :  29
Mode of ekgyr attribute: 85.0
No of changes in col: ekgyr :  28
Mode of proto attribute: 5.0
No of changes in col: proto :  29
Mode of thaldur attribute: 6.15
No of changes in col: thaldur :  29
Mode of met attribute: 6.0
No of changes in col: met :  29
Mode of thalach attribute: 122.5
No of changes in 

In [77]:
# longbeach_median.to_csv('datasets/longbeach_large.csv', index=False)