In [1]:
import os
import pandas as pd
import numpy as np
import warnings

In [2]:
warnings.filterwarnings('ignore') #To ignore un-necessary warning

### Listing the data files in the `diabetes-data` folder

In [3]:
# os.listdir() returns entries in arbitrary, platform-dependent order; sort the
# listing so the files are always read (and later concatenated) in the same order.
file_list = sorted(os.listdir('diabetes-data'))

# Keep only the entries whose name starts with 'd'.
file_list_filtered = [s for s in file_list if s.startswith('d')]
file_list_filtered[:5]

['data-01', 'data-02', 'data-03', 'data-04', 'data-05']

### Reading the files and combining them into a single DataFrame

In [4]:
# Read every selected file into its own frame, tag each row with the file it
# came from, and combine everything in a single concat at the end.
# (DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all previously loaded rows on every iteration.)
frames = []

for data_file in file_list_filtered:
    # The files have no header row, so the columns get integer labels 0..3.
    temp_df = pd.read_table('diabetes-data/' + data_file, header=None)
    temp_df['data-file'] = data_file
    frames.append(temp_df)

if not frames:
    # Fail here with a clear message instead of leaving df unusable for the
    # cells that follow.
    raise FileNotFoundError("No data files starting with 'd' were found in 'diabetes-data'")

df = pd.concat(frames, ignore_index=True)

In [5]:
# Preview the first five rows of the combined dataframe.
df.head(n=5)

Unnamed: 0,0,1,2,3,data-file
0,04-21-1991,9:09,58,100,data-01
1,04-21-1991,9:09,33,9,data-01
2,04-21-1991,9:09,34,13,data-01
3,04-21-1991,17:08,62,119,data-01
4,04-21-1991,17:08,33,7,data-01


In [6]:
# Count the null values in each column. sum must be *called*; without the
# parentheses the cell only displays the bound method object.
df.isna().sum()

0            33
1             0
2             0
3            33
data-file     0
dtype: int64

In [7]:
# Share of non-null entries per column, expressed as a percentage of all rows.
df.count().div(len(df)).mul(100)

0             99.887487
1            100.000000
2            100.000000
3             99.887487
data-file    100.000000
dtype: float64

In [8]:
# Discard every row that contains at least one missing value, then confirm
# that no nulls remain in any column.
df = df.dropna(axis=0, how='any')
df.isna().sum()

0            0
1            0
2            0
3            0
data-file    0
dtype: int64

In [9]:
# Per-column tally of the non-null entries left after the clean-up.
df.notna().sum()

0            29264
1            29264
2            29264
3            29264
data-file    29264
dtype: int64

In [10]:
# Join the date column (0) and the time column (1) into one 'timestamp' string
# column, which is converted to a real timestamp later on. The two source
# columns are dropped once they have been merged.
df = df.assign(timestamp=df[0] + " " + df[1]).drop(columns=[0, 1])
df.head()

Unnamed: 0,2,3,data-file,timestamp
0,58,100,data-01,04-21-1991 9:09
1,33,9,data-01,04-21-1991 9:09
2,34,13,data-01,04-21-1991 9:09
3,62,119,data-01,04-21-1991 17:08
4,33,7,data-01,04-21-1991 17:08


In [11]:
# Give the two remaining integer-labelled columns the names used in the
# Readme.md file of the diabetes dataset.
column_names = {2: 'code', 3: 'value'}
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,code,value,data-file,timestamp
0,58,100,data-01,04-21-1991 9:09
1,33,9,data-01,04-21-1991 9:09
2,34,13,data-01,04-21-1991 9:09
3,62,119,data-01,04-21-1991 17:08
4,33,7,data-01,04-21-1991 17:08


In [12]:
# Lookup table from the numeric event code to its description, as listed in
# the dataset's Readme.md.
decipher = {
    # Insulin doses
    33: "Regular insulin dose",
    34: "NPH insulin dose",
    35: "UltraLente insulin dose",
    # Blood glucose measurements
    48: "Unspecified blood glucose measurement",
    57: "Unspecified blood glucose measurement",
    58: "Pre-breakfast blood glucose measurement",
    59: "Post-breakfast blood glucose measurement",
    60: "Pre-lunch blood glucose measurement",
    61: "Post-lunch blood glucose measurement",
    62: "Pre-supper blood glucose measurement",
    63: "Post-supper blood glucose measurement",
    64: "Pre-snack blood glucose measurement",
    # Symptoms
    65: "Hypoglycemic symptoms",
    # Meals
    66: "Typical meal ingestion",
    67: "More-than-usual meal ingestion",
    68: "Less-than-usual meal ingestion",
    # Exercise
    69: "Typical exercise activity",
    70: "More-than-usual exercise activity",
    71: "Less-than-usual exercise activity",
    # Anything else
    72: "Unspecified special event",
}

In [13]:
# Replace each numeric code with its description from `decipher`.
# Series.map accepts the dict directly: codes that are not keys of the dict
# come out as NaN, so they are still counted by isna() and removed by dropna()
# exactly like the None values a hand-written lookup lambda would produce.
df['code'] = df['code'].map(decipher)
df.head()

Unnamed: 0,code,value,data-file,timestamp
0,Pre-breakfast blood glucose measurement,100,data-01,04-21-1991 9:09
1,Regular insulin dose,9,data-01,04-21-1991 9:09
2,NPH insulin dose,13,data-01,04-21-1991 9:09
3,Pre-supper blood glucose measurement,119,data-01,04-21-1991 17:08
4,Regular insulin dose,7,data-01,04-21-1991 17:08


In [14]:
# Number of missing entries per column after the code lookup.
df.isnull().sum()

code         121
value          0
data-file      0
timestamp      0
dtype: int64

In [15]:
# Remove the rows that still contain a null in any column, then verify that
# every column now reports zero missing values.
df = df.dropna()
df.isnull().sum()

code         0
value        0
data-file    0
timestamp    0
dtype: int64

In [16]:
# Per-column tally of the non-null entries that remain in the dataset.
df.notna().sum()

code         29143
value        29143
data-file    29143
timestamp    29143
dtype: int64

In [17]:
# Convert the string timestamp into a datetime and then into POSIX time
# (whole seconds since 1970-01-01).
#
# The strings have the form "MM-DD-YYYY H:MM" (e.g. "04-21-1991 9:09"), so the
# format is spelled out instead of leaving pandas to guess between month-first
# and day-first. Values that do not match become NaT because of errors='coerce'.
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%m-%d-%Y %H:%M', errors='coerce')

# Vectorised epoch conversion: subtract the epoch and floor-divide by one second.
# NOTE(review): rows whose timestamp could not be parsed are kept and end up as
# NaN here (which also makes the column float) - decide whether to drop them.
df['timestamp'] = (df['timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
df.head()

Unnamed: 0,code,value,data-file,timestamp
0,Pre-breakfast blood glucose measurement,100,data-01,672224940.0
1,Regular insulin dose,9,data-01,672224940.0
2,NPH insulin dose,13,data-01,672224940.0
3,Pre-supper blood glucose measurement,119,data-01,672253680.0
4,Regular insulin dose,7,data-01,672253680.0


In [18]:
# Order the rows by timestamp (ascending) and renumber the index from zero.
df = df.sort_values('timestamp', ignore_index=True)
df.head()

Unnamed: 0,code,value,data-file,timestamp
0,NPH insulin dose,20,data-68,575452800.0
1,Pre-breakfast blood glucose measurement,134,data-68,575452800.0
2,Pre-lunch blood glucose measurement,158,data-68,575467200.0
3,Pre-supper blood glucose measurement,258,data-68,575488800.0
4,Pre-breakfast blood glucose measurement,115,data-68,575539200.0


In [19]:
# Drop the 'data-file' column and rearrange the remaining columns.
# DataFrame.drop returns a new frame, so its result has to be assigned;
# calling it as a bare statement leaves df unchanged.
df = df.drop(columns='data-file')
df = df[['value', 'code', 'timestamp']]
df.head()

Unnamed: 0,value,code,timestamp
0,20,NPH insulin dose,575452800.0
1,134,Pre-breakfast blood glucose measurement,575452800.0
2,158,Pre-lunch blood glucose measurement,575467200.0
3,258,Pre-supper blood glucose measurement,575488800.0
4,115,Pre-breakfast blood glucose measurement,575539200.0


In [20]:
# Export step, intentionally left disabled. Uncomment to write the cleaned frame to disk.
# Note: to_csv does not create the './datasets' folder, and unless index=False is
# passed the row index is written as an extra unnamed first column.
#df.to_csv('./datasets/prefinal.csv') # Saving this DataFrame as csv file