In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# Load the diabetic readmission data set and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer='../data/diabetic_readmission_data.csv')
df.iloc[:10]

In [None]:
# Print the frame dimensions as (rows, columns).
# Author's recorded result: 101,766 rows and 50 columns.
print((len(df.index), len(df.columns)))

In [None]:
# The default preview elides some columns, so list every column label.
# For a DataFrame, keys() returns the column Index.
df.keys()

In [None]:
# The frame is wide enough that the default preview hides columns and rows.
# Transposing the first 15 rows puts every column on its own line, which gives
# a full overview (author's note: many columns and cells hold missing values).
# Alternative the author left disabled: lift the display limits globally with
#   pd.set_option('display.max_columns', None)
#   pd.set_option('display.max_rows', None)
df.iloc[:15].transpose()

In [None]:
# Column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()

In [None]:
# Summary statistics for the numeric columns, with the quartiles spelled out
# (these are the values describe() uses by default).
print(df.describe(percentiles=[0.25, 0.5, 0.75]))

In [None]:
# Summarise the object and bool columns (count, unique, top value, frequency).
df.describe(
    include=['object', 'bool'],
    exclude=None,
)

In [None]:
# For every object-typed column, print how many cells hold the placeholder '?'.
for column_name, column_values in df.items():
    if column_values.dtype != object:
        continue
    print(column_name, (column_values == '?').sum())

# Frame dimensions for reference (author's note: 101,766 rows and 50 columns).
print(df.shape)

In [None]:
# 'readmitted' is a key column for this analysis, so list every distinct value it takes.
pd.unique(df['readmitted'])

In [None]:
# 'gender' is an important column as well, so list every distinct value it takes.
df.loc[:, 'gender'].unique()

In [None]:
# Count the rows whose gender is recorded as 'Unknown/Invalid'.
print('gender', (df['gender'] == 'Unknown/Invalid').sum())

In [None]:
# Per the author, weight, payer_code and medical_specialty are missing in about
# 96%, 40% and 49% of rows respectively and are not useful for this scenario,
# so they are removed from the frame in place.
df.drop(columns=['weight', 'payer_code', 'medical_specialty'], inplace=True)
df

In [None]:
# Show every row that is an exact duplicate of another row (all columns
# compared, all occurrences kept), ordered by patient_nbr.
# Note: this checks whole rows; it does not test patient_nbr on its own.
df.loc[df.duplicated(keep=False)].sort_values('patient_nbr')

In [None]:
# 'readmitted' is a key column: keep its distinct values and print them.
reamitted_values = pd.unique(df['readmitted'])
print(reamitted_values)

In [None]:
# Re-run the object/bool summary on the current state of the frame.
df.describe(
    include=['object', 'bool'],
    exclude=None,
)

In [None]:
# Recheck: distinct values recorded for repaglinide.
pd.unique(df['repaglinide'])

In [None]:
# Distinct values recorded for citoglipton.
df.loc[:, 'citoglipton'].unique()

In [None]:
# Distinct values recorded for examide.
pd.unique(df['examide'])

In [None]:
# Distinct values recorded for max_glu_serum.
df.loc[:, 'max_glu_serum'].unique()

In [None]:
# Distinct values recorded for A1Cresult.
pd.unique(df['A1Cresult'])

In [None]:
# citoglipton and examide contain only the value 'No' (see the unique() checks
# above), so they carry no information and are dropped.
# drop() is used without inplace=True: the in-place form returns None, and
# assigning that result back would replace df with None.
df = df.drop(columns=['citoglipton', 'examide'])

In [None]:
# Convert each 10-year age bracket label to the numeric upper bound of the
# bracket (e.g. '[70-80)' -> 80). This overwrites the 'age' column.
age_upper_bound = {
    '[{}-{})'.format(low, low + 10): low + 10
    for low in range(0, 100, 10)
}

# Assign the mapped column back explicitly. Calling replace(..., inplace=True)
# on df['age'] is chained assignment, which is deprecated and leaves df
# untouched under pandas Copy-on-Write.
# Values that are not bracket labels (including already-converted numbers when
# the cell is re-run) pass through unchanged.
df['age'] = df['age'].map(lambda label: age_upper_bound.get(label, label))


In [None]:
# Preview the first 15 rows of the current frame.
df.iloc[:15]

In [None]:
# Number of encounters for every (time_in_hospital, readmitted) combination.
# The groupby selection is now aggregated and displayed; previously it was
# built and discarded, and the cell showed the whole frame instead.
df.groupby(['time_in_hospital', 'readmitted'])['readmitted'].count()

In [None]:
# Horizontal bar chart of encounter counts per (time_in_hospital, readmitted) pair.
(
    df.groupby(by=['time_in_hospital', 'readmitted'])['readmitted']
    .count()
    .plot.barh(figsize=(20, 10))
)

In [None]:
# Row labels whose race or any of the three diagnosis columns is the '?' placeholder.
indexes_to_drop1 = df.index[df["race"] == "?"].tolist()
indexes_to_drop2 = df.index[df["diag_1"] == '?'].tolist()
indexes_to_drop3 = df.index[df["diag_2"] == '?'].tolist()
indexes_to_drop4 = df.index[df["diag_3"] == '?'].tolist()

In [None]:
# Merge the four label lists into one set so each row label appears only once.
indexes_to_drop = set().union(
    indexes_to_drop1,
    indexes_to_drop2,
    indexes_to_drop3,
    indexes_to_drop4,
)

In [None]:
# Remove, in place, every row collected in indexes_to_drop.
df.drop(labels=list(indexes_to_drop), axis='index', inplace=True)

In [None]:
# First 15 diag_1 values after the '?' rows were removed.
df['diag_1'].iloc[:15]

In [None]:
# First 15 diag_2 values after the '?' rows were removed.
df.loc[:, 'diag_2'].head(n=15)

In [None]:
# First 15 diag_3 values after the '?' rows were removed.
df['diag_3'].iloc[:15]

In [None]:
# Feature engineering: add a 'readmit' column that collapses 'readmitted' into
# a flag. '<30' and '>30' both become True, 'NO' becomes False.
df['readmit'] = df['readmitted'].map(
    {'<30': True, '>30': True, 'NO': False}
)
df

In [None]:
# Non-null counts of patient_nbr and encounter_id for every
# (time_in_hospital, readmitted) combination.
# The two columns are selected with a list: selecting several columns from a
# groupby with a bare tuple ('a', 'b') is rejected by pandas 2.0 and later.
df.groupby(['time_in_hospital', 'readmitted'])[['patient_nbr', 'encounter_id']].count()

In [None]:
# Non-null patient_nbr count for every (time_in_hospital, gender) combination.
(
    df.groupby(by=['time_in_hospital', 'gender'])['patient_nbr']
    .count()
)

In [None]:
# Summary statistics after cleaning, using describe()'s default column
# selection (include/exclude left at None).
print(df.describe(include=None, exclude=None))

In [None]:
# Visualize the number of encounters per length of stay, split by gender.
# The counts cover every row of df, not only readmitted patients, so the title
# names encounters.
#
# Build one table indexed by time_in_hospital with a column per gender.
# groupby sorts its keys, so the index is the ascending list of stay lengths.
# unstack/reindex fill missing (stay, gender) combinations with 0, which keeps
# both bar series the same length as x and aligned with it. Taking x from
# unique() would give first-appearance order, which does not match.
stay_gender_counts = (
    df.groupby(['time_in_hospital', 'gender'])['patient_nbr']
    .count()
    .unstack('gender', fill_value=0)
    .reindex(columns=['Male', 'Female'], fill_value=0)
)
x = list(stay_gender_counts.index)
y_male = list(stay_gender_counts['Male'])
y_female = list(stay_gender_counts['Female'])

print(x)
print(y_male)
print(y_female)

# Create the figure at its final size before drawing: calling plt.figure()
# after the bars are drawn only opens a second, empty figure.
fig, ax = plt.subplots(figsize=(20, 10))

# Narrow bars so the two genders sit side by side at each stay length.
width = 0.25
ax.bar(x, y_male, width, color='orange', label='Male')
ax.bar([i + width for i in x], y_female, width, color='green', label='Female')

# Axis labels, title and legend.
ax.set_xlabel('Time in Hospital (days)')
ax.set_ylabel('Total Count')
ax.set_title('Number of Encounters by Time in Hospital and Gender')
ax.legend()

# Output the final plot
plt.show()

In [None]:
# Visualize encounter counts per age group, split by readmission status.
# Assumes 'age' already holds numeric values (the upper bound of each 10-year
# bracket, set earlier in the notebook), because the bar offsets below add
# width to each age value.
#
# Build one table indexed by age with a column per readmitted label.
# groupby sorts its keys, so the index is ascending. unstack/reindex fill
# missing (age, status) combinations with 0, which keeps every bar series the
# same length as x and aligned with it.
age_readmit_counts = (
    df.groupby(['age', 'readmitted'])['patient_nbr']
    .count()
    .unstack('readmitted', fill_value=0)
    .reindex(columns=['<30', '>30', 'NO'], fill_value=0)
)
x = list(age_readmit_counts.index)
y_1 = age_readmit_counts['<30']
y_2 = age_readmit_counts['>30']
y_3 = age_readmit_counts['NO']

print(x)
print(y_1)
print(y_2)
print(y_3)

fig, ax = plt.subplots()

# Three bars per age group, placed side by side.
width = 2.5
ax.bar(x, y_1, width, color='red', label='<30')
ax.bar([i + width for i in x], y_2, width, color='green', label='>30')
ax.bar([i + width * 2 for i in x], y_3, width, color='orange', label='NO')

# Axis labels (the x axis is age, not time in hospital), title and legend.
ax.set_xlabel('Age (upper bound of 10-year bracket)')
ax.set_ylabel('Total count')
ax.set_title('Readmitted Times by Age')
ax.legend()

# Output the final plot
plt.show()

In [1]:
# Plot the 20 most frequent diag_1 codes. value_counts() returns counts in
# descending order, so the first 20 entries are the most common codes.
var = df['diag_1'].value_counts()
ax = var.head(20).plot(kind='barh', figsize=(20, 10))

# Label through the Axes methods. Assigning to plt.ylabel / plt.xlabel would
# overwrite the pyplot functions with strings and leave the plot unlabelled.
ax.set_ylabel('Diagnosis ID')
# The bar lengths are raw occurrence counts, not rates.
ax.set_xlabel('Count')
plt.show()

NameError: name 'df' is not defined