# FINAL YEAR PROJECT

Importing libraries and loading dataset

In [None]:
#Loading libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

: 

In [None]:
# Load the diabetic dataset from the local CSV file into a DataFrame.
csv_path = "./dataset/diabetic_data.csv"
df = pd.read_csv(csv_path)

: 

### Data Analysis

In [None]:
# Preview the first 10 rows, transposed so that each column is displayed as a row.
df.head(10).T

: 

In [None]:
# Check the shape of the dataset (number of rows, number of columns)
df.shape

: 

In [None]:
# Check the data type pandas assigned to each column
df.dtypes

: 

In [None]:
# Summary statistics from describe(), transposed so that each described column is a row.
df.describe().T

: 

## Data Exploration and Cleansing

In [None]:
# Bar chart of how many rows fall into each `readmitted` category.
target_ax = sns.countplot(data=df, x="readmitted", palette="pastel", edgecolor=".3")
plt.show()

: 

The target variable is the patient's hospital readmission status, with the focus on readmission within 30 days. The variable has three categories: "<30", ">30", and "NO" (no readmission). To simplify the problem, we convert this multi-class classification task into a binary classification task.

In [None]:
# Binarise the target: 1 = readmitted within 30 days ('<30'),
# 0 = not readmitted within 30 days ('>30' or 'NO').
# The previous mapping put '<30' into class 0 and '>30' into class 1, which
# contradicts the stated goal of predicting readmission within 30 days.
readmission_codes = {'NO': 0, '>30': 0, '<30': 1}

# The explicit cast guarantees an integer column regardless of how the pandas
# version in use infers the dtype after replace(); it also fails loudly if an
# unexpected label is present.
df['readmitted'] = df['readmitted'].replace(readmission_codes).astype(int)
df['readmitted'].value_counts()

: 

In [None]:
# Pie chart of the class balance of the target.
# value_counts() orders the classes by frequency, so the slice labels are taken
# from the counted index itself instead of a hard-coded ['0', '1'] list that
# would be wrong whenever class 1 happens to be the more frequent one.
class_counts = df.readmitted.value_counts()
class_counts.plot.pie(autopct="%1.2f%%", labels=class_counts.index.astype(str))
plt.show()

: 

In [None]:
# For every object-dtype column, print how many entries equal the '?' placeholder.
object_columns = [name for name in df.columns if df[name].dtype == object]
for name in object_columns:
    print(name, (df[name] == '?').sum())

: 

In [None]:
# Treat the '?' placeholder as a real missing value everywhere.
df = df.replace('?', np.nan)

# Fill missing race with the most frequent race. The result is assigned back
# instead of calling fillna(inplace=True) on df["race"]: under pandas
# Copy-on-Write an in-place edit of a selected column does not reach df.
df["race"] = df["race"].fillna(df["race"].mode()[0])

# Dropping rows with unknown gender: mark 'Unknown/Invalid' as missing, then
# drop every row whose gender is missing (same assign-back pattern as above).
df["gender"] = df["gender"].replace('Unknown/Invalid', np.nan)
df = df.dropna(subset=['gender'], how='all')

# Number of missing race values remaining (expected to be 0 after the fill).
df["race"].isnull().sum()

: 

In [None]:
# Number of distinct (non-missing) values in every column.
unique_counts = df.nunique()

# Order the columns from the fewest to the most distinct values
# (ascending order; the earlier comment incorrectly said descending).
unique_counts_sorted = unique_counts.sort_values(ascending=True)

# Print the sorted unique value counts
print(unique_counts_sorted)

: 

Therefore, we will drop some of these attributes based on:
1) the amount of missing values
2) the number of unique values
3) whether the attribute can logically affect readmission (e.g. patient ID)

In [None]:
# Columns that are removed from the DataFrame before further processing.
drop_list = [
    'examide',
    'citoglipton',
    'weight',
    'encounter_id',
    'patient_nbr',
    'payer_code',
    'medical_specialty',
]
df = df.drop(columns=drop_list)

: 

### Determining Numerical and Categorical Columns

In [None]:
# Find numerical columns: start from every int64-typed column.
int_columns = df.select_dtypes(include=['int64']).columns

# These columns hold numbers that represent a category (refer to the ID
# mapping), plus the target itself, so they are not numerical features.
not_numeric = {
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id',
    'readmitted',
}

# Filtering (rather than list.remove) does not raise ValueError when one of
# the names is not int64-typed, e.g. if `readmitted` is still stored as object.
num_cols = [col for col in int_columns if col not in not_numeric]

len(num_cols), num_cols

: 

In [None]:
# Categorical columns: every object-dtype column, followed by the integer
# columns whose numbers stand for categories and the target column.
cat_cols = [
    *df.select_dtypes(include=['object']).columns,
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id',
    'readmitted',
]
len(cat_cols), cat_cols

: 

##### Correlation between values

In [None]:
# Annotated heatmap of the pairwise correlations between the numerical columns.
corr_matrix = df[num_cols].corr()
f, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    ax=ax,
    annot=True,
    fmt='.2f',
    cmap="coolwarm",
    linewidths=0.5,
    linecolor="black",
)
plt.show()

: 

### Value Distribution

In [None]:
# Print the names of the first 24 columns. Slicing the column index replaces
# the manual counter, which kept iterating over every remaining column.
for column_name in df.columns[:24]:
    print(column_name)

: 

In [None]:
# Features to inspect one by one. A list (not a set) is used so that the
# reports and plots come out in the same, declared order on every run; set
# iteration order for strings can change between interpreter sessions.
Visualising_list = [
    'race', 'gender', 'age',
    'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
    'time_in_hospital', 'num_lab_procedures', 'num_procedures',
    'num_medications', 'number_outpatient', 'number_emergency',
    'number_inpatient',
]

for feature in Visualising_list:
    # Text summary: separator line, feature name, then the value frequencies.
    print(f"\n-------------------------------------------------------------------------------------------------------\n{feature}")
    print(df[feature].value_counts())

    # Horizontal count plot of the feature, split by the readmission label.
    plt.figure(figsize=(9, 5))
    sns.countplot(y=df[feature], hue=df.readmitted).set_title(f'{feature} VS. Readmission')
    plt.show()

: 

In [None]:
# Mean of the `readmitted` column for each insulin category, to see how
# the insulin status relates to the readmission label.
df.groupby("insulin")["readmitted"].mean()

: 

In [None]:
# Frequencies of each insulin category, then the same counts split by label.
print(df["insulin"].value_counts())
insulin_ax = sns.countplot(data=df, x="insulin", hue="readmitted")
insulin_ax.set_title('Insulin Consumption VS. Readmission')
plt.show()

: 

## Feature engineering

In [None]:
# Keep only the rows whose discharge disposition code is NOT one of
# 11, 13, 14, 19, 20, 21. No re-encoding into fewer categories happens here;
# this statement only filters rows.
# NOTE(review): presumably these codes mean the patient died or was sent to
# hospice, so a readmission is not possible — confirm against the ID mapping.
df = df.loc[~df.discharge_disposition_id.isin([11,13,14,19,20,21])]

: 

### Removing Outliers

In [None]:
# One box plot per numerical column on a 2 x 4 grid.
# Pairing the axes with the column names via zip avoids the IndexError the
# manual counter raised when num_cols holds fewer than 8 entries.
fig, ax = plt.subplots(nrows=2, ncols=4, figsize=(16, 8))
for axis, col in zip(ax.flat, num_cols):
    # A single fill colour is passed as `color`; `palette` is meant to be
    # used together with `hue` and warns otherwise in recent seaborn versions.
    sns.boxplot(x=df[col], color="#7FFFD4", ax=axis)
plt.show()
# Fun fact: points beyond 1.5 x IQR from the quartiles are drawn as outliers.

: 

In [None]:
# Confirm the container type of num_cols.
type(num_cols)

: 

In [None]:
# Shape of the data at this point, for comparison with the shape after outlier removal.
df.shape

: 

In [None]:
# Disabled alternative: IQR-based outlier removal with fences at 3 x IQR.
# It is kept inside a bare string literal, so none of it executes; the cell
# only evaluates to that string.
"""for col in num_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 3 * IQR
        upper_bound = Q3 + 3 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

df.shape
#IQR (76461, 43)"""

: 

In [None]:
from sklearn.ensemble import IsolationForest

# Flag outliers on the numerical columns with an Isolation Forest.
# contamination=0.03 asks for about 3% of the rows to be flagged.
# random_state is fixed (42, the seed used elsewhere in this notebook) so the
# same rows are removed on every run; without it the result changes per run.
clf = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.03,
    random_state=42,
)

# fit_predict fits and labels in one call: -1 marks an outlier, 1 an inlier.
outliers = clf.fit_predict(df[num_cols])

# Keep the inliers. .copy() makes df an independent frame, so the column
# assignments that follow do not trigger SettingWithCopyWarning.
df = df[outliers != -1].copy()
df.shape

: 

In [None]:
# Convert any remaining '?' placeholders to NaN, then count missing race values.
df = df.replace('?', np.nan)
df["race"].isna().sum()

: 

In [None]:
# Print the number of missing values in each of the three diagnosis columns.
for diag_name in ('diag_1', 'diag_2', 'diag_3'):
    print(df[diag_name].isnull().sum())

: 

In [None]:
# Fill missing diagnosis codes with the string marker 'NaN'.
# The filled column is assigned back: fillna(inplace=True) on df[col] is a
# chained operation that does not update df under pandas Copy-on-Write.
for diag_name in ('diag_1', 'diag_2', 'diag_3'):
    df[diag_name] = df[diag_name].fillna('NaN')

# Number of distinct codes per diagnosis column after the fill.
for diag_name in ('diag_1', 'diag_2', 'diag_3'):
    print(df[diag_name].nunique())

: 

In [None]:
# Frequency of each raw diag_1 code.
df['diag_1'].value_counts()

: 

In [None]:
# Turn the diagnosis codes into strings that can be parsed as numbers.
diag_cols = ['diag_1', 'diag_2', 'diag_3']
for col in diag_cols:
    codes = df[col]

    # regex=False makes these plain-text substitutions regardless of the
    # pandas version's default for `regex`.
    # NOTE(review): replacing the letter with '0' turns e.g. 'E909' into
    # '0909', i.e. the number 909, which lands in a regular numeric ICD-9
    # range afterwards — confirm that this is the intended treatment of
    # E and V codes.
    codes = codes.str.replace('E', '0', regex=False)
    codes = codes.str.replace('V', '0', regex=False)

    # The string marker used for missing diagnoses becomes -1.
    codes = codes.str.replace('NaN', '-1', regex=False)

    # Collapse every 250.xx sub-code into plain '250', because values such as
    # 250.0X are hard to encode. A prefix match is used so that only codes
    # starting with 250 are affected (a substring match would also hit any
    # code that merely contains '250'); na=False treats missing entries as
    # non-matching instead of producing an unusable mask.
    is_250 = codes.str.startswith('250', na=False)
    df[col] = codes.mask(is_250, '250')

: 

In [None]:
# Sanity check: print every diag_1 entry that still equals one of these
# raw values.
raw_values = ["?", "NaN", "E", "V57", "250.03"]
for code in df['diag_1']:
    if code in raw_values:
        print(code)

: 

In [None]:
# Python type of a diag_1 entry before the cast to float.
# .iloc[0] takes the first remaining row by position; df['diag_1'][1] is a
# lookup by index label 1 and raises KeyError if that row was filtered out.
type(df['diag_1'].iloc[0])

: 

In [None]:
# Cast the three cleaned diagnosis columns from strings to floats.
df = df.astype({name: float for name in diag_cols})

: 

In [None]:
# Python type of a diag_1 entry after the cast to float.
# .iloc[0] takes the first remaining row by position; df['diag_1'][1] is a
# lookup by index label 1 and raises KeyError if that row was filtered out.
type(df['diag_1'].iloc[0])

: 

In [None]:
# Encode A1Cresult: 'Norm' -> 0, '>7' / '>8' -> 1, 'None' -> -99.
# Recent pandas versions list "None" among the default NA strings of
# read_csv, so that category can arrive as NaN; it is restored first so it
# is encoded as -99 instead of silently staying NaN after map().
a1c_codes = {'Norm': 0, '>7': 1, '>8': 1, 'None': -99}
df['A1Cresult'] = df['A1Cresult'].fillna('None').map(a1c_codes)

: 

In [None]:
# Encode max_glu_serum: 'Norm' -> 0, '>200' / '>300' -> 1, 'None' -> -99.
# Recent pandas versions list "None" among the default NA strings of
# read_csv, so that category can arrive as NaN; it is restored first so it
# is encoded as -99 instead of silently staying NaN after map().
glu_codes = {'Norm': 0, '>200': 1, '>300': 1, 'None': -99}
df['max_glu_serum'] = df['max_glu_serum'].fillna('None').map(glu_codes)

: 

*** ICD CODES FROM WIKIPEDIA ***

- List of ICD-9 codes 001–139: infectious and parasitic diseases
- List of ICD-9 codes 140–239: neoplasms
- List of ICD-9 codes 240–279: endocrine, nutritional and metabolic diseases, and immunity disorders
- List of ICD-9 codes 280–289: diseases of the blood and blood-forming organs
- List of ICD-9 codes 290–319: mental disorders
- List of ICD-9 codes 320–389: diseases of the nervous system and sense organs
- List of ICD-9 codes 390–459: diseases of the circulatory system
- List of ICD-9 codes 460–519: diseases of the respiratory system
- List of ICD-9 codes 520–579: diseases of the digestive system
- List of ICD-9 codes 580–629: diseases of the genitourinary system
- List of ICD-9 codes 630–679: complications of pregnancy, childbirth, and the puerperium
- List of ICD-9 codes 680–709: diseases of the skin and subcutaneous tissue
- List of ICD-9 codes 710–739: diseases of the musculoskeletal system and connective tissue
- List of ICD-9 codes 740–759: congenital anomalies
- List of ICD-9 codes 760–779: certain conditions originating in the perinatal period
- List of ICD-9 codes 780–799: symptoms, signs, and ill-defined conditions
- List of ICD-9 codes 800–999: injury and poisoning
- List of ICD-9 codes E and V codes: external causes of injury and supplemental classification

In [None]:
# (lower bound inclusive, upper bound exclusive, group name) for each ICD-9
# chapter that gets its own label. Half-open ranges leave no gap between
# neighbouring chapters, so a fractional code such as 459.9 stays in its
# chapter instead of falling through to 'Other'.
_ICD9_GROUPS = (
    (390, 460, 'Diseases of the Circulatory System'),
    (460, 520, 'Diseases of the Respiratory System'),
    (520, 580, 'Diseases of the Digestive System'),
    (580, 630, 'Diseases of the Genitourinary System'),
    (630, 680, 'Complications of Pregnancy, Childbirth, and the Puerperium'),
    (680, 710, 'Diseases of the Skin and Subcutaneous Tissue'),
    (710, 740, 'Diseases of the Musculoskeletal System and Connective Tissue'),
    (740, 760, 'Congenital Anomalies'),
    (760, 780, 'Certain Conditions Originating in the Perinatal Period'),
    (780, 800, 'Symptoms, Signs, and Ill-Defined Conditions'),
    (800, 1000, 'Injury and Poisoning'),
)


def assign_category(value):
    """Return the diagnosis group name for a numeric ICD-9 code.

    Args:
        value: The diagnosis code as a number (int or float).

    Returns:
        The name of the group whose range contains ``value``. Anything
        outside 390 up to (but not including) 1000 returns ``'Other'``;
        that covers codes below 390, negative markers such as -1, and NaN.
    """
    for lower, upper, group in _ICD9_GROUPS:
        if lower <= value < upper:
            return group
    return 'Other'

: 

In [None]:
# Replace each numeric diagnosis code with the name of its diagnosis group.
for diag_name in diag_cols:
    df[diag_name] = df[diag_name].map(assign_category)

: 

In [None]:
# Group frequencies for the primary diagnosis, then the counts split by label.
print(df["diag_1"].value_counts())
diag_1_ax = sns.countplot(data=df, y="diag_1", hue="readmitted")
diag_1_ax.set_title('Primary Diagnosis VS. Readmission')
plt.show()

: 

In [None]:
# Group frequencies for the secondary diagnosis, then the counts split by label.
print(df["diag_2"].value_counts())
diag_2_ax = sns.countplot(data=df, y="diag_2", hue="readmitted")
diag_2_ax.set_title('Secondary Diagnosis VS. Readmission')
plt.show()

: 

In [None]:
# Group frequencies for the additional secondary diagnosis, then the counts split by label.
print(df["diag_3"].value_counts())
diag_3_ax = sns.countplot(data=df, y="diag_3", hue="readmitted")
diag_3_ax.set_title('Additional Secondary Diagnosis VS. Readmission')
plt.show()

: 

### Encoding

In [None]:
# Names of the medication columns that are encoded together below.
drugs = [
    'metformin',
    'repaglinide',
    'nateglinide',
    'chlorpropamide',
    'glimepiride',
    'glipizide',
    'glyburide',
    'pioglitazone',
    'rosiglitazone',
    'acarbose',
    'miglitol',
    'insulin',
    'glyburide-metformin',
    'tolazamide',
    'metformin-pioglitazone',
    'metformin-rosiglitazone',
    'glimepiride-pioglitazone',
    'glipizide-metformin',
    'troglitazone',
    'tolbutamide',
    'acetohexamide',
]

: 

In [None]:
# Binary-encode every medication column: 'No' becomes 0, while 'Steady',
# 'Up' and 'Down' all become 1.
drug_codes = {'No': 0, 'Steady': 1, 'Up': 1, 'Down': 1}
for drug in drugs:
    df[drug] = df[drug].replace(drug_codes).astype(int)

: 

In [None]:
# Columns holding the admission type, discharge disposition and admission source IDs.
columns_ids = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']

# Step 1: one-hot encode race; the new columns get the prefix "enc".
race_encoded = pd.get_dummies(df, columns=['race'], prefix=["enc"])

# Step 2: cast the ID columns to strings and one-hot encode them as well.
race_encoded[columns_ids] = race_encoded[columns_ids].astype('str')
one_hot_data = pd.get_dummies(race_encoded, columns=columns_ids)

# NOTE(review): the result is stored in `one_hot_data`; `df` itself is not
# changed by this cell — confirm which of the two frames later steps should use.

: 

In [None]:
# Print the column dtypes and non-null counts of df.
df.info()

: 

In [None]:
# Map the age brackets '[0-10)' ... '[90-100)' to the integers 1 ... 10.
age_codes = {
    '[' + str(10 * decade) + '-' + str(10 * (decade + 1)) + ')': decade + 1
    for decade in range(10)
}
df['age'] = df['age'].replace(age_codes).astype(int)
df['age'].value_counts()

: 

In [None]:
# For testing purposes
#df.to_csv('./dataset/PreprocessedData.csv')

: 

**Label Encoding**

In [None]:
# Value counts of gender before label encoding.
df['gender'].value_counts()

: 

In [None]:
# Value counts of race before label encoding.
df['race'].value_counts()

: 

In [None]:
# Value counts of change before label encoding.
df['change'].value_counts()

: 

In [None]:
# Value counts of diabetesMed before label encoding.
df['diabetesMed'].value_counts()

: 

In [None]:
# Value counts of diag_1 before label encoding.
df['diag_1'].value_counts()

: 

In [None]:
from sklearn.preprocessing import LabelEncoder

# One fitted encoder is kept per column so the integer codes can later be
# turned back into the original labels with inverse_transform. Refitting a
# single shared encoder for every column discarded all but the last mapping.
label_encoders = {}
for col in ['gender', 'race', 'change', 'diabetesMed']:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# The three diagnosis columns hold values from the same set of group names,
# so one encoder is fitted on the union of their values. Fitting each column
# separately can give the same group a different integer in each column
# whenever a group is absent from one of them.
diagnosis_columns = ['diag_1', 'diag_2', 'diag_3']
le = LabelEncoder()
le.fit(pd.unique(df[diagnosis_columns].to_numpy().ravel()))
for col in diagnosis_columns:
    df[col] = le.transform(df[col])
    label_encoders[col] = le

: 

In [None]:
# Value counts of diag_1 after label encoding.
df['diag_1'].value_counts()

: 

In [None]:
# Value counts of gender after label encoding.
df['gender'].value_counts()

: 

In [None]:
# Value counts of race after label encoding.
df['race'].value_counts()

: 

In [None]:
# Value counts of change after label encoding.
df['change'].value_counts()

: 

In [None]:
# Value counts of diabetesMed after label encoding.
df['diabetesMed'].value_counts()

: 

In [None]:
# Print every column of df, one Series after another.
for _, column in df.items():
    print(column)


: 

In [None]:
# Print the diag_1 column.
print(df['diag_1'])

: 

In [None]:
from sklearn.utils import resample

# Split the rows by target class.
not_readmitted = df[df.readmitted == 0]
readmitted = df[df.readmitted == 1]

# Balance the classes by undersampling whichever class is larger.
# Picking the majority dynamically avoids the ValueError that
# resample(replace=False) raises when asked for more rows than are available,
# which happened whenever class 0 was not the larger class.
if len(not_readmitted) >= len(readmitted):
    majority, minority = not_readmitted, readmitted
else:
    majority, minority = readmitted, not_readmitted

majority_sampled = resample(
    majority,
    replace=False,             # sample without replacement
    n_samples=len(minority),   # match the size of the smaller class
    random_state=42,           # fixed seed for a reproducible sample
)

downsampled = pd.concat([majority_sampled, minority])
df = downsampled

# Last expression of the cell, so the balanced class counts are displayed.
df.readmitted.value_counts()

: 

In [None]:
# Write the encoded, class-balanced data to disk.
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first
# column by default; confirm whether readers of this file expect that column
# before switching to index=False.
df.to_csv('./dataset/Encoded_binary.csv')

: 