In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set()

In [2]:
raw_data = pd.read_csv('cirrhosis.csv')
raw_data.head()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


'Status' is the target. Let's move it to the end of the dataframe.

In [3]:
raw_data.columns.values

array(['ID', 'N_Days', 'Status', 'Drug', 'Age', 'Sex', 'Ascites',
       'Hepatomegaly', 'Spiders', 'Edema', 'Bilirubin', 'Cholesterol',
       'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides',
       'Platelets', 'Prothrombin', 'Stage'], dtype=object)

In [5]:
columns_reordered = ['ID', 'N_Days', 'Drug', 'Age', 'Sex', 'Ascites',
       'Hepatomegaly', 'Spiders', 'Edema', 'Bilirubin', 'Cholesterol',
       'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides',
       'Platelets', 'Prothrombin', 'Stage',  'Status']

In [6]:
data = raw_data.copy()
data = data[columns_reordered]
data.head()

Unnamed: 0,ID,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,1,400,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0,D
1,2,4500,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0,C
2,3,1012,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0,D
3,4,1925,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0,D
4,5,1504,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0,CL


## Null value Handling

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             418 non-null    int64  
 1   N_Days         418 non-null    int64  
 2   Drug           312 non-null    object 
 3   Age            418 non-null    int64  
 4   Sex            418 non-null    object 
 5   Ascites        312 non-null    object 
 6   Hepatomegaly   312 non-null    object 
 7   Spiders        312 non-null    object 
 8   Edema          418 non-null    object 
 9   Bilirubin      418 non-null    float64
 10  Cholesterol    284 non-null    float64
 11  Albumin        418 non-null    float64
 12  Copper         310 non-null    float64
 13  Alk_Phos       312 non-null    float64
 14  SGOT           312 non-null    float64
 15  Tryglicerides  282 non-null    float64
 16  Platelets      407 non-null    float64
 17  Prothrombin    416 non-null    float64
 18  Stage     

There are 418 observations in the dataset.

In [8]:
data = data.drop('ID', axis=1)

Let's find the total number of null values per feature 

In [9]:
data.isnull().sum()

N_Days             0
Drug             106
Age                0
Sex                0
Ascites          106
Hepatomegaly     106
Spiders          106
Edema              0
Bilirubin          0
Cholesterol      134
Albumin            0
Copper           108
Alk_Phos         106
SGOT             106
Tryglicerides    136
Platelets         11
Prothrombin        2
Stage              6
Status             0
dtype: int64

Now, let's find these null values as a perteng of the total data.

In [10]:
(data.isnull().sum()/len(data))*100

N_Days            0.000000
Drug             25.358852
Age               0.000000
Sex               0.000000
Ascites          25.358852
Hepatomegaly     25.358852
Spiders          25.358852
Edema             0.000000
Bilirubin         0.000000
Cholesterol      32.057416
Albumin           0.000000
Copper           25.837321
Alk_Phos         25.358852
SGOT             25.358852
Tryglicerides    32.535885
Platelets         2.631579
Prothrombin       0.478469
Stage             1.435407
Status            0.000000
dtype: float64

### What are the different approaches to null value handling?

Approach 1: Drop the row that has missing values.

Approach 2: Drop the entire column if most of the values in the column has missing values.

Approach 3: Impute the missing data, that is, fill in the missing values with appropriate values.

Approach 4: Use an ML algorithm that handles missing values on its own, internally (ex: XGBoost).

#

In [11]:
# dropping rows with null values

updated_data = data.dropna(axis=0)

In [12]:
updated_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 276 entries, 0 to 311
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   N_Days         276 non-null    int64  
 1   Drug           276 non-null    object 
 2   Age            276 non-null    int64  
 3   Sex            276 non-null    object 
 4   Ascites        276 non-null    object 
 5   Hepatomegaly   276 non-null    object 
 6   Spiders        276 non-null    object 
 7   Edema          276 non-null    object 
 8   Bilirubin      276 non-null    float64
 9   Cholesterol    276 non-null    float64
 10  Albumin        276 non-null    float64
 11  Copper         276 non-null    float64
 12  Alk_Phos       276 non-null    float64
 13  SGOT           276 non-null    float64
 14  Tryglicerides  276 non-null    float64
 15  Platelets      276 non-null    float64
 16  Prothrombin    276 non-null    float64
 17  Stage          276 non-null    float64
 18  Status    

In [13]:
((418-276)/418)*100

33.97129186602871

We lose significant amount of data. Hence this is not a good method.

### Imputation

**Approach 1:** Impute with a constant number 

a) For numeric data:

-> Fill with 0 

-> Mean of entire column excluding the missing values

-> Median of entire column excluding the missing values

-> Mean or median of only those rows (of the variable) that has the same value of Y as the missing.

b) For categorical data:

-> Fill with a new label

-> Fill it with the most frequent data (mode)

**Approach 2:** Predict the missing value with machine learning using other non-missing columns as predictors.

In [14]:
# filling with 0

df_a1 = data.copy()

In [15]:
df_a1['Cholesterol'] = df_a1['Cholesterol'].fillna(0)
df_a1['Copper'] = df_a1['Copper'].fillna(0)
df_a1['Alk_Phos'] = df_a1['Alk_Phos'].fillna(0)
df_a1['SGOT'] = df_a1['SGOT'].fillna(0)
df_a1['Tryglicerides'] = df_a1['Tryglicerides'].fillna(0)
df_a1['Platelets'] = df_a1['Platelets'].fillna(0)
df_a1['Prothrombin'] = df_a1['Prothrombin'].fillna(0)

df_a1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   N_Days         418 non-null    int64  
 1   Drug           312 non-null    object 
 2   Age            418 non-null    int64  
 3   Sex            418 non-null    object 
 4   Ascites        312 non-null    object 
 5   Hepatomegaly   312 non-null    object 
 6   Spiders        312 non-null    object 
 7   Edema          418 non-null    object 
 8   Bilirubin      418 non-null    float64
 9   Cholesterol    418 non-null    float64
 10  Albumin        418 non-null    float64
 11  Copper         418 non-null    float64
 12  Alk_Phos       418 non-null    float64
 13  SGOT           418 non-null    float64
 14  Tryglicerides  418 non-null    float64
 15  Platelets      418 non-null    float64
 16  Prothrombin    418 non-null    float64
 17  Stage          412 non-null    float64
 18  Status    

Now, we have a dataframe called **df_a1** that has all the missing values in numerical features filled with 0. Let's fill the missing values in categorical variables.

In [16]:
df_a1_b1 = df_a1.copy()

In [17]:
# filling with new label

df_a1_b1['Drug'] = df_a1_b1['Drug'].fillna('Empty')
df_a1_b1['Ascites'] = df_a1_b1['Ascites'].fillna('Empty')
df_a1_b1['Hepatomegaly'] = df_a1_b1['Hepatomegaly'].fillna('Empty')
df_a1_b1['Spiders'] = df_a1_b1['Spiders'].fillna('Empty')
df_a1_b1['Stage'] = df_a1_b1['Stage'].fillna('Empty')

df_a1_b1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   N_Days         418 non-null    int64  
 1   Drug           418 non-null    object 
 2   Age            418 non-null    int64  
 3   Sex            418 non-null    object 
 4   Ascites        418 non-null    object 
 5   Hepatomegaly   418 non-null    object 
 6   Spiders        418 non-null    object 
 7   Edema          418 non-null    object 
 8   Bilirubin      418 non-null    float64
 9   Cholesterol    418 non-null    float64
 10  Albumin        418 non-null    float64
 11  Copper         418 non-null    float64
 12  Alk_Phos       418 non-null    float64
 13  SGOT           418 non-null    float64
 14  Tryglicerides  418 non-null    float64
 15  Platelets      418 non-null    float64
 16  Prothrombin    418 non-null    float64
 17  Stage          418 non-null    object 
 18  Status    

We have a dataframe called df_a1_b1 which has all the missing values in categorical features have been replaced by 'Empty', and in numerical features have been replaced by 0.

Replacing with mean for numerical features and replacing with the most frequent label for categorical features 

In [18]:
df_a2_b2 = data.copy()

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   N_Days         418 non-null    int64  
 1   Drug           312 non-null    object 
 2   Age            418 non-null    int64  
 3   Sex            418 non-null    object 
 4   Ascites        312 non-null    object 
 5   Hepatomegaly   312 non-null    object 
 6   Spiders        312 non-null    object 
 7   Edema          418 non-null    object 
 8   Bilirubin      418 non-null    float64
 9   Cholesterol    284 non-null    float64
 10  Albumin        418 non-null    float64
 11  Copper         310 non-null    float64
 12  Alk_Phos       312 non-null    float64
 13  SGOT           312 non-null    float64
 14  Tryglicerides  282 non-null    float64
 15  Platelets      407 non-null    float64
 16  Prothrombin    416 non-null    float64
 17  Stage          412 non-null    float64
 18  Status    

In [20]:
df_a2_b2['Cholesterol'] = df_a2_b2['Cholesterol'].fillna(data['Cholesterol'].mean())
df_a2_b2['Copper'] = df_a2_b2['Copper'].fillna(data['Copper'].mean())
df_a2_b2['Alk_Phos'] = df_a2_b2['Alk_Phos'].fillna(data['Alk_Phos'].mean())
df_a2_b2['SGOT'] = df_a2_b2['SGOT'].fillna(data['SGOT'].mean())
df_a2_b2['Tryglicerides'] = df_a2_b2['Tryglicerides'].fillna(data['Tryglicerides'].mean())
df_a2_b2['Platelets'] = df_a2_b2['Platelets'].fillna(data['Platelets'].mean())
df_a2_b2['Prothrombin'] = df_a2_b2['Prothrombin'].fillna(data['Prothrombin'].mean())

df_a2_b2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   N_Days         418 non-null    int64  
 1   Drug           312 non-null    object 
 2   Age            418 non-null    int64  
 3   Sex            418 non-null    object 
 4   Ascites        312 non-null    object 
 5   Hepatomegaly   312 non-null    object 
 6   Spiders        312 non-null    object 
 7   Edema          418 non-null    object 
 8   Bilirubin      418 non-null    float64
 9   Cholesterol    418 non-null    float64
 10  Albumin        418 non-null    float64
 11  Copper         418 non-null    float64
 12  Alk_Phos       418 non-null    float64
 13  SGOT           418 non-null    float64
 14  Tryglicerides  418 non-null    float64
 15  Platelets      418 non-null    float64
 16  Prothrombin    418 non-null    float64
 17  Stage          412 non-null    float64
 18  Status    

In [21]:
df_a2_b2['Drug'] = df_a2_b2.fillna(df_a2_b2['Drug'].value_counts().index[0]) 
df_a2_b2['Ascites'] = df_a2_b2.fillna(df_a2_b2['Ascites'].value_counts().index[0]) 
df_a2_b2['Hepatomegaly'] = df_a2_b2.fillna(df_a2_b2['Hepatomegaly'].value_counts().index[0]) 
df_a2_b2['Spiders'] = df_a2_b2.fillna(df_a2_b2['Spiders'].value_counts().index[0]) 
df_a2_b2['Stage'] = df_a2_b2.fillna(df_a2_b2['Stage'].value_counts().index[0]) 

In [22]:
df_a2_b2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   N_Days         418 non-null    int64  
 1   Drug           418 non-null    object 
 2   Age            418 non-null    int64  
 3   Sex            418 non-null    object 
 4   Ascites        418 non-null    object 
 5   Hepatomegaly   418 non-null    object 
 6   Spiders        418 non-null    object 
 7   Edema          418 non-null    object 
 8   Bilirubin      418 non-null    float64
 9   Cholesterol    418 non-null    float64
 10  Albumin        418 non-null    float64
 11  Copper         418 non-null    float64
 12  Alk_Phos       418 non-null    float64
 13  SGOT           418 non-null    float64
 14  Tryglicerides  418 non-null    float64
 15  Platelets      418 non-null    float64
 16  Prothrombin    418 non-null    float64
 17  Stage          418 non-null    object 
 18  Status    

# Outlier Handling

N_Days:

In [None]:
sns.boxplot(x=df_a2_b2['N_Days'])

Age:

In [None]:
sns.boxplot(x=df_a2_b2['Age'])

cholesterol:

In [None]:
sns.boxplot(x=df_a2_b2['Cholesterol'])

In [None]:
Q1 = df_a2_b2['Cholesterol'].quantile(0.25)
Q3 = df_a2_b2['Cholesterol'].quantile(0.75)
IQR = Q3 - Q1
whisker_width = 1.5
Cholesterol_outliers = df_a2_b2[(df_a2_b2['Cholesterol'] < Q1 - whisker_width*IQR) | (df_a2_b2['Cholesterol'] > Q3 + whisker_width*IQR)]
Cholesterol_outliers.head()

In [None]:
Cholesterol_mean = df_a2_b2['Cholesterol'].mean()
Cholesterol_std = df_a2_b2['Cholesterol'].std()
low= Cholesterol_mean -(3 * Cholesterol_std)
high= Cholesterol_mean + (3 * Cholesterol_std)
Cholesterol_outliers = df_a2_b2[(df_a2_b2['Cholesterol'] < low) | (df_a2_b2['Cholesterol'] > high)]
Cholesterol_outliers

In [None]:
df_a2_b2 = df_a2_b2.drop('Cholesterol', axis=1)

Albumin:

In [None]:
sns.boxplot(x=df_a2_b2['Albumin'])

In [None]:
Q1 = df_a2_b2['Albumin'].quantile(0.25)
Q3 = df_a2_b2['Albumin'].quantile(0.75)
IQR = Q3 - Q1
whisker_width = 1.5
Albumin_outliers = df_a2_b2[(df_a2_b2['Albumin'] < Q1 - whisker_width*IQR) | (df_a2_b2['Albumin'] > Q3 + whisker_width*IQR)]
Albumin_outliers.head()

In [None]:
Albumin_mean = df_a2_b2['Albumin'].mean()
Albumin_std = df_a2_b2['Albumin'].std()
low= Albumin_mean -(3 * Albumin_std)
high= Albumin_mean + (3 * Albumin_std)
Albumin_outliers = df_a2_b2[(df_a2_b2['Albumin'] < low) | (df_a2_b2['Albumin'] > high)]
Albumin_outliers

In [None]:
Q1 = df_a2_b2['Albumin'].quantile(0.25)
Q3 = df_a2_b2['Albumin'].quantile(0.75)
IQR = Q3 - Q1
whisker_width = 1.5
lower_whisker = Q1 -(whisker_width*IQR)
upper_whisker = Q3 + (whisker_width*IQR)
df_a2_b2['Albumin']=np.where(df_a2_b2['Albumin']>upper_whisker,upper_whisker,
                       np.where(df_a2_b2['Albumin']<lower_whisker,lower_whisker,
                                df_a2_b2['Albumin']))

In [None]:
sns.boxplot(x=df_a2_b2['Albumin'])

Copper:

In [None]:
sns.boxplot(x=df_a2_b2['Copper'])

In [None]:
Q1 = df_a2_b2['Copper'].quantile(0.25)
Q3 = df_a2_b2['Copper'].quantile(0.75)
IQR = Q3 - Q1
whisker_width = 1.5
Copper_outliers = df_a2_b2[(df_a2_b2['Copper'] < Q1 - whisker_width*IQR) | (df_a2_b2['Copper'] > Q3 + whisker_width*IQR)]
Copper_outliers.head()

In [None]:
Copper_mean = df_a2_b2['Copper'].mean()
Copper_std = df_a2_b2['Copper'].std()
low= Copper_mean -(3 * Copper_std)
high= Copper_mean + (3 * Copper_std)
Copper_outliers = df_a2_b2[(df_a2_b2['Copper'] < low) | (df_a2_b2['Copper'] > high)]
Copper_outliers

In [None]:
Q1 = df_a2_b2['Copper'].quantile(0.25)
Q3 = df_a2_b2['Copper'].quantile(0.75)
IQR = Q3 - Q1
whisker_width = 1.5
lower_whisker = Q1 -(whisker_width*IQR)
upper_whisker = Q3 + (whisker_width*IQR)
df_a2_b2['Copper']=np.where(df_a2_b2['Copper']>upper_whisker,upper_whisker,
                       np.where(df_a2_b2['Copper']<lower_whisker,lower_whisker,
                                df_a2_b2['Copper']))

In [None]:
sns.boxplot(x=df_a2_b2['Copper'])

Alk_Phos:

In [None]:
sns.boxplot(x=df_a2_b2['Alk_Phos'])

In [None]:
Q1 = df_a2_b2['Alk_Phos'].quantile(0.25)
Q3 = df_a2_b2['Alk_Phos'].quantile(0.75)
IQR = Q3 - Q1
whisker_width = 1.5
Alk_Phos_outliers = df_a2_b2[(df_a2_b2['Alk_Phos'] < Q1 - whisker_width*IQR) | (df_a2_b2['Alk_Phos'] > Q3 + whisker_width*IQR)]
Alk_Phos_outliers.head()

In [None]:
Alk_Phos_mean = df_a2_b2['Alk_Phos'].mean()
Alk_Phos_std = df_a2_b2['Alk_Phos'].std()
low= Alk_Phos_mean -(3 * Alk_Phos_std)
high= Alk_Phos_mean + (3 * Alk_Phos_std)
Alk_Phos_outliers = df_a2_b2[(df_a2_b2['Alk_Phos'] < low) | (df_a2_b2['Alk_Phos'] > high)]
Alk_Phos_outliers

In [None]:
df_a2_b2 = df_a2_b2.drop('Alk_Phos', axis=1)

SGOT:

In [None]:
sns.boxplot(x=df_a2_b2['SGOT'])

In [None]:
Q1 = df_a2_b2['SGOT'].quantile(0.25)
Q3 = df_a2_b2['SGOT'].quantile(0.75)
IQR = Q3 - Q1
whisker_width = 1.5
SGOT_outliers = df_a2_b2[(df_a2_b2['SGOT'] < Q1 - whisker_width*IQR) | (df_a2_b2['SGOT'] > Q3 + whisker_width*IQR)]
SGOT_outliers.head()

In [None]:
SGOT_mean = df_a2_b2['SGOT'].mean()
SGOT_std = df_a2_b2['SGOT'].std()
low= SGOT_mean -(3 * SGOT_std)
high= SGOT_mean + (3 * SGOT_std)
SGOT_outliers = df_a2_b2[(df_a2_b2['SGOT'] < low) | (df_a2_b2['SGOT'] > high)]
SGOT_outliers

In [None]:
Q1 = df_a2_b2['SGOT'].quantile(0.25)
Q3 = df_a2_b2['SGOT'].quantile(0.75)
IQR = Q3 - Q1
whisker_width = 1.5
lower_whisker = Q1 -(whisker_width*IQR)
upper_whisker = Q3 + (whisker_width*IQR)
df_a2_b2['SGOT']=np.where(df_a2_b2['SGOT']>upper_whisker,upper_whisker,
                       np.where(df_a2_b2['SGOT']<lower_whisker,lower_whisker,
                                df_a2_b2['SGOT']))

In [None]:
sns.boxplot(x=df_a2_b2['SGOT'])

Tryglicerides:

In [None]:
sns.boxplot(x=df_a2_b2['Tryglicerides'])

In [None]:
Q1 = df_a2_b2['Tryglicerides'].quantile(0.25)
Q3 = df_a2_b2['Tryglicerides'].quantile(0.75)
IQR = Q3 - Q1
whisker_width = 1.5
Tryglicerides_outliers = df_a2_b2[(df_a2_b2['Tryglicerides'] < Q1 - whisker_width*IQR) | (df_a2_b2['Tryglicerides'] > Q3 + whisker_width*IQR)]
Tryglicerides_outliers.head()

In [None]:
Tryglicerides_mean = df_a2_b2['Tryglicerides'].mean()
Tryglicerides_std = df_a2_b2['Tryglicerides'].std()
low= Tryglicerides_mean -(3 * Tryglicerides_std)
high= Tryglicerides_mean + (3 * Tryglicerides_std)
Tryglicerides_outliers = df_a2_b2[(df_a2_b2['Tryglicerides'] < low) | (df_a2_b2['Tryglicerides'] > high)]
Tryglicerides_outliers

In [None]:
Q1 = df_a2_b2['Tryglicerides'].quantile(0.25)
Q3 = df_a2_b2['Tryglicerides'].quantile(0.75)
IQR = Q3 - Q1
whisker_width = 1.5
lower_whisker = Q1 -(whisker_width*IQR)
upper_whisker = Q3 + (whisker_width*IQR)
df_a2_b2['Tryglicerides']=np.where(df_a2_b2['Tryglicerides']>upper_whisker,upper_whisker,
                       np.where(df_a2_b2['Tryglicerides']<lower_whisker,lower_whisker,
                                df_a2_b2['Tryglicerides']))

In [None]:
sns.boxplot(x=df_a2_b2['Tryglicerides'])

Platelets:

In [None]:
sns.boxplot(x=df_a2_b2['Platelets'])

In [None]:
Q1 = df_a2_b2['Platelets'].quantile(0.25)
Q3 = df_a2_b2['Platelets'].quantile(0.75)
IQR = Q3 - Q1
whisker_width = 1.5
Platelets_outliers = df_a2_b2[(df_a2_b2['Platelets'] < Q1 - whisker_width*IQR) | (df_a2_b2['Platelets'] > Q3 + whisker_width*IQR)]
Platelets_outliers.head()

In [None]:
Platelets_mean = df_a2_b2['Platelets'].mean()
Platelets_std = df_a2_b2['Platelets'].std()
low= Platelets_mean -(3 * Platelets_std)
high= Platelets_mean + (3 * Platelets_std)
Platelets_outliers = df_a2_b2[(df_a2_b2['Platelets'] < low) | (df_a2_b2['Platelets'] > high)]
Platelets_outliers

In [None]:
Q1 = df_a2_b2['Platelets'].quantile(0.25)
Q3 = df_a2_b2['Platelets'].quantile(0.75)
IQR = Q3 - Q1
whisker_width = 1.5
lower_whisker = Q1 -(whisker_width*IQR)
upper_whisker = Q3 + (whisker_width*IQR)
df_a2_b2['Platelets']=np.where(df_a2_b2['Platelets']>upper_whisker,upper_whisker,
                       np.where(df_a2_b2['Platelets']<lower_whisker,lower_whisker,
                                df_a2_b2['Platelets']))

In [None]:
sns.boxplot(x=df_a2_b2['Platelets'])

Prothrombin:

In [None]:
sns.boxplot(x=df_a2_b2['Prothrombin'])

In [None]:
Q1 = df_a2_b2['Prothrombin'].quantile(0.25)
Q3 = df_a2_b2['Prothrombin'].quantile(0.75)
IQR = Q3 - Q1
whisker_width = 1.5
Prothrombin_outliers = df_a2_b2[(df_a2_b2['Prothrombin'] < Q1 - whisker_width*IQR) | (df_a2_b2['Prothrombin'] > Q3 + whisker_width*IQR)]
Prothrombin_outliers.head()

In [None]:
Prothrombin_mean = df_a2_b2['Prothrombin'].mean()
Prothrombin_std = df_a2_b2['Prothrombin'].std()
low= Prothrombin_mean -(3 * Prothrombin_std)
high= Prothrombin_mean + (3 * Prothrombin_std)
Prothrombin_outliers = df_a2_b2[(df_a2_b2['Prothrombin'] < low) | (df_a2_b2['Prothrombin'] > high)]
Prothrombin_outliers

In [None]:
Q1 = df_a2_b2['Prothrombin'].quantile(0.25)
Q3 = df_a2_b2['Prothrombin'].quantile(0.75)
IQR = Q3 - Q1
whisker_width = 1.5
lower_whisker = Q1 -(whisker_width*IQR)
upper_whisker = Q3 + (whisker_width*IQR)
df_a2_b2['Prothrombin']=np.where(df_a2_b2['Prothrombin']>upper_whisker,upper_whisker,
                       np.where(df_a2_b2['Prothrombin']<lower_whisker,lower_whisker,
                                df_a2_b2['Prothrombin']))

In [None]:
sns.boxplot(x=df_a2_b2['Prothrombin'])

# Save the cleaned data as a csv file

In [None]:
df_a2_b2.to_csv('data_cleaned.csv', index=False)

In [None]:
df = pd.read_csv('data_cleaned.csv')

In [None]:
df.head()

In [None]:
df.info()