# Data Cleaning/Cleansing/Wrangling

## Data cleaning will cover the following
1. Checking the columns and changing mispelled to correct names
2. checking missing values
3. checking for duplicates
4. checking for outliers

In [66]:
#Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [67]:
print(sns.get_dataset_names(), end =" ")

['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic'] 

### 1. Loading the dataset and getting information

In [68]:
#Reading the file: load diabetes.csv
df = pd.read_csv("diabetes.csv")

In [69]:
# Checking top 5 rows
df.head()

Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,502,17975,F,50.0,4.7,46.0,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
1,735,34221,M,26.0,4.5,62.0,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N
2,420,47975,F,50.0,4.7,46.0,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
3,680,87656,F,50.0,4.7,46.0,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
4,504,34223,M,33.0,7.1,46.0,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N


In [70]:
#checking the last 5 columns
df.tail()

Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
1004,191,454316,M,55.0,,62.0,6.8,5.3,2.0,1.0,3.5,0.9,30.1,Y
1005,192,454316,M,55.0,4.8,88.0,,5.7,4.0,0.9,3.3,1.8,30.0,Y
1006,193,454316,M,62.0,6.3,82.0,6.7,5.3,2.0,1.0,3.5,,30.1,Y
1007,194,454316,F,57.0,4.1,70.0,9.3,5.3,3.3,1.0,1.4,1.3,29.0,Y
1008,195,4543,f,55.0,4.1,34.0,13.9,5.4,1.6,1.6,3.1,0.7,33.0,Y


In [71]:
#checking the dataframe shape
df.shape

(1009, 14)

In [72]:
#checking the dataset information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         1009 non-null   int64  
 1   No_Pation  1009 non-null   int64  
 2   Gender     1009 non-null   object 
 3   AGE        1008 non-null   float64
 4   Urea       1008 non-null   float64
 5   Cr         1007 non-null   float64
 6   HbA1c      1006 non-null   float64
 7   Chol       1007 non-null   float64
 8   TG         1007 non-null   float64
 9   HDL        1008 non-null   float64
 10  LDL        1007 non-null   float64
 11  VLDL       1008 non-null   float64
 12  BMI        1009 non-null   float64
 13  CLASS      1009 non-null   object 
dtypes: float64(10), int64(2), object(2)
memory usage: 110.5+ KB


In [73]:
#checking the info 2
# Excludes data columns
df.info(verbose = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Columns: 14 entries, ID to CLASS
dtypes: float64(10), int64(2), object(2)
memory usage: 110.5+ KB


In [74]:
#concise summary statistics
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,1009.0,339.161546,239.7382,1.0,127.0,296.0,548.0,800.0
No_Pation,1009.0,271744.776016,3365681.0,123.0,24065.0,34399.0,45390.0,75435657.0
AGE,1008.0,53.62004,8.740975,25.0,51.0,55.0,59.0,79.0
Urea,1008.0,5.131094,2.931136,0.5,3.7,4.6,5.7,38.9
Cr,1007.0,68.973188,59.8133,6.0,48.0,60.0,73.0,800.0
HbA1c,1006.0,8.284155,2.533576,0.9,6.5,8.0,10.2,16.0
Chol,1007.0,4.863873,1.297326,0.0,4.0,4.8,5.6,10.3
TG,1007.0,2.348769,1.397487,0.3,1.5,2.0,2.9,13.8
HDL,1008.0,1.204216,0.6581583,0.2,0.9,1.1,1.3,9.9
LDL,1007.0,2.610119,1.116095,0.3,1.8,2.5,3.3,9.9


In [75]:
#Describe categorical features
df.describe(include="O").T

Unnamed: 0,count,unique,top,freq
Gender,1009,3,M,570
CLASS,1009,5,Y,840


In [76]:
#Create a dataframe copy to be used in data cleaning using 
# copy() method
# deep = True: makes a complete copy of the DataFrame
df1 = df.copy(deep = True)
df1.sample(5)

Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
516,369,34411,M,57.0,8.9,120.0,6.3,4.2,3.9,1.4,1.3,1.5,29.0,Y
781,548,14389,F,60.0,4.1,56.0,13.1,4.4,2.0,1.0,2.5,0.9,29.0,Y
430,156,25368,F,60.0,4.5,39.0,8.7,5.2,2.1,0.8,3.6,0.9,27.0,Y
360,362,24092,F,56.0,2.3,36.0,10.2,4.8,1.0,1.4,2.9,0.4,30.0,Y
814,691,34560,F,56.0,3.6,48.0,15.0,7.7,4.7,5.0,2.1,2.0,28.0,Y


In [77]:
#check unique value for one column (Gender)
print(df1['Gender'].unique())
#  Counts the unique values in the column
print(df1['Gender'].value_counts())

['F' 'M' 'f']
M    570
F    437
f      2
Name: Gender, dtype: int64


In [78]:
# Replaces "f" with "F"
df1["Gender"] = df1["Gender"].replace('f',"F")
# Checks unique values in Gender Column
print(df1['Gender'].unique())
#  Counts the unique values in the column
print(df1['Gender'].value_counts())

['F' 'M']
M    570
F    439
Name: Gender, dtype: int64


In [79]:
# check for unique values in each column at once using for loop
for coln in df1:
    coln_value = df1[coln].unique()
    print(f"{coln}\n {coln_value}\n")

ID
 [502 735 420 680 504 634 721 421 670 759 636 788  82 132 402 566 596 676
 729 742  64  97 126 390 458 470 494 564 696 106 138 315 650 678 699 703
 708 719 740 741 746 794 219 255 450 626 664 704 799 210 237 318 620 640
 668 701 174 276 345 482 630 789 348   7   9  10  42  47  71  72  83  88
  89  96  98  99 102 134 137 139 141 143 144 145 147 149 150 153 154 157
 161 165 166 171 180 181 182 186 190 197 435 472  85 710 429 702   4 189
 201 285 393 468 492 496 498 684 700 716 366 399 490 500 672 690 718 264
 464 466 686 688  16 705 709  19 123 476 682 706 754  68 342 712 715  12
  20  41  60  79  81 133 135 146 152 178  18  24 675  39 474 648  48 656
  57 658  69 662 714  87 231 505 654  61 127 228 674 744  58 108 198 357
 588 666 698 753 110 113 119 122 452 652 660 673 162 168 580 694 790 140
 375 426 453 534 612 692 717 725 777  67 109 351 552 685 750  31 129 173
 176 179 183 185 195 316 646 187 188 191 194 200 203 411 520 622 758 206
 600  76  86 209 212 215 218 221 224 227 438 45

### 2. Data Cleaning


#### 2.1. Checking the colunms and dealing with mispelled columns

In [80]:
#check the colunms use columns attribute
df1.columns

Index(['ID', 'No_Pation', 'Gender', 'AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG',
       'HDL', 'LDL', 'VLDL', 'BMI', 'CLASS'],
      dtype='object')

In [81]:
#changing no_pation column, use rename method 
#df.rename(columns = {'old col name': 'new coln name'}, inplace = True)
#inplace = True: replaces the column_name permanently
df1.rename(columns = {"No_Pation": "Patient No"}, inplace = True)
df1.columns

Index(['ID', 'Patient No', 'Gender', 'AGE', 'Urea', 'Cr', 'HbA1c', 'Chol',
       'TG', 'HDL', 'LDL', 'VLDL', 'BMI', 'CLASS'],
      dtype='object')

In [82]:
#Change columns name to lower case for unifromity
# use .lower(): df.columns = df.columns.str.lower()
df1.columns = df1.columns.str.lower()
df1.columns

Index(['id', 'patient no', 'gender', 'age', 'urea', 'cr', 'hba1c', 'chol',
       'tg', 'hdl', 'ldl', 'vldl', 'bmi', 'class'],
      dtype='object')

In [83]:
#Removing the white space if any, 
#use .replace() e.g df.columns = df.columns.str.replace(' ', '')
df1.columns = df1.columns.str.replace(" ", "_")
df1.columns

Index(['id', 'patient_no', 'gender', 'age', 'urea', 'cr', 'hba1c', 'chol',
       'tg', 'hdl', 'ldl', 'vldl', 'bmi', 'class'],
      dtype='object')

In [84]:
#we can combine the 2 argument into one code
#df1.columns = df1.columns.str.lower().replace(' ', '_')
# Method chaining
df1.columns = df1.columns.str.lower().str.replace(" ", "_")
df1.columns

Index(['id', 'patient_no', 'gender', 'age', 'urea', 'cr', 'hba1c', 'chol',
       'tg', 'hdl', 'ldl', 'vldl', 'bmi', 'class'],
      dtype='object')

In [119]:
#Drop unnessary columns
#a. del df['col name'] # you can also used drop method: 
#Syntax: df.drop(columns = 'col_name', axis=1, inplace=True)

df1.drop(columns = "id",axis = 1, inplace = True)

KeyError: "['id'] not found in axis"

In [120]:
df1

Unnamed: 0,patient_no,gender,age,urea,cr,hba1c,chol,tg,hdl,ldl,vldl,bmi,class
0,17975,F,50.0,4.7,46.0,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
1,34221,M,26.0,4.5,62.0,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N
2,47975,F,50.0,4.7,46.0,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
3,87656,F,50.0,4.7,46.0,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
4,34223,M,33.0,7.1,46.0,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1004,454316,M,55.0,,62.0,6.8,5.3,2.0,1.0,3.5,0.9,30.1,Y
1005,454316,M,55.0,4.8,88.0,,5.7,4.0,0.9,3.3,1.8,30.0,Y
1006,454316,M,62.0,6.3,82.0,6.7,5.3,2.0,1.0,3.5,,30.1,Y
1007,454316,F,57.0,4.1,70.0,9.3,5.3,3.3,1.0,1.4,1.3,29.0,Y


### 2.2 Checking the missing values and dealing with them

In [86]:
#check the missing values, you can either use isna or isnull()


In [87]:
#a. imputing "hba1c " variable using the mean
#Get the mean  e.g mean_values = df.coln.mean()


#fill up the missing value, use fillna e.g df['col'].fillna(mean_values, inplace=True)


In [88]:
#b another way of doing the same thing as a bove e.g df['col'].fillna(df['col'].mean(), inplace=True)


In [89]:
#c Imputing using the median" replace mean with median


In [90]:
#d. how to impute using the mode
#coln_mode = df.colname.mode()[0]


# impute with the mode
#df['col'].fillna(col_mode, inplace=True) # use bfill for backword fill, and ffill for forward fill

Observations:

The mode of a dataset can have multiple values. When we call `mode()` on a Pandas dataframe, it returns a Series object containing all the modal values. To access the first modal value, we use the `[0]` index.

For example, if the mode of the `gender` column in the `df1` dataframe is both `'Female'` and `'Male'`, then `df1.gender.mode()` will return a Series object containing both values. To access the first modal value, which is `'Female'`, we use `df1.gender.mode()[0]`.

Using `[0]` ensures that we are only using the first modal value for imputation, rather than all the modal values. This is important because using multiple values for imputation can introduce bias into the data.

In [91]:
#e. Fill missing values with specific values e.g 0
#df['col'].fillna(0, inplace=True)

In [92]:
#f.Fill missing values in the categorical columns with specific values
#df['col'].fillna('unknown', inplace=True)


In [93]:
#dropping the missing values# we drop when we just have few missing values using dropna


In [94]:
#check the null values using isna()


In [95]:
#Replacing values in the class column
#df.groupby('class')['class'].count()

In [96]:
#check unique in class variable


In [97]:
#replace values in class, and gender coln df['coln'] = df['coln'].str.replace('N ','N')



In [98]:
#check unique again


In [99]:
#check count again  df.groupby('class')['class'].count()

In [100]:
#do the same for gender


In [101]:
#replace values in gender


### 2.3 Checking for duplicate


In [102]:
#check duplicates use duplcated()


In [103]:
#drop the duplicates


#checking on whether duplicates were dropped


### 2.4 Checking for outliers


In [104]:
#checking for outlier, use boxplot on cr coln


In [105]:
#Remove the outlier using the maximum quantile in cr column

#a. Get the max interquantile at 0.995


In [106]:
#check the values under outliers
#df[df["cr"] > max_cr]

In [107]:
#Remove the outlier by assigning the value to a new DataFrame
#dfnew = df[df["cr"] < max_cr]

In [108]:
#confirm removal of outlier

The outlier has been removed

In [109]:
#b. similary we can use iqr to remove the outliers

# Calculate the interquartile range (IQR)
#q1 = df['cr'].quantile(0.25)
#q3 = df['cr'].quantile(0.75)
#iqr = q3 - q1

# Define the lower and upper bounds for outliers
#lower_bound = q1 - 1.5 * iqr
#upper_bound = q3 + 1.5 * iqr

# Filter the data to remove outliers
#f2 = df1[(df['cr'] >= lower_bound) & (df['cr'] <= upper_bound)]

# Check the boxplot again
