# B2. DATA PREPARATION AND REMOVAL OF OUTLIERS

__2.1 Importing Libraries and Dataset__

__2.2 Removing Outliers__
    
__2.3 Scaling the Datasets__

__2.4 Saving the Datasets__

## 2.1 Importing Libraries and Dataset

In [1]:
# Libraries

import pandas as pd
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
import warnings

# Notebook-wide settings.
# FutureWarning is silenced explicitly, then every remaining warning category
# is silenced as well, so that library messages do not clutter the outputs.
warnings.simplefilter('ignore', FutureWarning)
warnings.filterwarnings(action='ignore')

# Render up to 100 columns when a DataFrame is displayed.
pd.options.display.max_columns = 100

In [2]:
# Load the dataset; `data` keeps the frame exactly as read from disk and
# `df` is the independent working copy used in the rest of the notebook.

data = pd.read_csv("Blood_Glucose_Dataset.csv")
df = data.copy(deep=True)
df.head()  # preview of the first 5 rows

Unnamed: 0,Post Operative Glucose,Sex,Age,BMI,Smoking,Pre Operative Fast,Cort,Pre Operative Glucose,LV,ASA
0,114,1,64,22.49,0.0,13.92,0,111,2,3
1,140,0,25,21.97,0.0,12.0,1,85,2,2
2,199,0,53,18.9,0.0,15.0,1,106,1,2
3,127,1,76,23.94,100.0,11.5,0,84,1,3
4,120,0,47,20.2,40.0,15.25,1,94,1,3


In [3]:
# Reorder all rows at random (frac=1.0 keeps every row); the fixed seed makes
# the resulting order reproducible.
df = df.sample(random_state=126, frac=1.0)

## 2.2 Removing Outliers

In [4]:
# Local Outlier Factor detector.
# n_neighbors=20  -> neighbourhood size used for each entry
# contamination=0.02 -> 2% of the entries are treated as outliers

lof_model = LocalOutlierFactor(
    contamination=0.02,
    n_neighbors=20,
)

In [5]:
# Fitting the LOF on all columns of df (the target column has not been dropped yet)
# NOTE(review): the next cell calls fit_predict(df) on the same data, which
# fits the model again, so this separate fit looks redundant. Confirm before removing.
lof_model.fit(df)

In [6]:
# Predict which of the entries are outliers
# The result holds one label per row of df: -1 marks an outlier, 1 marks an inlier.
outlier_labels = lof_model.fit_predict(df)

In [7]:
# Show the header line followed by the full array of LOF labels
print("Outlier Labels:", outlier_labels, sep="\n")

Outlier Labels:
[ 1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1  1
  1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1]


In [8]:
# How many outliers there are in the dataset
# Outliers carry the label -1, so the count is the number of entries equal to
# -1. A single vectorized comparison replaces the manual index loop, and int()
# yields a plain Python integer even when no outlier is present.
How_many_outliers = int((outlier_labels == -1).sum())
How_many_outliers

5

In [9]:
# Shape of the dataset before outlier removal, as (rows, columns)
df.shape

(235, 10)

In [10]:
# Removing Outliers
# Boolean mask over the rows of df: True wherever the LOF label is not -1,
# i.e. wherever the row was not flagged as an outlier.
inlier_mask = outlier_labels != -1
df_No_Outliers = df.loc[inlier_mask]

In [11]:
# Shape of the dataset after outlier removal, as (rows, columns)
df_No_Outliers.shape

(230, 10)

We have now created a second dataset, `df_No_Outliers`, which no longer contains the rows that the LOF flagged as outliers.

## 2.3 Scaling the Datasets

In [12]:
# Full dataset: separate the target (Y) from the predictors (X)
target_col = 'Post Operative Glucose'
Y_df = df[target_col]
X_df = df.drop(columns=[target_col])
X_df.head(1)

Unnamed: 0,Sex,Age,BMI,Smoking,Pre Operative Fast,Cort,Pre Operative Glucose,LV,ASA
101,1,49,22.45,47.0,11.33,1,102,1,2


In [13]:
# Dataset without outliers: separate the target (Y) from the predictors (X)
target_col = 'Post Operative Glucose'
Y_df_No_Outliers = df_No_Outliers[target_col]
X_df_No_Outliers = df_No_Outliers.drop(columns=[target_col])
X_df_No_Outliers.head(1)

Unnamed: 0,Sex,Age,BMI,Smoking,Pre Operative Fast,Cort,Pre Operative Glucose,LV,ASA
101,1,49,22.45,47.0,11.33,1,102,1,2


In [14]:
# Splitting each predictor frame into a categorical part (the four coded
# columns listed below) and a numeric part (every remaining column).
categorical_cols = ['Sex', 'Cort', 'LV', 'ASA']

X_df_cat = X_df[categorical_cols]
X_df_num = X_df.drop(columns=categorical_cols)

# The name X_df_No_Outlier_cat (singular "Outlier") is kept as is because
# later cells refer to it.
X_df_No_Outlier_cat = X_df_No_Outliers[categorical_cols]
X_df_No_Outliers_num = X_df_No_Outliers.drop(columns=categorical_cols)

In [15]:
# Scaling the numeric predictors to unit standard deviation
#
# Each column is divided by its own population standard deviation (ddof=0).
# The mean is NOT subtracted, so the values are rescaled but not centred.
# The statistics are computed over all rows of each dataset, i.e. before the
# Train/Test split made further down.
#
# DataFrame.std is called with an explicit axis and ddof instead of np.std(df):
# np.std forwards axis=None to pandas, a form that recent pandas versions
# deprecate and plan to change into a single scalar over the whole frame.
# Dividing a DataFrame by the resulting Series aligns on the column labels.

X_df_num_stand = X_df_num / X_df_num.std(axis=0, ddof=0)
X_df_num_stand_No_Outliers = X_df_No_Outliers_num / X_df_No_Outliers_num.std(axis=0, ddof=0)

In [16]:
# Re-assembling the predictors: scaled numeric columns (rounded to 3 decimals)
# first, followed by the untouched categorical columns.

scaled_num = X_df_num_stand.round(3)
X_df_stand = pd.concat([scaled_num, X_df_cat], axis='columns')

scaled_num_no_outliers = X_df_num_stand_No_Outliers.round(3)
X_df_stand_No_Outliers = pd.concat(
    [scaled_num_no_outliers, X_df_No_Outlier_cat],
    axis='columns',
)

In [17]:
# Displaying the first rows of the scaled predictors of the original dataset
X_df_stand.head()

Unnamed: 0,Age,BMI,Smoking,Pre Operative Fast,Pre Operative Glucose,Sex,Cort,LV,ASA
101,3.346,4.337,1.549,5.791,7.307,1,1,1,2
220,4.302,5.958,1.647,7.283,7.952,1,0,1,3
225,4.097,6.105,0.675,5.75,6.949,0,0,1,2
207,3.892,5.657,0.0,6.731,6.376,1,0,1,3
188,3.687,5.0,0.0,5.791,6.519,1,0,1,2


In [18]:
# Displaying the first rows of the scaled predictors of the dataset without outliers
X_df_stand_No_Outliers.head()

Unnamed: 0,Age,BMI,Smoking,Pre Operative Fast,Pre Operative Glucose,Sex,Cort,LV,ASA
101,3.321,4.427,1.574,5.794,8.838,1,1,1,2
220,4.27,6.081,1.674,7.288,9.618,1,0,1,3
225,4.067,6.231,0.686,5.753,8.405,0,0,1,2
207,3.863,5.773,0.0,6.735,7.711,1,0,1,3
188,3.66,5.103,0.0,5.794,7.885,1,0,1,2


In [19]:
# Full dataset: append the target as the last column of the scaled predictors
frames = [X_df_stand, Y_df]
df_stand = pd.concat(frames, axis='columns')
df_stand.head(1)

Unnamed: 0,Age,BMI,Smoking,Pre Operative Fast,Pre Operative Glucose,Sex,Cort,LV,ASA,Post Operative Glucose
101,3.346,4.337,1.549,5.791,7.307,1,1,1,2,129


In [20]:
# Dataset without outliers: append the target as the last column of the scaled predictors
frames_no_outliers = [X_df_stand_No_Outliers, Y_df_No_Outliers]
df_stand_No_Outliers = pd.concat(frames_no_outliers, axis='columns')
df_stand_No_Outliers.head(1)

Unnamed: 0,Age,BMI,Smoking,Pre Operative Fast,Pre Operative Glucose,Sex,Cort,LV,ASA,Post Operative Glucose
101,3.321,4.427,1.574,5.794,8.838,1,1,1,2,129


In [21]:
# Checking the shape, as (rows, columns), of the scaled original dataset
df_stand.shape

(235, 10)

In [22]:
# Checking the shape, as (rows, columns), of the scaled dataset without outliers
df_stand_No_Outliers.shape

(230, 10)

## 2.4 Saving the Datasets

In [23]:
# Train/Test split of the scaled original dataset: the first 200 rows by
# position (in the shuffled order) form the Train set, the rest the Test set.

n_train_rows = 200
Train_df = df_stand.iloc[:n_train_rows]
Test_df = df_stand.iloc[n_train_rows:]

# index=False: the row labels are not written to the CSV files
Train_df.to_csv('BG_Train_Dataset.csv', index=False)
Test_df.to_csv('BG_Test_Dataset.csv', index=False)

In [24]:
# Dividing the dataset without outliers into Train and Test so that the split
# mirrors the one of the original dataset as closely as possible.
#
# A positional cut at row 200 does not achieve that: every outlier removed from
# the first 200 rows shifts the following rows up, so rows belonging to the
# original Test set would slide into this Train set. Each remaining row is
# therefore assigned to the side it has in the original split, identified by
# its index label. As a result this Train set has fewer than 200 rows whenever
# outliers were removed from the original Train rows.

original_train_index = df_stand.index[:200]
in_original_train = df_stand_No_Outliers.index.isin(original_train_index)

Train_df = df_stand_No_Outliers[in_original_train]
Test_df = df_stand_No_Outliers[~in_original_train]

# index=False: the row labels are not written to the CSV files
Train_df.to_csv(r'BG_Train_Dataset_No_Outliers.csv', index=False)
Test_df.to_csv(r'BG_Test_Dataset_No_Outliers.csv', index=False)