# Reading the data into a dataframe using Pandas

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

from pandas import read_csv

diabetes = read_csv ('diabetes.csv')

diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [2]:
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [3]:
diabetes.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


# Splitting the data into training and testing data

In [4]:
from pandas import DataFrame

from sklearn.metrics import accuracy_score

report = DataFrame()

In [5]:
X = diabetes.drop(columns =['Outcome'])
Y = diabetes['Outcome']

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split (X, Y, test_size = 0.1, random_state = 768)

len (X_train), len (X_test), len (Y_train), len (Y_test)

(691, 77, 691, 77)

# Running algorithms on your dataset

## (1) Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=140)

model.fit(X_train, Y_train)

prediction = model.predict (X_test)

report['actual'] = Y_test
report['predicted'] = model.predict(X_test)

#report

print ("Logistic Regression", "accuracy :", round(accuracy_score (report['actual'], report['predicted']), 2))

Logistic Regression accuracy : 0.75


## (2) K Nearest Neighbors

In [7]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()

model.fit(X_train, Y_train)

prediction = model.predict (X_test)

report['actual'] = Y_test
report['predicted'] = model.predict(X_test)

#report

print ("K Nearest Neighbors", "accuracy :", round(accuracy_score (report['actual'], report['predicted']), 2))

K Nearest Neighbors accuracy : 0.71


## (3) Decision Tree

In [8]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()

model.fit(X_train, Y_train)

prediction = model.predict (X_test)

report['actual'] = Y_test
report['predicted'] = model.predict(X_test)

#report

print ("Decision Tree", "accuracy :", round(accuracy_score (report['actual'], report['predicted']), 2))

Decision Tree accuracy : 0.74


## (4) Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

model.fit(X_train, Y_train)

prediction = model.predict (X_test)

report['actual'] = Y_test
report['predicted'] = model.predict(X_test)

#report

print ("Random Forest", "accuracy :", round(accuracy_score (report['actual'], report['predicted']), 2))

Random Forest accuracy : 0.74


# Applying Feature Engineering

## minmax Scaler

In [10]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_scaled = scaler.fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split (X_scaled, Y, test_size = 0.1, random_state = 768)

len (X_train), len (X_test), len (Y_train), len (Y_test)

(691, 77, 691, 77)

### (1) Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=140)

model.fit(X_train, Y_train)

prediction = model.predict (X_test)

report['actual'] = Y_test
report['predicted'] = model.predict(X_test)

#report

print ("Logistic Regression", "accuracy :", round(accuracy_score (report['actual'], report['predicted']), 2))

Logistic Regression accuracy : 0.75


### (2) K Nearest Neighbors

In [12]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()

model.fit(X_train, Y_train)

prediction = model.predict (X_test)

report['actual'] = Y_test
report['predicted'] = model.predict(X_test)

#report

print ("K Nearest Neighbors", "accuracy after using minmax scaler:", round(accuracy_score (report['actual'], report['predicted']), 2))

K Nearest Neighbors accuracy after using minmax scaler: 0.77


### (3) Decision Tree

In [13]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()

model.fit(X_train, Y_train)

prediction = model.predict (X_test)

report['actual'] = Y_test
report['predicted'] = model.predict(X_test)

#report

print ("Decision Tree", "accuracy after using minmax scaler:", round(accuracy_score (report['actual'], report['predicted']), 2))

Decision Tree accuracy after using minmax scaler: 0.71


### (4) Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

model.fit(X_train, Y_train)

prediction = model.predict (X_test)

report['actual'] = Y_test
report['predicted'] = model.predict(X_test)

#report

print ("Random Forest", "accuracy after using minmax scaler:", round(accuracy_score (report['actual'], report['predicted']), 2))

Random Forest accuracy after using minmax scaler: 0.78


## Standard Scaler

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split (X_scaled, Y, test_size = 0.1, random_state = 768)

len (X_train), len (X_test), len (Y_train), len (Y_test)

(691, 77, 691, 77)

### (1) Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=140)

model.fit(X_train, Y_train)

prediction = model.predict (X_test)

report['actual'] = Y_test
report['predicted'] = model.predict(X_test)

#report

print ("Logistic Regression", "accuracy after using standard scaler:", round(accuracy_score (report['actual'], report['predicted']), 2))

Logistic Regression accuracy after using standard scaler: 0.75


### (2) K Nearest Neighbors

In [17]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()

model.fit(X_train, Y_train)

prediction = model.predict (X_test)

report['actual'] = Y_test
report['predicted'] = model.predict(X_test)

#report

print ("K Nearest Neighbors", "accuracy after using standard scaler:", round(accuracy_score (report['actual'], report['predicted']), 2))

K Nearest Neighbors accuracy after using standard scaler: 0.77


### (3) Decision Tree

In [18]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()

model.fit(X_train, Y_train)

prediction = model.predict (X_test)

report['actual'] = Y_test
report['predicted'] = model.predict(X_test)

#report

print ("Decision Tree", "accuracy after using standard scaler:", round(accuracy_score (report['actual'], report['predicted']), 2))

Decision Tree accuracy after using standard scaler: 0.69


### (4) Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

model.fit(X_train, Y_train)

prediction = model.predict (X_test)

report['actual'] = Y_test
report['predicted'] = model.predict(X_test)

#report

print ("Random Forest", "accuracy after using standard scaler:", round(accuracy_score (report['actual'], report['predicted']), 2))

Random Forest accuracy after using standard scaler: 0.75


# Principle Component Analysis

In [20]:
covarins = diabetes.cov()

covarins.sort_values(by='Outcome', ascending=False)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Insulin,-28.555231,1220.935799,198.378412,802.979941,13281.180078,179.775172,7.066681,-57.14329,7.175671
Glucose,13.947131,1022.248314,94.430956,29.239183,1220.935799,55.726987,1.454875,99.082805,7.115079
Age,21.57062,99.082805,54.523453,-21.381023,-57.14329,3.36033,0.130772,138.303046,1.336953
BMI,0.469774,55.726987,43.004695,49.373869,179.775172,62.159984,0.367405,3.36033,1.100638
BloodPressure,9.214538,94.430956,374.647271,64.029396,198.378412,43.004695,0.264638,54.523453,0.600697
SkinThickness,-4.390041,29.239183,64.029396,254.473245,802.979941,49.373869,0.972136,-21.381023,0.568747
Pregnancies,11.354056,13.947131,9.214538,-4.390041,-28.555231,0.469774,-0.037426,21.57062,0.356618
Outcome,0.356618,7.115079,0.600697,0.568747,7.175671,1.100638,0.027472,1.336953,0.227483
DiabetesPedigreeFunction,-0.037426,1.454875,0.264638,0.972136,7.066681,0.367405,0.109779,0.130772,0.027472


### As we can see, from all the above listed columns, Glucose and Insulin contribute towards the most variance. No other variable even comes close to these two.

### Therefore, We will be using only these two columns for further prediction

In [21]:
diabetes_clean = diabetes[['Insulin', 'Glucose', 'Outcome']]

diabetes_clean

Unnamed: 0,Insulin,Glucose,Outcome
0,0,148,1
1,0,85,0
2,0,183,1
3,94,89,0
4,168,137,1
...,...,...,...
763,180,101,0
764,0,122,0
765,112,121,0
766,0,126,1


In [22]:
# Using IterativeImputer for imputation

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer()

imputer.fit (diabetes_clean)

diabetes_clean['Glucose'].replace(0, None, inplace=True)
diabetes_clean['Insulin'].replace(0, None, inplace=True)

diabetes_clean = DataFrame(imputer.transform (diabetes_clean), columns = imputer.get_feature_names_out())

diabetes_clean

Unnamed: 0,Insulin,Glucose,Outcome
0,111.777741,148.0,1.0
1,37.395513,85.0,0.0
2,153.162493,183.0,1.0
3,94.000000,89.0,0.0
4,168.000000,137.0,1.0
...,...,...,...
763,180.000000,101.0,0.0
764,81.145108,122.0,0.0
765,112.000000,121.0,0.0
766,85.764469,126.0,1.0


In [23]:
# We will be using IsolationForest Algorithm for Outlier detection on the entire dataset

from sklearn.ensemble import IsolationForest

If = IsolationForest()

diabetes_clean['anomaly_score'] = If.fit_predict(diabetes_clean)

diabetes_clean

Unnamed: 0,Insulin,Glucose,Outcome,anomaly_score
0,111.777741,148.0,1.0,1
1,37.395513,85.0,0.0,1
2,153.162493,183.0,1.0,1
3,94.000000,89.0,0.0,1
4,168.000000,137.0,1.0,1
...,...,...,...,...
763,180.000000,101.0,0.0,1
764,81.145108,122.0,0.0,1
765,112.000000,121.0,0.0,1
766,85.764469,126.0,1.0,1


In [24]:
# Checking the total number of anomalies

diabetes_clean['anomaly_score'].value_counts()

anomaly_score
 1    600
-1    168
Name: count, dtype: int64

In [25]:
# Finding percentage of contamination

175/768

0.22786458333333334

#### As calculated above, the data contains roughly 23% contamination

In [26]:
# As per the submission in the previous component, we will proceed with the Trimming approach on the data

diabetes_final = diabetes_clean[diabetes_clean['anomaly_score'] == 1]

diabetes_final.drop(columns = ['anomaly_score'], inplace=True)

diabetes_final

Unnamed: 0,Insulin,Glucose,Outcome
0,111.777741,148.0,1.0
1,37.395513,85.0,0.0
2,153.162493,183.0,1.0
3,94.000000,89.0,0.0
4,168.000000,137.0,1.0
...,...,...,...
763,180.000000,101.0,0.0
764,81.145108,122.0,0.0
765,112.000000,121.0,0.0
766,85.764469,126.0,1.0


In [27]:
from sklearn.model_selection import train_test_split

X = diabetes_final.drop(columns = ['Outcome'])
Y = diabetes_final['Outcome']

X_train, X_test, Y_train, Y_test = train_test_split (X, Y, test_size = 0.1, random_state = 768)

len (X_train), len (X_test), len (Y_train), len (Y_test)

(540, 60, 540, 60)

## Re-checking the accuracy

## (1) Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=140)

model.fit(X_train, Y_train)

prediction = model.predict (X_test)

report = X_test.copy()
report['actual'] = Y_test
report['predicted'] = model.predict(X_test)

#report

print ("Logistic Regression", "accuracy :", round(accuracy_score (report['actual'], report['predicted']), 2))

Logistic Regression accuracy : 0.8


## (2) K Nearest Neighbors

In [29]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()

model.fit(X_train, Y_train)

prediction = model.predict (X_test)

report = X_test.copy()
report['actual'] = Y_test
report['predicted'] = model.predict(X_test)

#report

print ("K Nearest Neighbors", "accuracy :", round(accuracy_score (report['actual'], report['predicted']), 2))

K Nearest Neighbors accuracy : 0.7


## (3) Decision Tree

In [30]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()

model.fit(X_train, Y_train)

prediction = model.predict (X_test)

report = X_test.copy()
report['actual'] = Y_test
report['predicted'] = model.predict(X_test)

#report

print ("Decision Tree", "accuracy :", round(accuracy_score (report['actual'], report['predicted']), 2))

Decision Tree accuracy : 0.88


## (4) Random Forest

In [31]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

model.fit(X_train, Y_train)

prediction = model.predict (X_test)

report = X_test.copy()
report['actual'] = Y_test
report['predicted'] = model.predict(X_test)

#report

print ("Random Forest", "accuracy :", round(accuracy_score (report['actual'], report['predicted']), 2))

Random Forest accuracy : 0.88


### As we can see that after Outlier treatment, the accuracy score of K Nearest Neighbors has slightly reduced (from 71% to 68%) while the accuracy scores of Logistic Regression, Decision Tree and Random Forest and improved significantly.


### After Outlier Treatment, Random Forest is showing the highest accuracy (~83%)