**IMPORT MODULES AND LIBRARIES**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

**READ THE DATA**

In [2]:
data=pd.read_csv("/content/diabetes_prediction_dataset.csv")

In [3]:
data.head(5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


**SHAPE OF DATA**

In [4]:
data.shape

(100000, 9)

**CHECK FOR NULL VALUES**

In [5]:
data.isnull()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
99995,False,False,False,False,False,False,False,False,False
99996,False,False,False,False,False,False,False,False,False
99997,False,False,False,False,False,False,False,False,False
99998,False,False,False,False,False,False,False,False,False


In [6]:
data.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

**SUM OF TOTAL NULL VALUES**

In [7]:
data.isnull().sum().sum()

0

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [9]:
data.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


**CORRELATION OF THE DATA**

In [10]:
data.corr()

  data.corr()


Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
age,1.0,0.251171,0.233354,0.337396,0.101354,0.110672,0.258008
hypertension,0.251171,1.0,0.121262,0.147666,0.080939,0.084429,0.197823
heart_disease,0.233354,0.121262,1.0,0.061198,0.067589,0.070066,0.171727
bmi,0.337396,0.147666,0.061198,1.0,0.082997,0.091261,0.214357
HbA1c_level,0.101354,0.080939,0.067589,0.082997,1.0,0.166733,0.40066
blood_glucose_level,0.110672,0.084429,0.070066,0.091261,0.166733,1.0,0.419558
diabetes,0.258008,0.197823,0.171727,0.214357,0.40066,0.419558,1.0


**SEPARATE CATEGORICAL AND NUMERICAL FEATURES**

In [11]:
numerical_features= [i for i in data.columns if data[i].dtype!='O']
categorical_features= [i for i in data.columns if data[i].dtype=='O']

**NUMERICAL FEATURES**

In [12]:
numerical_features

['age',
 'hypertension',
 'heart_disease',
 'bmi',
 'HbA1c_level',
 'blood_glucose_level',
 'diabetes']

**CATEGORICAL FEATURES**

In [13]:
categorical_features

['gender', 'smoking_history']

In [14]:
data['gender'].value_counts()

Female    58552
Male      41430
Other        18
Name: gender, dtype: int64

In [15]:
data['smoking_history'].value_counts()

No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: smoking_history, dtype: int64

**IMPORT LABEL ENCODER TO CONVERT CATEGORICAL VALUES TO NUMERICAL**

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
l=LabelEncoder()

In [18]:
a=l.fit_transform(data['gender'])

In [19]:
b=l.fit_transform(data['smoking_history'])

**DROP THE EXISTING COLUMNS**

In [20]:
data.drop(labels='gender', axis=1,inplace=True)

In [21]:
data.drop(labels='smoking_history', axis=1,inplace=True)

**ADD NEW COLUMNS AS FEATURES**

In [22]:
data['gender']=a

In [23]:
data['smoking_history']=b

**CHECK FOR DUPLICATED VALUES**

In [24]:
data[data.duplicated()]

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender,smoking_history
2756,80.0,0,0,27.32,6.6,159,0,1,0
3272,80.0,0,0,27.32,3.5,80,0,0,0
3418,19.0,0,0,27.32,6.5,100,0,0,0
3939,78.0,1,0,27.32,3.5,130,0,0,3
3960,47.0,0,0,27.32,6.0,200,0,1,0
...,...,...,...,...,...,...,...,...,...
99980,52.0,0,0,27.32,6.1,145,0,0,4
99985,25.0,0,0,27.32,5.8,145,0,1,0
99989,26.0,0,0,27.32,5.0,158,0,0,0
99990,39.0,0,0,27.32,6.1,100,0,1,0


**DROP DUPLICATED VALUES TO AVOID OVERFITTING**

In [25]:
data=data.drop_duplicates()

In [26]:
data.shape

(96146, 9)

In [27]:
data.head(5)

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender,smoking_history
0,80.0,0,1,25.19,6.6,140,0,0,4
1,54.0,0,0,27.32,6.6,80,0,0,0
2,28.0,0,0,27.32,5.7,158,0,1,4
3,36.0,0,0,23.45,5.0,155,0,0,1
4,76.0,1,1,20.14,4.8,155,0,1,1


In [28]:
data.columns

Index(['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',
       'blood_glucose_level', 'diabetes', 'gender', 'smoking_history'],
      dtype='object')

**CREATE DATAFRAMES X AND Y**

In [29]:
X=data[['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',
       'blood_glucose_level','gender', 'smoking_history']]
Y=data['diabetes']

In [30]:
Y.shape

(96146,)

**CONVERT 1D ARRAY TO 2D ARRAY**

In [31]:
Y.values.reshape(-1,1)

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [32]:
from sklearn.model_selection import train_test_split

**SPLIT THE DATA INTO TRAIN AND TEST**

In [33]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.20,random_state=1)

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

**IMPORT RANDOMFORESTCLASSIFIER**

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
r = make_pipeline(StandardScaler(), RandomForestClassifier())

In [37]:
y=r.fit(X_train,Y_train)

In [38]:
Y_pred=r.predict(X_test)

**ACCURACY OF THE MODEL**

In [39]:
y.score(X_test,Y_test)

0.9660946437857514

**CONFUSION MATRIX**

In [42]:
print(confusion_matrix(Y_test, Y_pred))

[[17421    68]
 [  584  1157]]


**CLASSIFICATION REPORT**

In [40]:
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98     17489
           1       0.94      0.66      0.78      1741

    accuracy                           0.97     19230
   macro avg       0.96      0.83      0.88     19230
weighted avg       0.97      0.97      0.96     19230

