# Data Preparation

In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
    df = pd.read_csv('diabetes_prediction_dataset.csv')

In [3]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [4]:
df.shape

(100000, 9)

In [5]:
df['gender'].nunique()

3

In [6]:
unique_values = df['gender'].unique()
print(unique_values)

['Female' 'Male' 'Other']


In [7]:
df['gender'].unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [8]:
df['smoking_history'].nunique()

6

In [9]:
df['smoking_history'].unique()

array(['never', 'No Info', 'current', 'former', 'ever', 'not current'],
      dtype=object)

In [10]:
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

Duplicated Values

In [11]:
df.duplicated().sum()

3854

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [13]:
df.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [14]:
clean_df = df.drop_duplicates()

In [15]:
clean_df.shape

(96146, 9)

In [19]:
clean_df.apply(lambda col: col.drop_duplicates().reset_index(drop=True))

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0.0,80.0,0.0,1.0,4.0,25.19,6.6,140.0,0.0
1,1.0,54.0,1.0,0.0,0.0,27.32,5.7,80.0,1.0
2,2.0,28.0,,,1.0,23.45,5.0,158.0,
3,,36.0,,,3.0,20.14,4.8,155.0,
4,,76.0,,,2.0,19.31,6.5,85.0,
...,...,...,...,...,...,...,...,...,...
4242,,,,,,52.97,,,
4243,,,,,,12.26,,,
4244,,,,,,59.42,,,
4245,,,,,,44.39,,,


In [18]:
# Conver the category variable to numerical using LabelEncoder
label_encoder = {}
for column in clean_df.select_dtypes(include=['object']).columns:
    label_encoder[column] = LabelEncoder()
    clean_df[column] = label_encoder[column].fit_transform(clean_df[column])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df[column] = label_encoder[column].fit_transform(clean_df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df[column] = label_encoder[column].fit_transform(clean_df[column])


In [None]:
clean_df.head()

In [None]:
df['diabetes'].nunique()

In [None]:
# Feature selection
features = df.columns[:-1] # Last column bahek sabailai select garchha [:-1] le
target = df.columns[-1] # Last ko column matra select garchha

In [None]:
# EDA
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Checking the distribution of the target variable
plt.figure(figsize=(6,4))
sns.countplot(x=target, data=df)
plt.title('Distribution of Target Variable')
plt.show()

In [None]:
# Distribution of numerical features
numerical_features = clean_df.select_dtypes(include=['int64', 'float64']).columns
clean_df[numerical_features].hist(figsize=(15, 10))
plt.suptitle("Histogram of Numerical Features")
plt.show()

In [None]:
# Pairplot to see relationships between features
sns.pairplot(df[numerical_features])
plt.suptitle('Pairplot of Numerical Features')
plt.show()

In [None]:
# Correlation matrix
plt.figure(figsize=(10, 8))
correlation_matrix = clean_df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Boxplot to check for ouliers
plt.figure(figsize=(15, 10))
for i, column in enumerate(numerical_features, 1):
    plt.subplot(3,3,i)
    sns.boxplot(y=df[column])
    plt.title(f'Boxplot of {column}')
plt.tight_layout()
plt.show()