# imports

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Read data

In [None]:
# Load the raw dataset; the path points at the Kaggle input directory.
DATASET_PATH = "/kaggle/input/diabetes-prediction-dataset/diabetes_prediction_dataset.csv"
data = pd.read_csv(DATASET_PATH)

In [None]:
# Preview the first five rows (five is head()'s default).
data.head()

In [None]:
# Display the (rows, columns) dimensions of the raw frame.
data.shape

# Data Preprocessing

In [None]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

In [None]:
# Print pandas' summary of the raw frame (columns, non-null counts, dtypes).
data.info()

In [None]:
# List the column labels of the raw frame.
print(data.columns)

In [None]:
# One-hot encode the gender and smoking_history columns; all other columns pass through.
categorical_columns = ["gender", "smoking_history"]
data_adj = pd.get_dummies(data, columns=categorical_columns)

In [None]:
# Print pandas' summary of the encoded frame to confirm the new indicator columns.
data_adj.info()

# visualizations

In [None]:
# Pairwise correlations between every column of the encoded frame.
correlation_matrix = data_adj.corr()

plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='twilight',
)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Histogram of age (30 bins) with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
age_ax = sns.histplot(data_adj['age'], bins=30, kde=True, color='skyblue')
age_ax.set_title('Age Distribution')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Bar chart of how many rows fall into each hypertension value.
plt.figure(figsize=(10, 6))
hypertension_ax = sns.countplot(data=data_adj, x='hypertension', palette='pastel')
hypertension_ax.set_title('Hypertension Count')
hypertension_ax.set_xlabel('Hypertension')
hypertension_ax.set_ylabel('Count')
plt.show()

In [None]:
# Bar chart of how many rows fall into each heart_disease value.
plt.figure(figsize=(10, 6))
heart_ax = sns.countplot(data=data_adj, x='heart_disease', palette='pastel')
heart_ax.set_title('Heart Disease Count')
heart_ax.set_xlabel('Heart Disease')
heart_ax.set_ylabel('Count')
plt.show()

In [None]:
# Histogram of BMI (30 bins) with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
bmi_ax = sns.histplot(data_adj['bmi'], bins=30, kde=True, color='salmon')
bmi_ax.set_title('BMI Distribution')
bmi_ax.set_xlabel('BMI')
bmi_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Histogram of HbA1c level (30 bins) with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
hba1c_ax = sns.histplot(data_adj['HbA1c_level'], bins=30, kde=True, color='green')
hba1c_ax.set_title('HbA1c Level Distribution')
hba1c_ax.set_xlabel('HbA1c Level')
hba1c_ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Histogram of blood glucose level (30 bins) with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
glucose_ax = sns.histplot(data_adj['blood_glucose_level'], bins=30, kde=True, color='orange')
glucose_ax.set_title('Blood Glucose Level Distribution')
glucose_ax.set_xlabel('Blood Glucose Level')
glucose_ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Bar chart of the Female and Male row counts.
# Summing a one-hot indicator column gives the number of rows in that category.
gender_column = 'gender_Female'
gender_labels = ['Female', 'Male']
gender_counts = [data_adj[name].sum() for name in (gender_column, 'gender_Male')]

plt.figure(figsize=(10, 6))
gender_ax = sns.barplot(x=gender_labels, y=gender_counts, palette='pastel')
gender_ax.set_title('Gender Distribution')
gender_ax.set_xlabel('Gender')
gender_ax.set_ylabel('Count')
plt.show()

In [None]:
# Bar chart of row counts per smoking-history category.
# Maps each display label to the one-hot column that holds its indicator.
smoking_categories = {
    'No Info': 'smoking_history_No Info',
    'Current': 'smoking_history_current',
    'Ever': 'smoking_history_ever',
    'Former': 'smoking_history_former',
    'Never': 'smoking_history_never',
    'Not Current': 'smoking_history_not current',
}
smoking_labels = list(smoking_categories)
smoking_columns = list(smoking_categories.values())
# Summing an indicator column gives the number of rows in that category.
smoking_counts = [data_adj[indicator].sum() for indicator in smoking_columns]

plt.figure(figsize=(10, 6))
smoking_ax = sns.barplot(x=smoking_labels, y=smoking_counts, palette='pastel')
smoking_ax.set_title('Smoking History Distribution')
smoking_ax.set_xlabel('Smoking History')
smoking_ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Bar chart of how many rows fall into each value of the target column.
plt.figure(figsize=(10, 6))
diabetes_ax = sns.countplot(data=data_adj, x='diabetes', palette='pastel')
diabetes_ax.set_title('Diabetes Count')
diabetes_ax.set_xlabel('Diabetes')
diabetes_ax.set_ylabel('Count')
plt.show()

In [None]:
# Select relevant columns for the pairplot (numerical variables)
numerical_columns = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

# Add the target variable (diabetes) so points can be coloured by outcome
pairplot_data = data_adj[numerical_columns + ['diabetes']]

# Create pairplot: scatter plots off the diagonal, per-class KDE curves on it
sns.pairplot(pairplot_data, hue='diabetes', diag_kind='kde', palette='viridis')
# Lift the figure title just above the grid; at the default height it is drawn
# over the top row of subplots.
plt.suptitle('Pairplot of Numerical Variables with Diabetes', y=1.02)
plt.show()

In [None]:
# Draw a box plot for each numeric feature, shown one at a time.
numerical_columns = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
for feature in numerical_columns:
    sns.boxplot(data=data_adj, x=feature)
    plt.show()


# Split the data

In [None]:
# Separate the label column from the feature matrix.
TARGET_COLUMN = 'diabetes'
y = data_adj[TARGET_COLUMN]
X = data_adj.drop(columns=[TARGET_COLUMN])

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both splits so no test information leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# SVM Model

In [None]:
# Fit a linear-kernel support vector classifier on the training split.
svm = SVC(kernel='linear', C=1.0, random_state=42).fit(X_train, y_train)

# Score the held-out split.
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy followed by the per-class report.
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

# LogisticRegression Model

In [None]:

# Fit a logistic-regression classifier with scikit-learn's default settings.
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the testing set
predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
# Build the report from this model's own `predictions`, not from the `y_pred`
# variable left over from an earlier model cell, so the report and the
# accuracy above describe the same classifier.
print(classification_report(y_test, predictions))

# RandomForestClassifier

In [None]:
# Random forest of 100 trees; the fixed seed keeps the result repeatable.
forest_params = {"n_estimators": 100, "random_state": 42}
rf_classifier = RandomForestClassifier(**forest_params)
rf_classifier.fit(X_train, y_train)

# Score the held-out split.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy followed by the per-class report.
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

# XGBClassifier Model

In [None]:
# Gradient-boosted trees for the diabetes label.
# Use the binary logistic objective instead of a hard-coded 3-class softmax:
# the target is handled as a diabetes / no-diabetes label throughout this
# notebook (assumes `diabetes` only takes the values 0 and 1 — TODO confirm).
xgb_classifier = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

# Train the classifier
xgb_classifier.fit(X_train, y_train)

# Make predictions
y_pred = xgb_classifier.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))