<a href="https://www.kaggle.com/code/khaledyasser4/end-to-end-ml?scriptVersionId=243697103" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<a href="https://www.kaggle.com/code/khaledyasser4/end-to-end-ml?scriptVersionId=243697103" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _dirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from scipy import stats
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")



# Read Dataset

In [None]:
# Load the diabetes CSV from the Kaggle input directory into a DataFrame.
DATA_PATH = '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
data = pd.read_csv(DATA_PATH)

# Preview the first rows as the cell output.
data.head()


In [None]:
# Print per-column dtypes and non-null counts, then show the (rows, columns)
# tuple as the cell output.
data.info()
data.shape

In [None]:
# Descriptive statistics for the dataset's columns, shown as the cell output.
data.describe()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(10, 8))
correlations = data.corr()
sns.heatmap(correlations, annot=True, cmap='Blues', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Pairwise scatter/density plots for a handful of features, coloured by Outcome.
key_features = ['Glucose', 'BMI', 'Age', 'Insulin']
sns.pairplot(data, vars=key_features, hue='Outcome', palette='Set2')
# The pairplot figure is the current figure, so the super-title lands on it;
# y=1.02 places it slightly above the top row of panels.
plt.suptitle('Pair Plot of Key Features by Outcome', y=1.02)
plt.show()

# Outliers

In [None]:
# Box plot of every feature, split by Outcome, to eyeball outliers.
fig, ax = plt.subplots(figsize=(12, 8))
# Long format: one row per (Outcome, Feature, value) so seaborn can group by feature.
data_melt = pd.melt(data, id_vars='Outcome', var_name='Feature')
sns.boxplot(data=data_melt, x='Feature', y='value', hue='Outcome', palette='Set3', ax=ax)
ax.set_title('Box Plot of Features by Outcome')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Treat outliers

In [None]:
# Clip each feature to its IQR fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
#
# In these columns a 0 is a placeholder for a missing measurement (the
# missing-values step later replaces 0 with NaN in exactly these columns).
# The placeholders must not take part in outlier handling: they would drag
# the quartiles down, and they would be raised to the lower fence, after
# which the missing-values step can no longer find and impute them.
zero_placeholder_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
outlier_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

for col in outlier_cols:
    if col in zero_placeholder_cols:
        is_placeholder = data[col].eq(0)
    else:
        is_placeholder = pd.Series(False, index=data.index)

    # Fences are computed from genuinely observed values only.
    observed = data.loc[~is_placeholder, col]
    Q1 = observed.quantile(0.25)
    Q3 = observed.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    clipped = data[col].clip(lower=lower_bound, upper=upper_bound)
    # Put the 0 placeholders back untouched so they can still be imputed later.
    data[col] = clipped.mask(is_placeholder, 0)

# Re-draw the box plot to show the effect of the clipping.
plt.figure(figsize=(12, 8))
data_melt = data.melt(id_vars='Outcome', var_name='Feature')
sns.boxplot(x='Feature', y='value', hue='Outcome', data=data_melt, palette='Set3')
plt.title('Box Plot of Features by Outcome (After Outlier Handling)')
plt.xticks(rotation=45)
plt.show()

# Missing values

In [None]:
# In these columns a 0 stands for a missing measurement: turn it into NaN and
# fill it with the mean of the remaining values in that column.
for col in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    zeros_as_nan = data[col].replace(0, np.nan)
    # Assign the result back to the column. `data[col].fillna(..., inplace=True)`
    # operates on an intermediate Series; under pandas Copy-on-Write it never
    # updates `data`, which would leave the NaNs in place.
    data[col] = zeros_as_nan.fillna(zeros_as_nan.mean())

# NOTE(review): the fill value is computed over the full dataset, i.e. before
# the train/test split, so test rows influence it. Consider imputing after the
# split with statistics taken from the training rows only.

# Data split

In [None]:
# Features are every column except the target; hold out 20% of rows for testing.
X = data.drop(columns='Outcome')
y = data['Outcome']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the scaling parameters from the training split only, then apply the
# same transform to both splits (both become NumPy arrays).
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


# Models

In [None]:
# Candidate classifiers keyed by display name. Insertion order is the order in
# which models are fitted and shown in the later tables and plots.
models = dict([
    ('Logistic Regression', LogisticRegression()),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    # probability=True makes predict_proba available on the SVM.
    ('SVM', SVC(probability=True)),
    ('KNN', KNeighborsClassifier()),
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('AdaBoost', AdaBoostClassifier(random_state=42)),
    ('Naive Bayes', GaussianNB()),
    ('Neural Network', MLPClassifier(max_iter=1000, random_state=42)),
])

In [None]:
# Fit every candidate on the training split and record its test-set
# predictions, accuracy and per-class classification report.
model_results = {}

for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    model_results[name] = {
        'y_pred': y_pred,
        'accuracy': accuracy_score(y_test, y_pred),
        'report': classification_report(y_test, y_pred, output_dict=True),
    }

In [None]:
# Column-oriented summary of each model's accuracy plus the precision, recall
# and F1 reported for class '1'.
positive_reports = [entry['report']['1'] for entry in model_results.values()]
results = {
    'Model': list(model_results),
    'Accuracy': [entry['accuracy'] for entry in model_results.values()],
    'Precision': [rep['precision'] for rep in positive_reports],
    'Recall': [rep['recall'] for rep in positive_reports],
    'F1-Score': [rep['f1-score'] for rep in positive_reports],
}
    

In [None]:
# Plain-text accuracy listing, one line per model, rounded to two decimals.
print("\n(Accuracy):")
for label, entry in model_results.items():
    print(f"{label}: Accuracy = {entry['accuracy']:.2f}")

In [None]:
# Horizontal bar chart comparing test accuracy across models.
results_df = pd.DataFrame.from_dict(results)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=results_df, x='Accuracy', y='Model', palette='viridis', ax=ax)
ax.set_title('Model Accuracy Comparison')
ax.set_xlabel('Accuracy')
ax.set_ylabel('Model')
plt.show()

In [None]:
# ROC curves for every fitted model that can output class probabilities.
#
# The curve is built from the predicted probability of the positive class.
# Feeding hard 0/1 predictions to roc_curve gives a single operating point and
# a misleading AUC. Iterating over `models` explicitly also avoids depending on
# whichever `model` / `name` / `y_pred` happened to be left over from an
# earlier loop.
plt.figure(figsize=(8, 5))
for name, model in models.items():
    if not hasattr(model, "predict_proba"):
        continue  # no probability scores available for this estimator
    y_prob = model.predict_proba(X_test)[:, 1]  # column 1 = positive class
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')

# Diagonal reference line: the performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves by Model')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
# Baseline (pre-tuning) metrics for the comparison with the tuned models.
#
# Every estimator in `models` was already fitted and scored on the same split
# when `model_results` was built, so those entries are reused here instead of
# refitting all models a second time to obtain identical numbers. Each entry is
# shallow-copied so the two result dictionaries stay independent.
initial_results = {
    model_name: dict(result) for model_name, result in model_results.items()
}

# Hyperparameter Tuning

In [None]:

# Search spaces for the models that get tuned; any model without an entry here
# is carried over unchanged.
param_grids = {
    'XGBoost': {
        'n_estimators': [1000, 2000],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
    },
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'linear'],
    },
    'Neural Network': {
        'hidden_layer_sizes': [(50, 50)],
        'alpha': [0.0001, 0.1],
    },
}

# 5-fold grid search on the training split, scored by accuracy; keep the best
# estimator found for each tuned model.
best_models = {}
for name, model in models.items():
    grid = param_grids.get(name)
    if grid is None:
        # Not tuned: reuse the estimator from `models` as-is.
        best_models[name] = model
        continue
    search = GridSearchCV(model, grid, cv=5, scoring='accuracy', n_jobs=-1)
    best_models[name] = search.fit(X_train, y_train).best_estimator_

# Score the selected estimators on the held-out test split.
tuned_results = {}
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    tuned_results[name] = {
        'y_pred': y_pred,
        'accuracy': accuracy_score(y_test, y_pred),
        'report': classification_report(y_test, y_pred, output_dict=True),
    }



In [None]:
# Side-by-side accuracy of each model before and after hyperparameter tuning.
model_names = list(models.keys())
comparison_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy (Before Tuning)': [initial_results[m]['accuracy'] for m in model_names],
    'Accuracy (After Tuning)': [tuned_results[m]['accuracy'] for m in model_names],
})

# Long format (one row per model and stage) so the stage can drive the bar hue.
comparison_df_melted = comparison_df.melt(id_vars='Model', var_name='Stage', value_name='Accuracy')

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=comparison_df_melted, x='Accuracy', y='Model', hue='Stage', ax=ax)
ax.set_title('Model Accuracy Before vs After Tuning')
plt.show()
