<a href="https://colab.research.google.com/github/MoriamAkterSwarna/AI-ML/blob/main/Diabetes_Dataset_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#sklearn preprocessing

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


#Regression model

from sklearn.linear_model import LinearRegression,Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor



from sklearn.ensemble import VotingRegressor, StackingRegressor


# metrics

from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

import warnings
warnings.filterwarnings("ignore")


In [None]:
# Load the survey CSV from the Colab working directory into a DataFrame.
# (The file name indicates BRFSS 2015 diabetes health indicators.)
df = pd.read_csv(
    "/content/diabetes_binary_health_indicators_BRFSS2015.csv"
)


In [None]:
# Preview only the first rows instead of dumping the whole frame; the
# row/column counts are inspected separately.
df.head()

# `YData Profiling`

In [None]:
!pip install ydata-profiling

In [None]:
# Imported here rather than at the top because the package is installed
# by the preceding cell.
from ydata_profiling import ProfileReport

# Build an exploratory profiling report for the full dataset and save it
# as a standalone HTML file.
profile = ProfileReport(
    df,
    title="Diabetes Profiling Report",
    explorative=True,
)
profile.to_file("Diabetes_Profiling_Report.html")

In [None]:
# Show the column labels of the loaded dataset.
df.columns

In [None]:
# Number of columns in the dataset.
df.shape[1]

In [None]:
# Number of rows in the dataset.
df.shape[0]

# Correlation

In [None]:
# Correlation of every numeric column with the target column,
# ordered from the strongest positive to the strongest negative.
corr_target = (
    df.select_dtypes(include=np.number)
    .corr()
    .loc[:, 'Diabetes_binary']
    .sort_values(ascending=False)
)
print(corr_target)

# **Separate X and y**

In [None]:
# Target column on its own; every remaining column becomes a feature.
y = df["Diabetes_binary"]
X = df.drop(columns="Diabetes_binary")

# **Pipeline**

In [None]:
# Numerical feature handling: fill missing values with the column median,
# then standardise with StandardScaler.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [None]:
# Column-wise preprocessing: every feature column in X is routed through
# the numerical transformer.
preprocessor = ColumnTransformer(
    transformers=[('num', num_transformer, X.columns)],
)

## Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# `Ensemble -> Boosting, Stacking`

In [None]:
# Base learners: used on their own and as members of the ensembles.
# The tree-based models get a fixed seed so results are reproducible.
lr = LinearRegression()
rf = RandomForestRegressor(random_state=42, n_estimators=100)
gb = GradientBoostingRegressor(random_state=42, n_estimators=100)
# xgb = XGBRegressor()  # currently disabled

# `Voting Regressor`

In [None]:
# Voting ensemble that combines the predictions of the three base learners.
voting_reg = VotingRegressor(
    estimators=[('lr', lr), ('rf', rf), ('gb', gb)],  # ('xgb', xgb) currently disabled
)

# `Stacking Regressor`

In [None]:
# Stacked ensemble: the three base learners feed a Ridge final estimator.
stacking_res = StackingRegressor(
    estimators=[('lr', lr), ('rf', rf), ('gb', gb)],  # ('xgb', xgb) currently disabled
    final_estimator=Ridge(),
)

# `Model Training`

In [None]:
# Every candidate to be trained and compared, keyed by a short display name.
# Insertion order is: both ensembles first, then the individual base learners.
model_to_train = dict(
    voting_reg=voting_reg,
    stacking_res=stacking_res,
    lr=lr,
    rf=rf,
    gb=gb,
    # xgb=xgb,  # currently disabled
)


# `Training and Evaluation`

In [None]:
# Train each candidate behind the shared preprocessing step and score it on
# the held-out split; one summary row per model is collected in `result`.
# NOTE(review): the target column is named "Diabetes_binary", yet regression
# metrics are reported here -- confirm a regression framing is intended.
result = []

for name, model in model_to_train.items():
    # Chain preprocessing and estimator so both are fitted on the training
    # split only.
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    pipe.fit(X_train, y_train)

    # Predictions for the unseen test rows.
    y_pred = pipe.predict(X_test)

    # Goodness of fit plus two error magnitudes (RMSE = sqrt of MSE).
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    row = {
        "model_name": name,
        "r2_score": r2,
        "mean_absolute_error": mae,
        "root_mean_squared_error": rmse,
    }
    result.append(row)

# Rank the candidates from highest to lowest R^2 and display the table.
res_df = pd.DataFrame(result).sort_values("r2_score", ascending=False)
res_df

In [None]:
# Pick the top-ranked model from the results table. res_df is sorted by
# r2_score in descending order, so row 0 is the best performer. The name
# column is "model_name" (the key used when the result rows were built);
# indexing with any other label raises a KeyError.
best_model_name = res_df.iloc[0]['model_name']
best_model_obj = model_to_train[best_model_name]


# Refit the winning estimator together with the preprocessing step on the
# training split, then predict the held-out rows for the final plot.
final_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', best_model_obj)
])

final_pipe.fit(X_train, y_train)
y_final_pred = final_pipe.predict(X_test)

In [None]:
# Plot actual vs. predicted values of the best model on the test split.
fig, ax = plt.subplots(figsize=(8, 6))

sns.scatterplot(x=y_test, y=y_final_pred, alpha=0.6, color='teal', ax=ax)

# Reference line y = x (perfect prediction). Its end points are taken from
# the data so the line always covers the plotted points, instead of a
# hard-coded range that may lie outside the values actually present.
axis_min = min(np.min(y_test), np.min(y_final_pred))
axis_max = max(np.max(y_test), np.max(y_final_pred))
ax.plot([axis_min, axis_max], [axis_min, axis_max], color="red", linestyle='--')

ax.set_xlabel("Actual Diabetes State")
ax.set_ylabel("Predicted Diabetes State")
ax.set_title("Actual vs Predicted Diabetes State")

ax.grid(True)
plt.show()