In [None]:
from sklearn import datasets
# Load the diabetes dataset as a Bunch whose data/target are pandas objects.
data = datasets.load_diabetes(as_frame=True, return_X_y=False)
features_name = data.feature_names
# Exploring the data: preview the first rows, then list the feature names.
print(data.data.head())
print(features_name)

In [None]:
import pandas as pd
# Remove duplicated rows from the feature frame.
# duplicated() returns a boolean mask aligned with the frame, so indexing with
# its .index selects every row; drop_duplicates() actually filters the repeats.
duplication=data.data.duplicated().value_counts()
duplication_sum=data.data.duplicated().sum()
new_data=data.data.drop_duplicates(keep='last')
# Downstream cells work on the deduplicated rows; keep the label aligned by index.
features=new_data
target=data.target.loc[features.index]
# Only the last expression of a cell is rendered, so print the summary explicitly.
print(duplication)
new_data

In [None]:
# Check the number of samples and look for null values:
# info() prints the index range, each column's non-null count and its dtype.
features.info()

In [None]:
# Check for null values in the label column:
# info() prints the non-null count and dtype of the target Series.
target.info()

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numerical columns.
cat_features=['sex','s4']
num_features=['age', 'bmi', 'bp', 's1', 's2', 's3', 's5', 's6']
# Work on a copy that also carries the label, leaving `features` untouched.
Diabetes=features.assign(target=target)
Diabetes.loc[:, num_features].describe()

In [None]:
#Check Outliers by using boxplot :
from matplotlib import pyplot as plt

# One box plot per numerical feature to eyeball outliers.
# (The stray `type(features)` expression was dropped: mid-cell it produced no output.)
for col in num_features:
    fig, ax = plt.subplots(figsize=(6,6))
    Diabetes.boxplot(column=col, ax=ax)
    ax.set_title(col)
    plt.show()

In [None]:
#plot the distribution of each numerical feature:
#the data is normalized between (-0.1,0.1)
import matplotlib.pyplot as plt

# 50-bin histogram grid, one panel per numerical feature.
Diabetes.hist(column=num_features, bins=50, figsize=(12,8))
plt.show()

In [None]:
# extra code – this cell generates Figure 2–17
import numpy as np
import matplotlib.pyplot as plt

# Compare the raw bmi distribution with a log-transformed one.
# bmi contains negative values (the features are centred around zero), and
# np.log turns those into NaN, which hist() silently drops. Shift the column
# so its minimum is 0 and use log1p, which is defined at 0.
bmi_shifted = Diabetes["bmi"] - Diabetes["bmi"].min()

fig, axs = plt.subplots(1, 2, figsize=(8, 3), sharey=True)
Diabetes["bmi"].hist(ax=axs[0], bins=50)
np.log1p(bmi_shifted).hist(ax=axs[1], bins=50)
axs[0].set_xlabel("bmi")
axs[1].set_xlabel("Log of shifted bmi")
axs[0].set_ylabel("Number of diabetes")
plt.show()

In [None]:
import pandas as pd
# Bucket bmi into decile bins and plot how many samples land in each bucket.
# range(0, 10) only covered percentiles 0..9 (the extreme low tail) and
# hist(bins=1) collapsed everything into one bar; use the 10th..90th percentiles
# as cut points and one histogram bin per bucket instead.
percentiles = [np.percentile(Diabetes["bmi"], p)
               for p in range(10, 100, 10)]
# pd.cut requires strictly increasing edges; np.unique removes repeated cut
# points (it also returns them sorted).
bin_edges = [-np.inf] + list(np.unique(percentiles)) + [np.inf]
# NOTE(review): the variable name is inherited from another notebook; it holds
# the bmi bucket label of every sample.
flattened_median_income = pd.cut(Diabetes["bmi"],
                                 bins=bin_edges,
                                 labels=range(1, len(bin_edges)))
flattened_median_income.hist(bins=len(bin_edges) - 1)
plt.xlabel("bmi decile")
plt.ylabel("Number of samples")
plt.show()

In [None]:

# Scatter every feature against the label, one figure per feature.
for feature_name in features_name:
    Diabetes.plot.scatter(x=feature_name, y='target', grid=True)
    plt.show()

In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots (histograms on the diagonal) of the label and the
# numerical columns listed below.
attributes=['target','bmi','bp','s1','s2','s3','s5','s6','age']
scatter_matrix(Diabetes.loc[:, attributes], figsize=(12,8))
plt.show()


#there is a high correlation between the two features s1, s2 (not good) to solve that we can use PCA

In [None]:
# Correlation matrix of the selected columns, then each column's correlation
# with the label, strongest positive first.
correlation=Diabetes.loc[:, attributes].corr()
correlation.loc[:, 'target'].sort_values(ascending=False)
#the target is highly correlated with the most of the num features


In [None]:
# Rank every column by its correlation with s1.
# Author's observation: s1 is highly correlated with s2 (and also s5); the plan
# is to address this by applying PCA.
correlation['s1'].sort_values(ascending=False)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as a test set, stratified on the `sex` column,
# with a fixed seed so the split is reproducible.
X, y = features, target
Xtrain,Xtest,ytrain,ytest=train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=features["sex"],
)
Xtrain.shape

In [None]:
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import rbf_kernel
# Apply PCA to address the high correlation between the numerical features.
# Fit on the training split only: fitting on the full `features` frame would
# leak test rows into the projection and contradict the `Xtrain_num` name.
pca = PCA(n_components=6)
Xtrain_num = pca.fit_transform(Xtrain[num_features])


In [None]:
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn .utils.validation import check_array,check_is_fitted
from sklearn.cluster import KMeans
#apply clustering similarity for the S4 feature
class ClusterSimilarity(BaseEstimator,TransformerMixin):
    """Transformer that maps samples to RBF similarities with KMeans centroids.

    ``fit`` runs KMeans on the input; ``transform`` returns, for every sample,
    its RBF-kernel similarity to each learned cluster centre (one output column
    per cluster).

    Parameters
    ----------
    n_clusters : int, default=6
        Number of KMeans clusters, and therefore of output columns.
    gamma : float, default=0.1
        ``gamma`` passed to ``rbf_kernel``.
    random_state : int or None, default=None
        Seed forwarded to KMeans.
    """

    def __init__(self,n_clusters=6,gamma=0.1,random_state=None):
        self.n_clusters=n_clusters
        self.gamma=gamma
        self.random_state=random_state

    def fit(self,X,y=None,sample_weight=None):
        """Fit KMeans on X.

        ``y`` must be the second positional parameter: scikit-learn calls
        ``fit(X, y)`` positionally (e.g. from ``fit_transform`` and ``Pipeline``),
        so with ``sample_weight`` in that slot the regression target was
        silently used as the clustering weights.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : ignored
            Present for API compatibility only.
        sample_weight : array-like or None
            Optional per-sample weights forwarded to ``KMeans.fit``.

        Returns
        -------
        self
        """
        X=check_array(X)
        self.kmeans_=KMeans(n_clusters=self.n_clusters,random_state=self.random_state)
        self.kmeans_.fit(X,sample_weight=sample_weight)
        # Remembered so transform() can reject input with a different width.
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self,X):
        """Return the RBF similarity of each row of X to every cluster centre.

        Raises ``NotFittedError`` (via ``check_is_fitted``) before ``fit`` and
        ``AssertionError`` when X has a different number of columns than the
        data seen in ``fit``.
        """
        check_is_fitted(self)
        X=check_array(X)
        assert self.n_features_in_==X.shape[1]

        return rbf_kernel(X,self.kmeans_.cluster_centers_,gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one name per cluster; ``names`` is accepted but not used."""
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

In [None]:
import numpy as np
# Fit a standalone ClusterSimilarity on the s4 column (as an (n_samples, 1)
# array), weighting each sample by its target value, and list the output names.
s4_column = Diabetes[['s4']].to_numpy()
cluster_simil = ClusterSimilarity(gamma=1., n_clusters=6, random_state=42)
similarities = cluster_simil.fit_transform(s4_column, sample_weight=target)
cluster_simil.get_feature_names_out()

In [None]:
# For each categorical feature, draw a box plot of the label grouped by that
# feature's values. All figures are shown together after the loop.
categorical_features=['sex','s4']
for col in categorical_features:
    fig, ax = plt.subplots(figsize=(9, 6))
    Diabetes.boxplot(column='target', by=col, ax=ax)
    ax.set(title='Label by ' + col, ylabel="Diabets")
plt.show()

In [None]:
#from sklearn.ensemble import IsolationForest
#
#outlier_pred=IsolationForest(random_state=42)
#out=outlier_pred.fit_predict(features)
#Diabetes = Diabetes.iloc[out == 1]
#target = target.iloc[out == 1]
#Diabetes.shape


In [None]:
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR 
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.compose import ColumnTransformer,make_column_selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Accumulator for (model name, normalised score) pairs filled by the next cell.
results=[]

# Candidate regressors, keyed by display name.
reg=dict(
    LinearRegression=LinearRegression(),
    KNeighborsRegressor=KNeighborsRegressor(),
    DecisionTreeRegressor=DecisionTreeRegressor(random_state=42),
    RandomForestRegressor=RandomForestRegressor(random_state=42,max_features=10,criterion='poisson'),
    LinearSVR=LinearSVR(random_state=42),
    SVR=SVR(),
)

# Per-column-group preprocessing: PCA on the numerical columns, cluster
# similarity on `sex` (2 clusters) and on `s4` (6 clusters). The step names
# match what make_pipeline would generate (lower-cased class names), because
# later parameter grids address them as `pca` / `clustersimilarity`.
num_pipline=Pipeline([('pca', PCA(n_components=6))])
s4_cluster_simil=Pipeline([
    ('clustersimilarity', ClusterSimilarity(n_clusters=6,gamma=.1,random_state=42)),
])
sex_cluster_simil=Pipeline([
    ('clustersimilarity', ClusterSimilarity(n_clusters=2,gamma=.1,random_state=42)),
])

preprocessing=ColumnTransformer(transformers=[
    ('num', num_pipline, num_features),
    ('sex', sex_cluster_simil, ['sex']),
    ('s4', s4_cluster_simil, ['s4']),
])

#data_prepared=preprocessing.fit_transform(Xtrain)
#data_prepared.shape



In [None]:

# Cross-validate every candidate regressor behind the shared preprocessing and
# report RMSE as a fraction of the target range.
# Start from an empty list so re-running this cell does not append duplicates.
results=[]
target_range=np.max(target)-np.min(target)
for key, estimator in reg.items():
    full_pipeline = Pipeline([
        ('preprocessing', preprocessing),
        ('reg',estimator),
    ])
    # The scorer returns negated RMSE (higher is better); flip the sign back.
    score=-cross_val_score(full_pipeline, Xtrain, ytrain, scoring="neg_root_mean_squared_error", cv=10)
    results.append((key,score.mean()/target_range))
print('models scores:',results)
# np.array(results) mixes str and float and coerces everything to strings, so
# argmin() compared the scores lexicographically. Compare the floats directly.
best_model_idx=min(range(len(results)), key=lambda idx: results[idx][1])
print('best model:',results[best_model_idx][0],results[best_model_idx][1])

In [None]:
#**Transformation Pipelines:**
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor

# Linear regression on a standardised target, behind the shared preprocessing.
out_pipeline = TransformedTargetRegressor(LinearRegression(fit_intercept=False),
                                   transformer=StandardScaler())
full_pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('linear_regression',out_pipeline),
])
# Kept on purpose: this fit also leaves the shared `preprocessing` object
# fitted, which a later cell relies on when it calls
# preprocessing.get_feature_names_out().
full_pipeline.fit(Xtrain,ytrain)

# Tune the PCA size and the s4 cluster-similarity settings.
param_grid=[{'preprocessing__num__pca__n_components': [3,4,5,6,7,8],
              'preprocessing__s4__clustersimilarity__n_clusters':[2,3,4,5,6],
              'preprocessing__s4__clustersimilarity__gamma':[.01,.1]
            }]

grid_search=GridSearchCV(full_pipeline,param_grid,cv=10,scoring='neg_root_mean_squared_error')
grid_search.fit(Xtrain,ytrain)
print('best estimator=',grid_search.best_estimator_)
print('best score=',grid_search.best_score_)
# GridSearchCV works on clones, so `full_pipeline` still carries the untuned
# settings; score the tuned estimator, otherwise the search has no effect on
# the reported number.
rmse=-cross_val_score(grid_search.best_estimator_,Xtrain,ytrain,scoring='neg_root_mean_squared_error',cv=10)
rmse_percentage=rmse.mean().round(1)/(target.max()-target.min())
print('rmse percentage:',rmse_percentage)

In [None]:
# List the output column names of the preprocessing ColumnTransformer.
# NOTE(review): this needs `preprocessing` to be fitted already — presumably by
# the earlier full_pipeline.fit(Xtrain, ytrain) call; confirm when re-ordering cells.
preprocessing.get_feature_names_out()

In [None]:
from sklearn.metrics import mean_squared_error

# Evaluate the tuned pipeline once on the held-out test split.
final_model=grid_search.best_estimator_
final_prediction=final_model.predict(Xtest)
# mean_squared_error(..., squared=False) is deprecated and later removed in
# recent scikit-learn releases; the square root of the MSE is the same value
# and works on every version.
final_rmse=np.sqrt(mean_squared_error(y_true=ytest,y_pred=final_prediction))
final_rmse_percentage=final_rmse.round(1)/(target.max()-target.min())
print('final rmse percentage:',final_rmse_percentage)
# Train the best model on the whole data set (train and test splits together).
# From here on Xtest is no longer unseen data for `final_model`.
final_model.fit(features,target)


In [None]:
# NOTE(review): if the preceding cell refitted `final_model` on all of
# `features`, Xtest is part of its training data and this score is optimistic —
# it is a sanity check, not a generalisation estimate.
final_prediction=final_model.predict(Xtest)
# sqrt(MSE) replaces the deprecated `squared=False` argument and is
# version-independent.
final_rmse=np.sqrt(mean_squared_error(y_true=ytest,y_pred=final_prediction))
final_rmse_percentage=final_rmse.round(1)/(target.max()-target.min())
print('final rmse percentage:',final_rmse_percentage)

In [None]:
# Save the final model:
# joblib.dump serialises the fitted pipeline to 'final_model.pkl' (relative to
# the current working directory) so a later cell can reload it.
import joblib

joblib.dump(final_model,'final_model.pkl')

In [None]:
import joblib
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel



# Round-trip check: reload the persisted pipeline and predict the first five rows.
# NOTE(review): unpickling presumably needs ClusterSimilarity to be defined in
# the session already — confirm before loading from a fresh kernel.
final_model=joblib.load('final_model.pkl')

new_data=features.head(5)
predictions=final_model.predict(new_data)
predictions

In [None]:
# True labels of the same first five rows, for comparison with the predictions.
target.head(5)

In [None]:
#the end :)

In [None]:
#trying to use randomized search to fine tune hyper_parameters
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform, randint


# Sampling distributions for the randomized search.
# randint draws integers in [low, high), so high=6 yields values 2..5.
# gamma is a continuous parameter: randint(low=.01, high=1) is not a valid
# integer distribution, so sample it log-uniformly over [0.01, 1] instead.
rnd_param={
    'preprocessing__num__pca__n_components': randint(low=2,high=6),
    'preprocessing__s4__clustersimilarity__n_clusters':randint(low=2,high=6),
    'preprocessing__s4__clustersimilarity__gamma':loguniform(.01,1)
}
rnd_search= RandomizedSearchCV(full_pipeline,param_distributions=rnd_param,n_iter=10,cv=3,
                                scoring='neg_root_mean_squared_error',random_state=42)
rnd_search.fit(Xtrain , ytrain)

rnd_search.best_estimator_


In [None]:
# Best mean cross-validated score of the randomized search. The scorer is
# 'neg_root_mean_squared_error', so this is a negated RMSE (closer to 0 is better).
rnd_search.best_score_


In [None]:
# try using alternative models : (XGBRegressor)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
import xgboost as xgb

# XGBoost regressor behind the shared preprocessing.
full_pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('XGB_regression',xgb.XGBRegressor(verbosity=0)),
])
# Baseline fit with the default settings; it also refits the shared
# `preprocessing` object in place.
full_pipeline.fit(Xtrain,ytrain)

# Tune the PCA size and the s4 cluster-similarity settings.
param_grid=[{'preprocessing__num__pca__n_components': [3,4,5,6],
              'preprocessing__s4__clustersimilarity__n_clusters':[2,3,4,5,6,7],
              'preprocessing__s4__clustersimilarity__gamma':[.01,.1]
            }
            ]

grid_search=GridSearchCV(full_pipeline,param_grid,cv=10,scoring='neg_root_mean_squared_error')
grid_search.fit(Xtrain,ytrain)
print('best estimator=',grid_search.best_estimator_)
print('best score=',grid_search.best_score_)
# GridSearchCV works on clones, so `full_pipeline` still carries the untuned
# settings; score the tuned estimator so the reported RMSE reflects the search.
rmse=-cross_val_score(grid_search.best_estimator_,Xtrain,ytrain,scoring='neg_root_mean_squared_error',cv=10)
rmse_percentage=rmse.mean().round(1)/(target.max()-target.min())
print('rmse percentage:',rmse_percentage)

In [None]:
grid_search.best_params_

In [None]:
#try using voting regressor for the highest score estimators:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor

#**Transformation Pipelines:**
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
import xgboost as xgb
# Compare a voting ensemble against its individual members, all evaluated with
# the same preprocessing and 10-fold cross-validation on the training split.
results=[]
reg={'LinearRegression':LinearRegression(),
    'GradientBoostingRegressor':GradientBoostingRegressor(random_state=42),
    }

target_range=target.max()-target.min()
pipelines={}
for key in reg.keys():
    # Wrap every regressor in the preprocessing pipeline and actually use it;
    # the pipeline used to be built and discarded while the bare regressor was
    # fitted on the raw features.
    full_pipeline = Pipeline([
        ('preprocessing', preprocessing),
        ('reg',reg[key]),
    ])
    pipelines[key]=full_pipeline
    score=-cross_val_score(full_pipeline,Xtrain,ytrain,scoring='neg_root_mean_squared_error',cv=10)
    results.append((key,score.mean()/target_range))

# One weight per estimator: three weights for two estimators makes fit() raise.
ereg = VotingRegressor([("gb", pipelines['GradientBoostingRegressor']),
                        ("lr", pipelines['LinearRegression'])],weights=[1,1])
ereg.fit(Xtrain, ytrain)

# Cross-validate on the training split (the test split stays untouched) and
# store the scores in `rmse`, the name the following lines and cell read; the
# result used to go to `mse`, so a stale `rmse` from an earlier cell was printed.
rmse=-cross_val_score(ereg,Xtrain,ytrain,scoring='neg_root_mean_squared_error',cv=10)
rmse_percentage=rmse.mean().round(1)/target_range
print('rmse percentage:',rmse_percentage)

# Derive the conclusion from the measured scores instead of hard-coding it.
best_name,best_score=min(results,key=lambda item:item[1])
if rmse_percentage>best_score:
    print('the rmse of voting regressor is worse!')
    print('best regressor is',best_name)
else:
    print('voting regressor matches or beats the best single regressor:',best_name)


In [None]:
# Mean of the most recently computed `rmse` array.
# NOTE(review): `rmse` is whatever the last executed cell assigned — confirm it
# is the one you intend to inspect.
rmse.mean()

In [None]:
#After trying alternative models we conclude that the best model to fit the data is:
#LinearRegression, with RMSE = .17 (as a fraction of the target range)

In [None]:
from sklearn.model_selection import learning_curve

# Learning curve of the final model: RMSE on the training and validation folds
# as a function of the training-set size.
# 40 sizes from 10% to 100% of each training fold: 350 sizes x 10 folds meant
# 3500 fits, and sizes near 1% leave only a handful of rows, presumably too
# few for the PCA / KMeans steps inside the pipeline.
train_sizes, train_scores, valid_scores = learning_curve(
    final_model, X, y, train_sizes=np.linspace(0.1, 1.0, 40), cv=10,
    scoring="neg_root_mean_squared_error")
# The scorer returns negated RMSE; flip the sign so the curves are positive
# errors and fall inside the 0..150 axis range set below.
train_errors = -train_scores.mean(axis=1)
valid_errors = -valid_scores.mean(axis=1)

plt.figure(figsize=(6, 4))
plt.plot(train_sizes, train_errors, "r-+", linewidth=2, label="train")
plt.plot(train_sizes, valid_errors, "b-", linewidth=3, label="valid")

plt.xlabel("Training set size")
plt.ylabel("RMSE")
plt.grid()
plt.legend(loc="upper right")
plt.axis([0, 400, 0, 150])
plt.show()
#valid_errors
#valid_errors