In [13]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, recall_score
from scipy.stats.mstats import winsorize

import warnings
warnings.filterwarnings('ignore')

In [14]:
diabetes_dataset = pd.read_csv('../data/processed/diabetes_processed.csv')

In [15]:
diabetes_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    float64
 2   BloodPressure             768 non-null    float64
 3   SkinThickness             768 non-null    float64
 4   Insulin                   768 non-null    float64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(6), int64(3)
memory usage: 54.1 KB


In [16]:
diabetes_dataset['Outcome'] = diabetes_dataset['Outcome'].astype('category')

In [24]:
diabetes_dataset.corr()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.128143,0.211023,0.09354,0.090865,0.024238,-0.030387,0.591044,0.219811
Glucose,0.128143,1.0,0.219096,0.1822,0.44582,0.231648,0.128805,0.278461,0.493591
BloodPressure,0.211023,0.219096,1.0,0.201205,0.093471,0.277476,-0.001992,0.340563,0.165723
SkinThickness,0.09354,0.1822,0.201205,1.0,0.193076,0.558895,0.086463,0.143083,0.215067
Insulin,0.090865,0.44582,0.093471,0.193076,1.0,0.195018,0.080879,0.17261,0.248705
BMI,0.024238,0.231648,0.277476,0.558895,0.195018,1.0,0.14712,0.06163,0.312567
DiabetesPedigreeFunction,-0.030387,0.128805,-0.001992,0.086463,0.080879,0.14712,1.0,0.043969,0.17995
Age,0.591044,0.278461,0.340563,0.143083,0.17261,0.06163,0.043969,1.0,0.274247
Outcome,0.219811,0.493591,0.165723,0.215067,0.248705,0.312567,0.17995,0.274247,1.0


In [None]:
# Log1p transform for highly skewed features
for col in ['Insulin', 'DiabetesPedigreeFunction', 'Age']:
    diabetes_dataset[col] = np.log1p(diabetes_dataset[col])

# Winsorize for mild skewness
for col in ['Pregnancies', 'Glucose', 'SkinThickness', 'BMI']:
    diabetes_dataset[col] = winsorize(diabetes_dataset[col], limits=[0.01, 0.01])

In [None]:
# VIF Calculation and Feature Dropping
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data['Feature'] = df.columns
    vif_data['VIF'] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data

def drop_high_vif_features(df, threshold=10.0):
    dropped_features = []
    while True:
        vif = calculate_vif(df)
        max_vif = vif['VIF'].max()
        if max_vif > threshold:
            drop_feature = vif.sort_values('VIF', ascending=False)['Feature'].iloc[0]
            print(f"Dropping '{drop_feature}' with VIF={max_vif:.2f}")
            df = df.drop(columns=[drop_feature])
            dropped_features.append(drop_feature)
        else:
            break
    print("Remaining features:\n", df.columns.tolist())
    return df, dropped_features

# Usage:
X = diabetes_dataset.drop(columns=['Outcome'])
X_reduced, dropped = drop_high_vif_features(X, threshold=10.0)
print("Dropped features:", dropped)
print("Final VIFs:\n", calculate_vif(X_reduced))

# Too many important features were dropped; reverting to original feature set

Dropping 'Age' with VIF=111.70
Dropping 'Insulin' with VIF=51.61
Dropping 'BMI' with VIF=35.55
Dropping 'BloodPressure' with VIF=21.81
Dropping 'Glucose' with VIF=11.24
Remaining features:
 ['Pregnancies', 'SkinThickness', 'DiabetesPedigreeFunction']
Dropped features: ['Age', 'Insulin', 'BMI', 'BloodPressure', 'Glucose']
Final VIFs:
                     Feature       VIF
0               Pregnancies  2.233485
1             SkinThickness  4.792555
2  DiabetesPedigreeFunction  3.756332


In [18]:
#Seperating the features and target variable
X = diabetes_dataset.drop('Outcome', axis=1)
y = diabetes_dataset['Outcome']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [20]:
scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)

In [21]:
LR = LogisticRegression().fit(X_train_norm,y_train)

In [22]:
# Make predictions
y_pred = LR.predict(X_test_norm)
y_proba = LR.predict_proba(X_test_norm)[:, 1]  # Probability estimates for ROC-AUC

In [23]:
# Evaluate the model
roc_auc = roc_auc_score(y_test, y_proba)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"ROC-AUC: {roc_auc:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

ROC-AUC: 0.797
Recall: 0.588
F1 Score: 0.606


In [36]:
# Training a Lasso Regression Model for comparison
lasso_lr = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)
lasso_lr.fit(X_train_norm, y_train)

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'l1'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",0.1
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'liblinear'


In [37]:
# Get feature names
feature_names = X.columns

# Get coefficients
coefficients = lasso_lr.coef_[0]

# Create a Series for easy inspection
coef_series = pd.Series(coefficients, index=feature_names)

# Display all coefficients
print("All coefficients:")
print(coef_series)

# Display only features with zero coefficients
zero_coef_features = coef_series[coef_series == 0]
print("\nFeatures with zero coefficients (excluded by Lasso):")
print(zero_coef_features)

All coefficients:
Pregnancies                 0.021978
Glucose                     0.976595
BloodPressure               0.000000
SkinThickness               0.000000
Insulin                     0.015962
BMI                         0.551424
DiabetesPedigreeFunction    0.046735
Age                         0.390977
dtype: float64

Features with zero coefficients (excluded by Lasso):
BloodPressure    0.0
SkinThickness    0.0
dtype: float64
