<a href="https://colab.research.google.com/github/Nathan2605/regularized-linear-regression-project-tutorial/blob/main/proyecto_regresion_lineal_regularizada_NH.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tableone



In [17]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns

from tableone import TableOne, load_dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif, f_regression

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold

In [3]:
url = 'https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,19-Oct,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
0,1001,55601,6787,12.206615,7637,13.735364,6878,12.370281,7089,12.749771,...,3644,12.9,11.9,13.8,5462,3.1,2.9,3.3,1326,3
1,1003,218022,24757,11.355276,26913,12.344167,23579,10.814964,25213,11.564429,...,14692,12.0,11.0,13.1,20520,3.2,3.0,3.5,5479,4
2,1005,24881,2732,10.980266,2960,11.896628,3268,13.13452,3201,12.865239,...,2373,19.7,18.6,20.6,3870,4.5,4.2,4.8,887,6
3,1007,22400,2456,10.964286,2596,11.589286,3029,13.522321,3113,13.897321,...,1789,14.1,13.2,14.9,2511,3.3,3.1,3.6,595,2
4,1009,57840,7095,12.266598,7570,13.087828,6742,11.656293,6884,11.901798,...,4661,13.5,12.6,14.5,6017,3.4,3.2,3.7,1507,2


In [4]:
df.duplicated().sum() #no tenemos duplicados

0

In [5]:
df.isnull().sum().sort_values(ascending=False) #no tenemos valores nulos

Unnamed: 0,0
fips,0
Total nurse practitioners (2019),0
STATE_NAME,0
COUNTY_NAME,0
Percent of Population Aged 60+,0
...,...
POP_ESTIMATE_2018,0
% Two or more races,0
Two or more races pop,0
% Hawaiian/PI-alone,0


In [6]:
[col for col in df.columns if df[col].nunique() == 1] #no tenemos columnas constantes

[]

In [7]:
def is_binary(df_, nums):
    df = df_.copy()
    variables = []
    for var in nums:
        flag = True
        unique = df_[var].unique()
        for value in unique:
            if value not in [0, 1, np.nan, 0.0, 1.0]:
                flag = False
        if flag == True:
            variables.append(var)
    return variables

def breakdown_vars(df, off_binary=False):
    """
    This function allow us categorize accodign to numerical or not
    """
    binaries = is_binary(df, df.columns)
    categorial = []
    nonormal = []
    normal = []
    for t in df.columns:
        if off_binary == False:
          if (df[t].dtypes.name=="object" or df[t].dtypes.name=='category') and  t not in binaries:
            categorial.append(t)
        else:
           if (df[t].dtypes.name=="object" or df[t].dtypes.name=='category'):
            categorial.append(t)
        if (df[t].dtypes=="int64" or df[t].dtypes=="float64") and t not in binaries:
                n,p = stats.shapiro(df[t])
                if p<0.05:
                    nonormal.append(t)
                else:
                    normal.append(t)
    if off_binary == False:
      return categorial, binaries, nonormal, normal
    else:
      return categorial, nonormal, normal

def normalize(array):
  return (array - array.mean())/array.std()


def clean(df):
    # tener en cuenta...
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)

In [8]:
#categorizar variables
cat, binaries, nonormal, normal = breakdown_vars(df, off_binary=False)

In [9]:
#eliminamos variables numericas
df.drop(columns = cat, inplace=True)

In [10]:
#target
target = 'Heart disease_number'

X = df.drop(target, axis=1)
y = df[target]

In [11]:
#split
X_train, X_test, y_train, y_test = train_test_split(X,y, shuffle=True,
                                                    test_size=0.20,
                                                    random_state = 42)

In [12]:
int((len(df.columns)-1) * 0.3)

31

In [13]:
#selecion de variables - SelectKBest
k = int(len(X_train.columns) * 0.3)
selector = SelectKBest(score_func = f_regression, k = k)
selector.fit(X_train, y_train)
bool_selected = selector.get_support()

In [14]:
X_train_sel = pd.DataFrame(selector.transform(X_train), columns = X_train.columns.values[bool_selected])
X_test_sel = pd.DataFrame(selector.transform(X_test), columns = X_test.columns.values[bool_selected])

X_train_sel

Unnamed: 0,TOT_POP,0-9,19-Oct,20-29,30-39,40-49,50-59,60-69,70-79,80+,...,Family Medicine/General Practice Primary Care (2019),Total Specialist Physicians (2019),Total Population,Population Aged 60+,county_pop2018_18 and older,anycondition_number,Obesity_number,COPD_number,diabetes_number,CKD_number
0,26625.0,3221.0,3463.0,2922.0,2829.0,2831.0,3831.0,3860.0,2460.0,1208.0,...,16.042791,54.078829,26317.0,7218.0,20545.0,10213.0,7930.0,2314.0,2823.0,771.0
1,51266.0,5272.0,5751.0,5137.0,5341.0,5880.0,7885.0,8271.0,4801.0,2928.0,...,28.837240,107.146919,52880.0,15253.0,41304.0,19282.0,13713.0,4097.0,5416.0,1454.0
2,37779.0,3915.0,5118.0,6202.0,4363.0,4451.0,5107.0,4349.0,2713.0,1561.0,...,18.759998,49.149727,37543.0,8058.0,30379.0,14283.0,10238.0,2792.0,3698.0,871.0
3,91984.0,11163.0,12646.0,11595.0,11357.0,11444.0,12774.0,10791.0,6736.0,3478.0,...,52.721196,108.754962,90098.0,19479.0,70506.0,33057.0,25735.0,5716.0,7913.0,2118.0
4,134487.0,16698.0,17666.0,17281.0,15993.0,15845.0,17982.0,16909.0,10074.0,6039.0,...,81.034623,273.160542,134327.0,31227.0,103686.0,49754.0,38882.0,10002.0,12987.0,3490.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2507,42555.0,4478.0,6520.0,6989.0,4822.0,4879.0,5836.0,5062.0,2641.0,1328.0,...,23.980141,67.321537,41226.0,8070.0,33797.0,12731.0,11018.0,1914.0,2699.0,838.0
2508,56031.0,7624.0,7993.0,7669.0,7740.0,7959.0,7241.0,5615.0,2919.0,1271.0,...,23.824826,80.954251,52322.0,8436.0,41923.0,20561.0,15595.0,3631.0,4670.0,1094.0
2509,33443.0,4680.0,4568.0,4778.0,4199.0,3763.0,4355.0,3608.0,2322.0,1170.0,...,12.860545,53.774588,33750.0,6942.0,25066.0,13265.0,9776.0,3111.0,3999.0,1013.0
2510,5795.0,331.0,493.0,335.0,379.0,540.0,986.0,1312.0,938.0,481.0,...,3.491755,11.770397,6072.0,2710.0,5059.0,2660.0,1836.0,661.0,816.0,229.0


###modelo regresion lineal regularizada

In [16]:
from sklearn.linear_model import Lasso

alpha = 1.0
lasso_model = Lasso(alpha = alpha)

#fi
lasso_model.fit(X_train, y_train)

#rendimiento
score = lasso_model.score(X_test, y_test)
print("Coefficients:", lasso_model.coef_)
print("R2 score:", score)

Coefficients: [-6.03606217e-02  2.49869806e-02 -1.41245026e-02  3.12092145e+01
 -5.54367409e-02  5.11814640e+01 -9.07989929e-03  6.84471246e+01
 -1.93061209e-03  3.54563545e+01  1.15370137e-01  1.18347444e+01
  5.01339592e-02  3.47587722e+01 -4.24778706e-02  4.15147921e+01
  1.24162183e-01 -4.02174650e+01  1.21333237e-01 -2.80899979e+01
 -2.34292374e-02  1.50114390e+01 -2.55746419e-02  1.89852778e+01
 -1.06336000e-02  8.65376212e+00 -3.33774205e-02  5.83633859e+01
  6.46265110e-02  4.49547341e+00 -5.70655131e-02  4.86770635e-01
  6.69193914e-03  9.23225785e-02  1.05237979e-02 -1.11893063e+00
  0.00000000e+00 -5.87681583e+00  1.48784760e+01 -4.05379023e+00
 -0.00000000e+00  4.06942454e-03  2.77973014e-02 -1.90646952e-02
  1.21293234e-03  6.65088015e+00 -6.88829347e+00  1.02233896e+01
 -6.28907263e+00  6.67517370e-02 -2.67292157e+01 -6.57097060e-01
  2.53971451e+00 -7.79026750e-02 -1.11789647e-03  2.05625611e-03
 -1.23175484e-02  4.71850867e-04 -2.62692459e-01  4.10988294e+01
  7.8458632

  model = cd_fast.enet_coordinate_descent(


###optimizar

In [18]:
def grid_lasso(X_train, y_train):
    model = Lasso(random_state=1000)
    alpha = np.array([0.01, 0.1, 1])
    grid = dict(alpha = alpha)
    cv = KFold(n_splits=5, shuffle=True, random_state=10)
    grid_search = GridSearchCV(estimator=model,
                               param_grid=grid,
                               cv=cv,
                               scoring=mean_squared_error)
    grid_result = grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

In [19]:
#Lasso optimizado
best_model = grid_lasso(X_train, y_train) #entreno el modelo con X_train y y_train
preds = best_model.predict(X_test)

#rendimiento
score = best_model.score(X_test, y_test)
print("R2 score:", score)

  model = cd_fast.enet_coordinate_descent(
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 191, in wrapper
    params = func_sig.bind(*args, **kwargs)
  File "/usr/lib/python3.10/inspect.py", line 3186, in bind
    return self._bind(args, kwargs)
  File "/usr/lib/python3.10/inspect.py", line 3112, in _bind
    raise TypeError(
TypeError: too many positional arguments

  model = cd_fast.enet_coordinate_descent(
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 191, in wrapper
    params = func_sig.bi

R2 score: 0.9968833807701991


  model = cd_fast.enet_coordinate_descent(
