# OLS Linear Regression Model built using the diabetes dataset.


###### Let's start by importing the libraries that will be used for the OLS Regression of the diabetes dataset.  

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import datasets

###### We will first load the publicly available diabetes dataset and then print it to get a better understanding of what is contained in the dataset.

###### We will build a linear regression model using the diabetes dataset to predict the 'target' field (the `Y` column in the data loaded below).

#### Diabetes Dataset Overview

There are ten baseline variables:
  age, sex, body mass index, average blood pressure, and six blood serum measurements.
The ten baseline variables were collected for each of the 442 diabetes patients (n), as well as the response of interest, a quantitative measure of disease progression one year after baseline.

Dataset Characteristics
Number of Instances: 442
Number of Attributes: the first 10 columns are numeric predictive values
Target: Column 11 is a quantitative measure of disease progression one year after baseline
Attribute Information
  - age:     age in years <br>
  - sex:     sex <br>
  - bmi:     body mass index <br>
  - bp:      average blood pressure <br>
  - s1:      tc, total serum cholesterol <br>
  - s2:      ldl, low-density lipoproteins <br>
  - s3:      hdl, high-density lipoproteins <br>
  - s4:      tch, total cholesterol / HDL <br>
  - s5:      ltg, possibly log of serum triglycerides level <br>
  - s6:      glu, blood sugar level <br>
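Note: In the preprocessed copy of this dataset that ships with scikit-learn (`sklearn.datasets.load_diabetes`), each of the 10 features has been mean centered and scaled by the standard deviation times the square root of n_samples (i.e. the sum of squares of each column totals 1). The tab-delimited file loaded below from the Data URL contains the original, unscaled values (for example, AGE in years and BMI in its natural units).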

Additional information can be found here:
  Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499. (https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)


Source URL: https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html <br>
Data URL: https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt <br>
Note: The Data URL is from the source URL. The source URL provides detailed information about the dataset, variables and also reference links including the dataset link.



##### Read the data into a dataframe, then print the dataframe (pandas shows the first and last rows).

In [22]:
# Load the file at the Data URL above into a pandas dataframe named "df".
# The file is tab-delimited, so the separator is passed explicitly (read_csv assumes commas by default).
df = pd.read_csv(
    'https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt',
    sep='\t',
)
print(df)

     AGE  SEX   BMI      BP   S1     S2    S3    S4      S5   S6    Y
0     59    2  32.1  101.00  157   93.2  38.0  4.00  4.8598   87  151
1     48    1  21.6   87.00  183  103.2  70.0  3.00  3.8918   69   75
2     72    2  30.5   93.00  156   93.6  41.0  4.00  4.6728   85  141
3     24    1  25.3   84.00  198  131.4  40.0  5.00  4.8903   89  206
4     50    1  23.0  101.00  192  125.4  52.0  4.00  4.2905   80  135
..   ...  ...   ...     ...  ...    ...   ...   ...     ...  ...  ...
437   60    2  28.2  112.00  185  113.8  42.0  4.00  4.9836   93  178
438   47    2  24.9   75.00  225  166.0  42.0  5.00  4.4427  102  104
439   60    2  24.9   99.67  162  106.6  43.0  3.77  4.1271   95  132
440   36    1  30.0   95.00  201  125.2  42.0  4.79  5.1299   85  220
441   36    1  19.6   71.00  250  133.2  97.0  3.00  4.5951   92   57

[442 rows x 11 columns]


###### Basic Field Information to Explore Data

In [23]:
# Summarize the freshly loaded dataframe: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AGE     442 non-null    int64  
 1   SEX     442 non-null    int64  
 2   BMI     442 non-null    float64
 3   BP      442 non-null    float64
 4   S1      442 non-null    int64  
 5   S2      442 non-null    float64
 6   S3      442 non-null    float64
 7   S4      442 non-null    float64
 8   S5      442 non-null    float64
 9   S6      442 non-null    int64  
 10  Y       442 non-null    int64  
dtypes: float64(6), int64(5)
memory usage: 38.1 KB


###### Convert sex to a categorical variable

In [24]:
# SEX is stored as an integer code, so it is re-typed as a pandas category
# to have it handled as a categorical variable rather than a numeric one.
categorical_var = ['SEX']
for column in categorical_var:
    df[column] = df[column].astype('category')


In [25]:
# Re-run the summary to confirm that SEX now has the category dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   AGE     442 non-null    int64   
 1   SEX     442 non-null    category
 2   BMI     442 non-null    float64 
 3   BP      442 non-null    float64 
 4   S1      442 non-null    int64   
 5   S2      442 non-null    float64 
 6   S3      442 non-null    float64 
 7   S4      442 non-null    float64 
 8   S5      442 non-null    float64 
 9   S6      442 non-null    int64   
 10  Y       442 non-null    int64   
dtypes: category(1), float64(6), int64(4)
memory usage: 35.2 KB


###### Next, examine the dataframe

In [26]:
# pandas' describe() is used to get summary statistics for the dataframe.
# include="all" makes describe() report on every column, not only the numeric ones;
# a statistic that does not apply to a column's dtype is shown as NaN.
dfDescription = df.describe(include="all")
print (dfDescription)

               AGE    SEX         BMI          BP          S1          S2  \
count   442.000000  442.0  442.000000  442.000000  442.000000  442.000000   
unique         NaN    2.0         NaN         NaN         NaN         NaN   
top            NaN    1.0         NaN         NaN         NaN         NaN   
freq           NaN  235.0         NaN         NaN         NaN         NaN   
mean     48.518100    NaN   26.375792   94.647014  189.140271  115.439140   
std      13.109028    NaN    4.418122   13.831283   34.608052   30.413081   
min      19.000000    NaN   18.000000   62.000000   97.000000   41.600000   
25%      38.250000    NaN   23.200000   84.000000  164.250000   96.050000   
50%      50.000000    NaN   25.700000   93.000000  186.000000  113.000000   
75%      59.000000    NaN   29.275000  105.000000  209.750000  134.500000   
max      79.000000    NaN   42.200000  133.000000  301.000000  242.400000   

                S3          S4          S5          S6           Y  
count 

###### We want to split the dataframe into train and test subsets. This will allow us to train and test an OLS linear regression model.

In [27]:
# Split the dataframe with sklearn.model_selection.train_test_split():
#   - test_size=0.3 holds out 30% of the rows for testing, leaving 70% for training
#   - random_state=42 fixes the shuffle so the same split is produced on every run
# The resulting frames are named "df_train" and "df_test".
df_train, df_test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

###### Fit Multilinear OLS regression model using training dataset and save the result in 'est_train' variable.
Print model summary

In [28]:
# Fit a multiple linear OLS regression of Y on the predictors named in the formula,
# using only the training rows (df_train); the fitted result is stored in est_train.
# NOTE(review): BP is a column of df but does not appear in this formula — confirm whether omitting it is intentional.
est_train = ols(formula="Y ~ AGE + SEX + BMI + S1 + S2 + S3 + S4 + S5 + S6", data=df_train).fit()
# Print the regression summary table for the fitted model.
print(est_train.summary())

                            OLS Regression Results                            
Dep. Variable:                      Y   R-squared:                       0.485
Model:                            OLS   Adj. R-squared:                  0.469
Method:                 Least Squares   F-statistic:                     31.23
Date:                Tue, 30 Jul 2024   Prob (F-statistic):           2.82e-38
Time:                        00:51:37   Log-Likelihood:                -1683.9
No. Observations:                 309   AIC:                             3388.
Df Residuals:                     299   BIC:                             3425.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   -295.5125     81.152     -3.641      0.0

###### Drop the non-significant coefficients, keeping only those with p < .05 (SEX + BMI + S3 + S5), and rerun the model.

In [29]:
# Based on the OLS regression summary above, only the predictors this notebook flags
# as significant (p < .05) are kept: SEX, BMI, S3 and S5.
# The reduced model is refit on the training dataset (df_train); est_train is overwritten
# with the new fit, and its coefficients are printed.
reduced_model = ols(formula="Y ~ SEX + BMI + S3 + S5", data=df_train)
est_train = reduced_model.fit()
print(est_train.params)

Intercept   -176.648928
SEX[T.2]     -17.185273
BMI            7.377660
S3            -1.065873
S5            41.824183
dtype: float64


###### The out-of-sample (OOS) R-squared value will help us determine how the model did on the test dataset. The trained model is used to predict on the test dataset, and the R^2 of those predictions is then measured.

In [30]:
# Predict Y for the held-out rows with the model trained above, then compare the
# predictions with the observed Y values; r2 holds the resulting out-of-sample R^2.
test_prediction = est_train.predict(df_test)
r2 = r2_score(y_true=df_test['Y'], y_pred=test_prediction)

print('OOS R-squared: ' + str(r2))


OOS R-squared: 0.48511853284845097
