# Data Modeling

## Importing Data

In [47]:
import pandas as pd
# Load the modeling dataset from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='final_df (2).csv')
df.head(n=5)

Unnamed: 0,Year,Quarter,State,Pct_Telehealth,Real_GDP,Total_Population,Total_Male_Population%,Total_Female_Population%,Population_Under5%,Population5_17%,...,Black,Hispanic,Asian,American Indian or Alaska Native,Multiple Races,Region,pos,neu,neg,compound
0,2020,1,Alabama,0.0554,222288.8,4903185,48.3,51.7,5.8,16.3,...,0.265,0.044,0.014,0.004,0.019,South,0.0965,0.7927,0.1109,-0.0145
1,2020,1,Alaska,0.0758,50332.8,731545,52.0,48.0,7.0,17.6,...,0.022,0.07,0.06,0.151,0.083,West,0.034,0.8955,0.0705,-0.18579
2,2020,1,Arizona,0.0617,365027.7,7278717,49.7,50.3,5.9,16.7,...,0.043,0.318,0.033,0.039,0.024,Southwest,0.0764,0.8259,0.0977,-0.16932
3,2020,1,Arkansas,0.0472,128340.9,3017804,48.9,51.1,6.1,17.1,...,0.152,0.078,0.016,0.006,0.024,South,0.0742,0.837,0.0889,0.0296
4,2020,1,California,0.0915,2933320.2,39512223,49.7,50.3,6.0,16.5,...,0.053,0.395,0.147,0.004,0.033,West,0.121,0.8126,0.0665,0.07139


In [48]:
# Summarize each column's dtype and non-null count as a quick sanity check on the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 38 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Year                               816 non-null    int64  
 1   Quarter                            816 non-null    int64  
 2   State                              816 non-null    object 
 3   Pct_Telehealth                     816 non-null    float64
 4   Real_GDP                           816 non-null    float64
 5   Total_Population                   816 non-null    int64  
 6   Total_Male_Population%             816 non-null    float64
 7   Total_Female_Population%           816 non-null    float64
 8   Population_Under5%                 816 non-null    float64
 9   Population5_17%                    816 non-null    float64
 10  Population18_24%                   816 non-null    float64
 11  Population25_34%                   816 non-null    float64

In [49]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
Year,0
Quarter,0
State,0
Pct_Telehealth,0
Real_GDP,0
Total_Population,0
Total_Male_Population%,0
Total_Female_Population%,0
Population_Under5%,0
Population5_17%,0


In [50]:
# Count the distinct values of the target column.
# NOTE(review): at this point `y_train` is the *entire* unsplit target (holdout rows
# included); the name is only accurate after the train/test split cell rebinds it.
y_train = df.loc[:, 'Pct_Telehealth']
y_train.nunique()

716

In [51]:
# Hold out the two most recent quarters (2023 Q3 and Q4) as a separate dataset.
# NOTE: despite its name, df_2023_Q4 contains both the Q3 and the Q4 rows of 2023.
# NOTE: this cell rebinds `df`, so it is not idempotent - re-running it without
# reloading the CSV leaves df_2023_Q4 empty.
mask = (df['Year'] == 2023) & df['Quarter'].isin([3, 4])

# Independent copy of the holdout rows
df_2023_Q4 = df[mask].copy()

# Keep the remaining rows for modeling. The explicit .copy() makes `df` a
# standalone frame, so later column drops do not act on a filtered slice
# (which is what triggers SettingWithCopyWarning).
df = df[~mask].copy()

In [53]:
# Remove the State label column from both the modeling frame and the holdout frame.
# Reassignment is used instead of inplace=True: `df` was produced by boolean
# filtering, and dropping in place on such a frame can emit SettingWithCopyWarning.
df = df.drop(columns=['State'])
df_2023_Q4 = df_2023_Q4.drop(columns=['State'])

In [55]:
# One-hot encode Region and cast every column to float for the estimators.
# NOTE(review): every Region level is kept (no drop_first), so the indicators sum
# to 1 and are collinear with an intercept term in the OLS cells. Later cells drop
# Region_* columns by name, so the encoding itself is left unchanged here.
df = pd.get_dummies(df, columns=['Region']).astype(float)
df_2023_Q4 = pd.get_dummies(df_2023_Q4, columns=['Region']).astype(float)

# The two frames are encoded independently, so force the holdout onto the modeling
# frame's exact column set and order. A Region level absent from the holdout then
# becomes an all-zero indicator column instead of a missing feature.
df_2023_Q4 = df_2023_Q4.reindex(columns=df.columns, fill_value=0.0)

In [57]:
from sklearn.model_selection import train_test_split

# Separate the target from the predictors, then carve out a 20% test set.
# The fixed random_state makes the shuffle reproducible.
y = df['Pct_Telehealth']
X = df.drop(columns='Pct_Telehealth')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [58]:
# Number of distinct target values in the training split.
y_train.nunique()

514

## XGBoost

In [60]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Base gradient-boosted regressor with library-default settings
xgb = XGBRegressor()

# 3 x 3 x 3 = 27 candidate hyperparameter combinations
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 6, 9],
    n_estimators=[50, 100, 150],
)

# Exhaustive search with 5-fold cross-validation on the training split, using all
# available cores. No `scoring` argument is passed, so the estimator's default
# scorer ranks the candidates.
grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search_xgb.fit(X_train, y_train)

In [61]:
# Show the hyperparameter combination selected by the grid search.
print(grid_search_xgb.best_params_)

{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150}


In [62]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Score the refit best estimator from the grid search on the held-back test split.
y_pred = grid_search_xgb.best_estimator_.predict(X_test)

# Goodness of fit (R-squared) and error in target units (RMSE = sqrt of MSE)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R-squared: {r2}")
print(f"RMSE: {rmse}")

R-squared: 0.9164977427731283
RMSE: 0.03724891764466899


## Regression

In [59]:
import statsmodels.api as sm
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Baseline linear model: fit OLS on every predictor, then score it on the test split.

# Print the shapes before proceeding
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

# Add an intercept column to the predictors (only X_train and X_test).
# has_constant='add' forces the column to be added to BOTH matrices: with the
# default ('skip') add_constant silently omits it whenever some existing column
# happens to be constant in that split, which would leave the train and test
# design matrices with different columns.
X_train_sm = sm.add_constant(X_train, has_constant='add')
X_test_sm = sm.add_constant(X_test, has_constant='add')

# Print the shapes of the transformed data (each should gain exactly one column)
print(f"Shape of X_train_sm: {X_train_sm.shape}")
print(f"Shape of X_test_sm: {X_test_sm.shape}")

# Fit the linear regression model
model = sm.OLS(y_train, X_train_sm).fit()

# Print the model summary
print(model.summary())

# Make predictions on the test set
y_pred_sm = model.predict(X_test_sm)

# Evaluate the model on the test split
r2_sm = r2_score(y_test, y_pred_sm)
rmse_sm = np.sqrt(mean_squared_error(y_test, y_pred_sm))

print(f"Linear Regression R-squared: {r2_sm}")
print(f"Linear Regression RMSE: {rmse_sm}")


Shape of X_train: (571, 40)
Shape of X_test: (143, 40)
Shape of X_train_sm: (571, 41)
Shape of X_test_sm: (143, 41)
                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.461
Model:                            OLS   Adj. R-squared:                  0.422
Method:                 Least Squares   F-statistic:                     11.96
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           5.89e-50
Time:                        20:10:27   Log-Likelihood:                 639.71
No. Observations:                 571   AIC:                            -1201.
Df Residuals:                     532   BIC:                            -1032.
Df Model:                          38                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025     

In [63]:
# Refit the linear model with the predictors listed below excluded.
column_drop_list = [
    'Population_Under5%',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.461
Model:                            OLS   Adj. R-squared:                  0.423
Method:                 Least Squares   F-statistic:                     12.30
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           1.68e-50
Time:                        20:19:10   Log-Likelihood:                 639.70
No. Observations:                 571   AIC:                            -1203.
Df Residuals:                     533   BIC:                            -1038.
Df Model:                          37                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [64]:
# Refit the linear model with the predictors listed below excluded.
column_drop_list = [
    'Population_Under5%',
    'UnemploymentRate%',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.461
Model:                            OLS   Adj. R-squared:                  0.424
Method:                 Least Squares   F-statistic:                     12.67
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           4.71e-51
Time:                        20:20:12   Log-Likelihood:                 639.70
No. Observations:                 571   AIC:                            -1205.
Df Residuals:                     534   BIC:                            -1045.
Df Model:                          36                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [65]:
# Refit the linear model with the predictors listed below excluded.
column_drop_list = [
    'Population_Under5%',
    'UnemploymentRate%',
    'Quarter',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.461
Model:                            OLS   Adj. R-squared:                  0.425
Method:                 Least Squares   F-statistic:                     13.05
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           1.31e-51
Time:                        20:20:50   Log-Likelihood:                 639.68
No. Observations:                 571   AIC:                            -1207.
Df Residuals:                     535   BIC:                            -1051.
Df Model:                          35                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [66]:
# Refit the linear model with the predictors listed below excluded.
column_drop_list = [
    'Population_Under5%',
    'UnemploymentRate%',
    'Quarter',
    'Population65_74%',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.460
Model:                            OLS   Adj. R-squared:                  0.426
Method:                 Least Squares   F-statistic:                     13.45
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           3.71e-52
Time:                        20:21:23   Log-Likelihood:                 639.62
No. Observations:                 571   AIC:                            -1209.
Df Residuals:                     536   BIC:                            -1057.
Df Model:                          34                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [67]:
# Refit the linear model with the predictors listed below excluded.
# NOTE(review): the recorded fit statistics are identical to the previous refit
# (without 'Region_West' in the list), which is consistent with the Region
# indicator columns being collinear with the intercept - confirm.
column_drop_list = [
    'Population_Under5%',
    'UnemploymentRate%',
    'Quarter',
    'Population65_74%',
    'Region_West',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.460
Model:                            OLS   Adj. R-squared:                  0.426
Method:                 Least Squares   F-statistic:                     13.45
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           3.71e-52
Time:                        20:22:02   Log-Likelihood:                 639.62
No. Observations:                 571   AIC:                            -1209.
Df Residuals:                     536   BIC:                            -1057.
Df Model:                          34                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [68]:
# Refit the linear model with the predictors listed below excluded.
column_drop_list = [
    'Population_Under5%',
    'UnemploymentRate%',
    'Quarter',
    'Population65_74%',
    'Region_West',
    'Population55_64%',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.460
Model:                            OLS   Adj. R-squared:                  0.427
Method:                 Least Squares   F-statistic:                     13.88
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           1.03e-52
Time:                        20:22:29   Log-Likelihood:                 639.57
No. Observations:                 571   AIC:                            -1211.
Df Residuals:                     537   BIC:                            -1063.
Df Model:                          33                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [69]:
# Refit the linear model with the predictors listed below excluded.
column_drop_list = [
    'Population_Under5%',
    'UnemploymentRate%',
    'Quarter',
    'Population65_74%',
    'Region_West',
    'Population55_64%',
    'Less_Than_High_School_Diploma%',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.460
Model:                            OLS   Adj. R-squared:                  0.428
Method:                 Least Squares   F-statistic:                     14.33
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           2.94e-53
Time:                        20:22:58   Log-Likelihood:                 639.47
No. Observations:                 571   AIC:                            -1213.
Df Residuals:                     538   BIC:                            -1069.
Df Model:                          32                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [70]:
# Refit the linear model with the predictors listed below excluded.
column_drop_list = [
    'Population_Under5%',
    'UnemploymentRate%',
    'Quarter',
    'Population65_74%',
    'Region_West',
    'Population55_64%',
    'Less_Than_High_School_Diploma%',
    'Population5_17%',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.460
Model:                            OLS   Adj. R-squared:                  0.429
Method:                 Least Squares   F-statistic:                     14.81
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           8.57e-54
Time:                        20:23:22   Log-Likelihood:                 639.33
No. Observations:                 571   AIC:                            -1215.
Df Residuals:                     539   BIC:                            -1076.
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [71]:
# Refit the linear model with the predictors listed below excluded.
column_drop_list = [
    'Population_Under5%',
    'UnemploymentRate%',
    'Quarter',
    'Population65_74%',
    'Region_West',
    'Population55_64%',
    'Less_Than_High_School_Diploma%',
    'Population5_17%',
    'Private_Health_Insurance%',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.460
Model:                            OLS   Adj. R-squared:                  0.430
Method:                 Least Squares   F-statistic:                     15.32
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           2.22e-54
Time:                        20:23:50   Log-Likelihood:                 639.30
No. Observations:                 571   AIC:                            -1217.
Df Residuals:                     540   BIC:                            -1082.
Df Model:                          30                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [72]:
# Refit the linear model with the predictors listed below excluded.
column_drop_list = [
    'Population_Under5%',
    'UnemploymentRate%',
    'Quarter',
    'Population65_74%',
    'Region_West',
    'Population55_64%',
    'Less_Than_High_School_Diploma%',
    'Population5_17%',
    'Private_Health_Insurance%',
    'Population18_24%',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.460
Model:                            OLS   Adj. R-squared:                  0.431
Method:                 Least Squares   F-statistic:                     15.86
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           6.34e-55
Time:                        20:24:19   Log-Likelihood:                 639.14
No. Observations:                 571   AIC:                            -1218.
Df Residuals:                     541   BIC:                            -1088.
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [73]:
# Refit the linear model with the predictors listed below excluded.
column_drop_list = [
    'Population_Under5%',
    'UnemploymentRate%',
    'Quarter',
    'Population65_74%',
    'Region_West',
    'Population55_64%',
    'Less_Than_High_School_Diploma%',
    'Population5_17%',
    'Private_Health_Insurance%',
    'Population18_24%',
    'Population35_44%',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.459
Model:                            OLS   Adj. R-squared:                  0.431
Method:                 Least Squares   F-statistic:                     16.45
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           1.65e-55
Time:                        20:24:41   Log-Likelihood:                 639.07
No. Observations:                 571   AIC:                            -1220.
Df Residuals:                     542   BIC:                            -1094.
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [74]:
# Refit the linear model with the predictors listed below excluded.
column_drop_list = [
    'Population_Under5%',
    'UnemploymentRate%',
    'Quarter',
    'Population65_74%',
    'Region_West',
    'Population55_64%',
    'Less_Than_High_School_Diploma%',
    'Population5_17%',
    'Private_Health_Insurance%',
    'Population18_24%',
    'Population35_44%',
    'Region_Midwest',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.459
Model:                            OLS   Adj. R-squared:                  0.432
Method:                 Least Squares   F-statistic:                     17.07
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           4.59e-56
Time:                        20:25:10   Log-Likelihood:                 638.90
No. Observations:                 571   AIC:                            -1222.
Df Residuals:                     543   BIC:                            -1100.
Df Model:                          27                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [75]:
# Refit the linear model with the predictors listed below excluded.
column_drop_list = [
    'Population_Under5%',
    'UnemploymentRate%',
    'Quarter',
    'Population65_74%',
    'Region_West',
    'Population55_64%',
    'Less_Than_High_School_Diploma%',
    'Population5_17%',
    'Private_Health_Insurance%',
    'Population18_24%',
    'Population35_44%',
    'Region_Midwest',
    'Population_Over75%',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.459
Model:                            OLS   Adj. R-squared:                  0.433
Method:                 Least Squares   F-statistic:                     17.75
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           1.15e-56
Time:                        20:25:32   Log-Likelihood:                 638.82
No. Observations:                 571   AIC:                            -1224.
Df Residuals:                     544   BIC:                            -1106.
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [76]:
# Refit the linear model with the predictors listed below excluded.
column_drop_list = [
    'Population_Under5%',
    'UnemploymentRate%',
    'Quarter',
    'Population65_74%',
    'Region_West',
    'Population55_64%',
    'Less_Than_High_School_Diploma%',
    'Population5_17%',
    'Private_Health_Insurance%',
    'Population18_24%',
    'Population35_44%',
    'Region_Midwest',
    'Population_Over75%',
    'Real_GDP',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.458
Model:                            OLS   Adj. R-squared:                  0.433
Method:                 Least Squares   F-statistic:                     18.45
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           3.58e-57
Time:                        20:26:00   Log-Likelihood:                 638.49
No. Observations:                 571   AIC:                            -1225.
Df Residuals:                     545   BIC:                            -1112.
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [77]:
# Refit the linear model with the predictors listed below excluded.
column_drop_list = [
    'Population_Under5%',
    'UnemploymentRate%',
    'Quarter',
    'Population65_74%',
    'Region_West',
    'Population55_64%',
    'Less_Than_High_School_Diploma%',
    'Population5_17%',
    'Private_Health_Insurance%',
    'Population18_24%',
    'Population35_44%',
    'Region_Midwest',
    'Population_Over75%',
    'Real_GDP',
    'Region_South',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.458
Model:                            OLS   Adj. R-squared:                  0.434
Method:                 Least Squares   F-statistic:                     19.20
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           1.07e-57
Time:                        20:26:23   Log-Likelihood:                 638.18
No. Observations:                 571   AIC:                            -1226.
Df Residuals:                     546   BIC:                            -1118.
Df Model:                          24                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [78]:
# Refit the linear model with the predictors listed below excluded.
column_drop_list = [
    'Population_Under5%',
    'UnemploymentRate%',
    'Quarter',
    'Population65_74%',
    'Region_West',
    'Population55_64%',
    'Less_Than_High_School_Diploma%',
    'Population5_17%',
    'Private_Health_Insurance%',
    'Population18_24%',
    'Population35_44%',
    'Region_Midwest',
    'Population_Over75%',
    'Real_GDP',
    'Region_South',
    'American Indian or Alaska Native',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.457
Model:                            OLS   Adj. R-squared:                  0.434
Method:                 Least Squares   F-statistic:                     20.01
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           3.42e-58
Time:                        20:26:52   Log-Likelihood:                 637.77
No. Observations:                 571   AIC:                            -1228.
Df Residuals:                     547   BIC:                            -1123.
Df Model:                          23                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [79]:
# Refit the linear model with the predictors listed below excluded.
column_drop_list = [
    'Population_Under5%',
    'UnemploymentRate%',
    'Quarter',
    'Population65_74%',
    'Region_West',
    'Population55_64%',
    'Less_Than_High_School_Diploma%',
    'Population5_17%',
    'Private_Health_Insurance%',
    'Population18_24%',
    'Population35_44%',
    'Region_Midwest',
    'Population_Over75%',
    'Real_GDP',
    'Region_South',
    'American Indian or Alaska Native',
    'Total_Population',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.456
Model:                            OLS   Adj. R-squared:                  0.435
Method:                 Least Squares   F-statistic:                     20.92
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           9.42e-59
Time:                        20:27:11   Log-Likelihood:                 637.50
No. Observations:                 571   AIC:                            -1229.
Df Residuals:                     548   BIC:                            -1129.
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [80]:
# Refit the linear model with the predictors listed below excluded.
column_drop_list = [
    'Population_Under5%',
    'UnemploymentRate%',
    'Quarter',
    'Population65_74%',
    'Region_West',
    'Population55_64%',
    'Less_Than_High_School_Diploma%',
    'Population5_17%',
    'Private_Health_Insurance%',
    'Population18_24%',
    'Population35_44%',
    'Region_Midwest',
    'Population_Over75%',
    'Real_GDP',
    'Region_South',
    'American Indian or Alaska Native',
    'Total_Population',
    'compound',
]

# Ordinary least squares (linear) fit, not a logistic regression.
# X_train_sm already contains the intercept column, so no constant is re-added.
model = sm.OLS(
    endog=y_train,
    exog=X_train_sm.drop(columns=column_drop_list),
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.456
Model:                            OLS   Adj. R-squared:                  0.435
Method:                 Least Squares   F-statistic:                     21.88
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           2.95e-59
Time:                        20:27:45   Log-Likelihood:                 637.07
No. Observations:                 571   AIC:                            -1230.
Df Residuals:                     549   BIC:                            -1135.
Df Model:                          21                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [81]:
# Predictors left out of this fit; every other column of X_train_sm is kept.
# Compared with the In[80] list, this one additionally excludes
# 'Public_Health_Insurance_Coverage%'.
column_drop_list = [
    'Population_Under5%', 'UnemploymentRate%', 'Quarter', 'Population65_74%',
    'Region_West', 'Population55_64%', 'Less_Than_High_School_Diploma%', 'Population5_17%',
    'Private_Health_Insurance%', 'Population18_24%', 'Population35_44%', 'Region_Midwest',
    'Population_Over75%', 'Real_GDP', 'Region_South', 'American Indian or Alaska Native',
    'Total_Population', 'compound', 'Public_Health_Insurance_Coverage%',
]

## Fit an ordinary least squares (OLS) linear regression -- this is NOT a logistic model.
## `columns=` already names the axis, so the redundant `axis=1` argument is omitted.
model = sm.OLS(
    y_train,
    sm.add_constant(X_train_sm.drop(columns=column_drop_list)),  # prepend the intercept column
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.454
Model:                            OLS   Adj. R-squared:                  0.434
Method:                 Least Squares   F-statistic:                     22.88
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           1.27e-59
Time:                        20:28:08   Log-Likelihood:                 636.27
No. Observations:                 571   AIC:                            -1231.
Df Residuals:                     550   BIC:                            -1139.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [82]:
# Predictors left out of this fit; every other column of X_train_sm is kept.
# Compared with the In[81] list, this one additionally excludes
# 'Some_College_or_Associate Degree%'.
column_drop_list = [
    'Population_Under5%', 'UnemploymentRate%', 'Quarter', 'Population65_74%',
    'Region_West', 'Population55_64%', 'Less_Than_High_School_Diploma%', 'Population5_17%',
    'Private_Health_Insurance%', 'Population18_24%', 'Population35_44%', 'Region_Midwest',
    'Population_Over75%', 'Real_GDP', 'Region_South', 'American Indian or Alaska Native',
    'Total_Population', 'compound', 'Public_Health_Insurance_Coverage%',
    'Some_College_or_Associate Degree%',
]

## Fit an ordinary least squares (OLS) linear regression -- this is NOT a logistic model.
## `columns=` already names the axis, so the redundant `axis=1` argument is omitted.
model = sm.OLS(
    y_train,
    sm.add_constant(X_train_sm.drop(columns=column_drop_list)),  # prepend the intercept column
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.452
Model:                            OLS   Adj. R-squared:                  0.434
Method:                 Least Squares   F-statistic:                     23.96
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           5.67e-60
Time:                        20:28:33   Log-Likelihood:                 635.41
No. Observations:                 571   AIC:                            -1231.
Df Residuals:                     551   BIC:                            -1144.
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const   

In [83]:
# Predictors left out of this fit; every other column of X_train_sm is kept.
# Compared with the In[82] list, this one additionally excludes
# 'Graduate_Or_Professional_Degree%'.
column_drop_list = [
    'Population_Under5%', 'UnemploymentRate%', 'Quarter', 'Population65_74%',
    'Region_West', 'Population55_64%', 'Less_Than_High_School_Diploma%', 'Population5_17%',
    'Private_Health_Insurance%', 'Population18_24%', 'Population35_44%', 'Region_Midwest',
    'Population_Over75%', 'Real_GDP', 'Region_South', 'American Indian or Alaska Native',
    'Total_Population', 'compound', 'Public_Health_Insurance_Coverage%',
    'Some_College_or_Associate Degree%', 'Graduate_Or_Professional_Degree%',
]

## Fit an ordinary least squares (OLS) linear regression -- this is NOT a logistic model.
## `columns=` already names the axis, so the redundant `axis=1` argument is omitted.
model = sm.OLS(
    y_train,
    sm.add_constant(X_train_sm.drop(columns=column_drop_list)),  # prepend the intercept column
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.452
Model:                            OLS   Adj. R-squared:                  0.434
Method:                 Least Squares   F-statistic:                     25.30
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           1.33e-60
Time:                        20:28:52   Log-Likelihood:                 635.21
No. Observations:                 571   AIC:                            -1232.
Df Residuals:                     552   BIC:                            -1150.
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const         

In [84]:
# Predictors left out of this fit; every other column of X_train_sm is kept.
# Compared with the In[83] list, this one additionally excludes 'Povery_Rate%'.
# NOTE(review): 'Povery_Rate%' (sic) is presumably the column's actual spelling in
# the data, since the recorded run dropped it without error -- do not "correct"
# the label here unless the column itself is renamed upstream.
column_drop_list = [
    'Population_Under5%', 'UnemploymentRate%', 'Quarter', 'Population65_74%',
    'Region_West', 'Population55_64%', 'Less_Than_High_School_Diploma%', 'Population5_17%',
    'Private_Health_Insurance%', 'Population18_24%', 'Population35_44%', 'Region_Midwest',
    'Population_Over75%', 'Real_GDP', 'Region_South', 'American Indian or Alaska Native',
    'Total_Population', 'compound', 'Public_Health_Insurance_Coverage%',
    'Some_College_or_Associate Degree%', 'Graduate_Or_Professional_Degree%',
    'Povery_Rate%',
]

## Fit an ordinary least squares (OLS) linear regression -- this is NOT a logistic model.
## `columns=` already names the axis, so the redundant `axis=1` argument is omitted.
model = sm.OLS(
    y_train,
    sm.add_constant(X_train_sm.drop(columns=column_drop_list)),  # prepend the intercept column
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.451
Model:                            OLS   Adj. R-squared:                  0.434
Method:                 Least Squares   F-statistic:                     26.71
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           4.40e-61
Time:                        20:29:51   Log-Likelihood:                 634.62
No. Observations:                 571   AIC:                            -1233.
Df Residuals:                     553   BIC:                            -1155.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const         

In [85]:
# Predictors left out of this fit; every other column of X_train_sm is kept.
# Compared with the In[84] list, this one additionally excludes 'Asian'.
# NOTE(review): 'Povery_Rate%' (sic) is presumably the column's actual spelling in
# the data, since the recorded run dropped it without error -- do not "correct"
# the label here unless the column itself is renamed upstream.
column_drop_list = [
    'Population_Under5%', 'UnemploymentRate%', 'Quarter', 'Population65_74%',
    'Region_West', 'Population55_64%', 'Less_Than_High_School_Diploma%', 'Population5_17%',
    'Private_Health_Insurance%', 'Population18_24%', 'Population35_44%', 'Region_Midwest',
    'Population_Over75%', 'Real_GDP', 'Region_South', 'American Indian or Alaska Native',
    'Total_Population', 'compound', 'Public_Health_Insurance_Coverage%',
    'Some_College_or_Associate Degree%', 'Graduate_Or_Professional_Degree%',
    'Povery_Rate%', 'Asian',
]

## Fit an ordinary least squares (OLS) linear regression -- this is NOT a logistic model.
## `columns=` already names the axis, so the redundant `axis=1` argument is omitted.
model = sm.OLS(
    y_train,
    sm.add_constant(X_train_sm.drop(columns=column_drop_list)),  # prepend the intercept column
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                       0.448
Model:                            OLS   Adj. R-squared:                  0.432
Method:                 Least Squares   F-statistic:                     28.12
Date:                Sun, 08 Dec 2024   Prob (F-statistic):           3.17e-61
Time:                        20:30:19   Log-Likelihood:                 633.17
No. Observations:                 571   AIC:                            -1232.
Df Residuals:                     554   BIC:                            -1158.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const         

In [93]:
## Persist whatever `model` currently holds in the kernel to disk with joblib.
## NOTE(review): execution counts jump from In[85] to In[93], so confirm that
## `model` is still the fit you intend to save before relying on this file.
from joblib import dump
dump(value=model, filename='linear_model.joblib')

['linear_model.joblib']

In [95]:
# Columns whose pairwise correlations are inspected below.
# NOTE(review): presumably these are the predictors kept by the last OLS fit -- confirm.
variables = [
    'Year',
    'Total_Male_Population%', 'Total_Female_Population%',
    'Population25_34%', 'Population45_54%',
    'High_School_Graduate%', 'Bachelor_Degree%',
    'No_Health_Insurance_Coverage%',
    'White', 'Black', 'Hispanic', 'Multiple Races',
    'pos', 'neu', 'neg',
    'Region_Northeast', 'Region_Southwest',
]

# Restrict the frame to those columns, then compute the correlation matrix
# (DataFrame.corr defaults to Pearson).
correlation_df = df.loc[:, variables]
correlation_matrix = correlation_df.corr()

# Render as a blue heat-map. In the recorded output, Total_Male_Population% and
# Total_Female_Population% correlate at exactly -1.0, i.e. they are perfectly collinear.
correlation_matrix.style.background_gradient(cmap='Blues')

Unnamed: 0,Year,Total_Male_Population%,Total_Female_Population%,Population25_34%,Population45_54%,High_School_Graduate%,Bachelor_Degree%,No_Health_Insurance_Coverage%,White,Black,Hispanic,Multiple Races,pos,neu,neg,Region_Northeast,Region_Southwest
Year,1.0,0.117238,-0.117238,-0.075272,-0.187795,-0.091759,0.17533,-0.15666,-0.059902,-0.019308,0.033702,0.306579,0.103693,-0.169013,0.131015,0.0,-0.0
Total_Male_Population%,0.117238,1.0,-1.0,-0.06271,-0.434723,-0.030205,0.111393,0.169533,0.23406,-0.735515,0.055611,0.288648,0.040433,-0.06441,0.039889,-0.434677,0.061205
Total_Female_Population%,-0.117238,-1.0,1.0,0.06271,0.434723,0.030205,-0.111393,-0.169533,-0.23406,0.735515,-0.055611,-0.288648,-0.040433,0.06441,-0.039889,0.434677,-0.061205
Population25_34%,-0.075272,-0.06271,0.06271,1.0,-0.227683,-0.638957,0.360742,-0.083044,-0.421975,0.285793,0.230732,0.0982,0.011951,-0.011512,-0.005212,0.102527,0.03792
Population45_54%,-0.187795,-0.434723,0.434723,-0.227683,1.0,0.123775,-0.046074,-0.00643,-0.158313,0.199678,0.11441,-0.139768,-0.03231,0.12329,-0.124235,0.260501,-0.125344
High_School_Graduate%,-0.091759,-0.030205,0.030205,-0.638957,0.123775,1.0,-0.749804,0.139987,0.433818,-0.053201,-0.454382,-0.099231,-0.039315,0.043733,-0.011903,-0.1707,-0.073581
Bachelor_Degree%,0.17533,0.111393,-0.111393,0.360742,-0.046074,-0.749804,1.0,-0.416551,-0.043889,-0.167075,0.087957,0.14376,0.039425,-0.004022,-0.029534,0.407396,-0.201463
No_Health_Insurance_Coverage%,-0.15666,0.169533,-0.169533,-0.083044,-0.00643,0.139987,-0.416551,1.0,-0.206395,0.113791,0.314451,-0.169395,-0.072836,-0.010313,0.054333,-0.496503,0.481181
White,-0.059902,0.23406,-0.23406,-0.421975,-0.158313,0.433818,-0.043889,-0.206395,1.0,-0.41577,-0.648907,-0.394011,0.070464,-0.059927,0.018183,0.005996,-0.330781
Black,-0.019308,-0.735515,0.735515,0.285793,0.199678,-0.053201,-0.167075,0.113791,-0.41577,1.0,-0.129103,-0.178495,-0.075319,0.037732,0.016952,0.124998,-0.130498
