# Data Modeling

## Importing Data

In [68]:
import pandas as pd
df = pd.read_csv('final_df.csv')
df.head()

Unnamed: 0,Year,Quarter,State,Pct_Telehealth,Real_GDP,Total_Population,Total_Male_Population%,Total_Female_Population%,Population_Under5%,Population5_17%,...,Black,Hispanic,Asian,American Indian or Alaska Native,Multiple Races,Region,pos,neu,neg,compound
0,2020,1,Alabama,0.0554,222288.8,4903185,48.3,51.7,5.8,16.3,...,0.265,0.044,0.014,0.004,0.019,South,0.0965,0.7927,0.1109,-0.0145
1,2020,1,Alaska,0.0758,50332.8,731545,52.0,48.0,7.0,17.6,...,0.022,0.07,0.06,0.151,0.083,West,0.034,0.8955,0.0705,-0.18579
2,2020,1,Arizona,0.0617,365027.7,7278717,49.7,50.3,5.9,16.7,...,0.043,0.318,0.033,0.039,0.024,Southwest,0.0764,0.8259,0.0977,-0.16932
3,2020,1,Arkansas,0.0472,128340.9,3017804,48.9,51.1,6.1,17.1,...,0.152,0.078,0.016,0.006,0.024,South,0.0742,0.837,0.0889,0.0296
4,2020,1,California,0.0915,2933320.2,39512223,49.7,50.3,6.0,16.5,...,0.053,0.395,0.147,0.004,0.033,West,0.121,0.8126,0.0665,0.07139


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 38 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Year                               816 non-null    int64  
 1   Quarter                            816 non-null    int64  
 2   State                              816 non-null    object 
 3   Pct_Telehealth                     816 non-null    float64
 4   Real_GDP                           816 non-null    float64
 5   Total_Population                   816 non-null    int64  
 6   Total_Male_Population%             816 non-null    float64
 7   Total_Female_Population%           816 non-null    float64
 8   Population_Under5%                 816 non-null    float64
 9   Population5_17%                    816 non-null    float64
 10  Population18_24%                   816 non-null    float64
 11  Population25_34%                   816 non-null    float64

In [70]:
df.isna().sum()

Unnamed: 0,0
Year,0
Quarter,0
State,0
Pct_Telehealth,0
Real_GDP,0
Total_Population,0
Total_Male_Population%,0
Total_Female_Population%,0
Population_Under5%,0
Population5_17%,0


In [71]:
# Create a boolean mask for the condition
mask = (df['Year'] == 2023) & ((df['Quarter'] == 3) | (df['Quarter'] == 4))

# Create the new dataset
df_2023_Q4 = df[mask].copy()  # Use .copy() to avoid SettingWithCopyWarning

# Remove the rows from the original dataset
df = df[~mask]

In [72]:
df.drop(columns = ['State'], inplace = True)
df_2023_Q4.drop(columns = ['State'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns = ['State'], inplace = True)


In [73]:
df = pd.get_dummies(df, columns = ['Region']).astype(int)
df_2023_Q4 = pd.get_dummies(df_2023_Q4, columns = ['Region']).astype(int)

In [74]:
## X matrix and y vector
X_train = df.drop(columns = ['Pct_Telehealth'])
y_train = df['Pct_Telehealth']

X_test = df_2023_Q4.drop(columns = ['Pct_Telehealth'])
y_test = df_2023_Q4['Pct_Telehealth']

## XGBoost

In [75]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

xgb = XGBRegressor()
param_grid = {
    "learning_rate" : [0.01, 0.1, 0.2],
    "max_depth" : [3, 6, 9],
    "n_estimators" : [50, 100, 150]
}

grid_search_xgb = GridSearchCV(xgb, param_grid, cv = 5, n_jobs = -1)
grid_search_xgb.fit(X_train, y_train)

In [76]:
print(grid_search_xgb.best_params_)

{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}


In [77]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Predict on the test set using the best estimator
y_pred = grid_search_xgb.best_estimator_.predict(X_test)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2}")

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

R-squared: 1.0
RMSE: 0.0


## Regression

In [101]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 102 entries, 714 to 815
Data columns (total 40 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   Year                               102 non-null    int64
 1   Quarter                            102 non-null    int64
 2   Real_GDP                           102 non-null    int64
 3   Total_Population                   102 non-null    int64
 4   Total_Male_Population%             102 non-null    int64
 5   Total_Female_Population%           102 non-null    int64
 6   Population_Under5%                 102 non-null    int64
 7   Population5_17%                    102 non-null    int64
 8   Population18_24%                   102 non-null    int64
 9   Population25_34%                   102 non-null    int64
 10  Population35_44%                   102 non-null    int64
 11  Population45_54%                   102 non-null    int64
 12  Population55_64%         

In [92]:
import statsmodels.api as sm
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Print the shapes before proceeding
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

# Add a constant to the independent variables (only X_train and X_test)
X_train_sm = sm.add_constant(X_train)
X_test_sm = sm.add_constant(X_test)

# Print the shapes of the transformed data
print(f"Shape of X_train_sm: {X_train_sm.shape}")
print(f"Shape of X_test_sm: {X_test_sm.shape}")

# Fit the linear regression model
model = sm.OLS(y_train, X_train_sm).fit()

# Print the model summary
print(model.summary())

# Make predictions on the test set
y_pred_sm = model.predict(X_test_sm)

# Evaluate the model
r2_sm = r2_score(y_test, y_pred_sm)
rmse_sm = np.sqrt(mean_squared_error(y_test, y_pred_sm))

print(f"Linear Regression R-squared: {r2_sm}")
print(f"Linear Regression RMSE: {rmse_sm}")


Shape of X_train: (714, 40)
Shape of X_test: (102, 40)
Shape of X_train_sm: (714, 41)
Shape of X_test_sm: (102, 40)
                            OLS Regression Results                            
Dep. Variable:         Pct_Telehealth   R-squared:                         nan
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Sun, 08 Dec 2024   Prob (F-statistic):                nan
Time:                        19:53:48   Log-Likelihood:                    inf
No. Observations:                 714   AIC:                              -inf
Df Residuals:                     684   BIC:                              -inf
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025     

  return 1 - self.ssr/self.centered_tss
  return self.mse_model/self.mse_resid
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)


ValueError: shapes (102,40) and (41,) not aligned: 40 (dim 1) != 41 (dim 0)

In [99]:
X_test_sm = sm.add_constant(X_test)

In [100]:
X_test_sm.shape

(102, 40)

In [85]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 714 entries, 0 to 713
Data columns (total 40 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   Year                               714 non-null    int64
 1   Quarter                            714 non-null    int64
 2   Real_GDP                           714 non-null    int64
 3   Total_Population                   714 non-null    int64
 4   Total_Male_Population%             714 non-null    int64
 5   Total_Female_Population%           714 non-null    int64
 6   Population_Under5%                 714 non-null    int64
 7   Population5_17%                    714 non-null    int64
 8   Population18_24%                   714 non-null    int64
 9   Population25_34%                   714 non-null    int64
 10  Population35_44%                   714 non-null    int64
 11  Population45_54%                   714 non-null    int64
 12  Population55_64%           

In [87]:
X_test_sm.info()

<class 'pandas.core.frame.DataFrame'>
Index: 102 entries, 714 to 815
Data columns (total 40 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   Year                               102 non-null    int64
 1   Quarter                            102 non-null    int64
 2   Real_GDP                           102 non-null    int64
 3   Total_Population                   102 non-null    int64
 4   Total_Male_Population%             102 non-null    int64
 5   Total_Female_Population%           102 non-null    int64
 6   Population_Under5%                 102 non-null    int64
 7   Population5_17%                    102 non-null    int64
 8   Population18_24%                   102 non-null    int64
 9   Population25_34%                   102 non-null    int64
 10  Population35_44%                   102 non-null    int64
 11  Population45_54%                   102 non-null    int64
 12  Population55_64%         