In [30]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import train_test_split
import os
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error

In [9]:
os.chdir('/Users/ericwang/Desktop/ECON_148/HW2')
data = pd.read_csv("current_dataHW2.csv")

In [38]:
# Formattting data, removes -99 and -88 for RECVDVACC
data=data[data['RECVDVACC']>=0]
data['age'] = 2023-data['TBIRTH_YEAR']
X = data[['age', 'EGENDER', 'EEDUC']] # age, gender, education
y = data['RECVDVACC']
X_reg=pd.get_dummies(X, columns=['EGENDER','EEDUC'], drop_first=True) # create dummy variables

# a.)
### Linear Regression

In [39]:
linear_model = LinearRegression().fit(X_reg, y)
linear_pred = linear_model.predict(X_reg)
linear_r2 = r2_score(y, linear_pred)
print(f"Linear Regression R-squared: ", linear_r2)

Linear Regression R-squared:  0.1453256339137905


### Logistic Regression

In [40]:
logistic_model = LogisticRegression(solver='newton-cg').fit(X_reg, y)
logistic_pred = logistic_model.predict(X_reg)
logistic_acc = accuracy_score(y, logistic_pred)
print(f"Logistic Regression Accuracy: ", logistic_acc)

Logistic Regression Accuracy:  0.6650778519224658


#### Logistic regression significantly performs better than linear regression in this case

## b.)
### Linear Model
#### Variables: 
##### REGION: Political stances correlate with region and location; More republican areas are more likely to be unvaccinated
##### EXPNS_DIF: Financial dificulties may correlate with whether or not individuals are able to afford getting sick, i.e. it would be more costly for individuals who struggle financially to not get the vaccine.
##### WORRY: Directly related to how cautious a person is regarding anything and everything
##### DELAY: Medical treatment delays may impact decision making of individuals which can influence the percieved cost to recieve the vaccine
##### INTRNTAVAIL: Directly influences the individual's decision making

In [43]:
data = pd.read_csv("current_dataHW2.csv")
# removes all -88 and -99 values
data = data[data['REGION']>=0]
data = data[data['EXPNS_DIF']>=0]
data = data[data['WORRY']>=0]
data = data[data['DELAY']>=0]
data = data[data['INTRNTAVAIL']>=0]
data = data[data['RECVDVACC']>=0]

X = data[['REGION', 'EXPNS_DIF', 'WORRY', 'DELAY', 'INTRNTAVAIL']]
y = data['RECVDVACC']
X_reg=pd.get_dummies(X, columns=['EXPNS_DIF', 'WORRY', 'DELAY', 'INTRNTAVAIL'], drop_first=True) # create dummy variables

In [44]:
linear_model = LinearRegression().fit(X_reg, y)
linear_pred = linear_model.predict(X_reg)
linear_r2 = r2_score(y, linear_pred)
print(f"Linear Regression R-squared: ", linear_r2)
linear_model.coef_

Linear Regression R-squared:  0.019928109159753182


array([-0.00983677,  0.05240657,  0.0928349 ,  0.17448662, -0.03575474,
       -0.01218119,  0.04330587,  0.00986447,  0.02992434,  0.05499585,
        0.21399649,  0.06201616])

In [45]:
# convert the prediction from the linear model to either 1 or 2.
for i in range(len(linear_pred)):
    if linear_pred[i] <1.5:
        linear_pred[i] = 1
    else:
        linear_pred[i] = 2
# compute the accuracy of the prediction from linear model (after converting it to 1 or 2)
Accuracy_linear=1-np.sum(np.abs(y-linear_pred))/len(y)
print(Accuracy_linear)


0.5529485049833887


Shows positive correlation except for b0_REGION, b4_EXPNS_DIF, b5_WORRY
b0 kind of makes sense because out of region shouldn't have any correlation with vaccination
b4 and b5 both makes sense because no expns_dif and no worry should mean that they're not worried about vaccination

## c.)
##### WRKLOSS: Indicator for disposable income
##### EXPNS_DIF: Indicator for disposable income
##### LIVQTR: Type of housing indicates income level

In [45]:
data = pd.read_csv("current_dataHW2.csv")
# limit to california
data = data[data['EST_ST'] == 6]

# removes all -88 and -99 values
data = data[data['WRKLOSS']>=0]
data = data[data['EXPNS_DIF']>=0]
data = data[data['LIVQTR']>=0]
data = data[data['TSPNDPRPD']>=0]
data = data[data['TSPNDFOOD']>=0]

data['food_expns'] = data['TSPNDFOOD'] + data['TSPNDPRPD']

X = data[['WRKLOSS', 'EXPNS_DIF', 'LIVQTR']]
y = data['food_expns']

X_reg=pd.get_dummies(X, columns=['EXPNS_DIF','LIVQTR'], drop_first=True) # create dummy variables. 
train_x, test_x, train_y, test_y = train_test_split(X_reg, y, test_size=0.2, random_state = 42)

In [46]:
# normalize the data (both training and test)
Xtrain_scaled=train_x.copy()
for i in Xtrain_scaled.columns:
    Xtrain_scaled[i] = (Xtrain_scaled[i]-Xtrain_scaled[i].min())/(Xtrain_scaled[i].max()-Xtrain_scaled[i].min())

Xtest_scaled=test_x.copy()
for i in Xtest_scaled.columns:
    Xtest_scaled[i] = (Xtest_scaled[i]-Xtest_scaled[i].min())/(Xtest_scaled[i].max()-Xtest_scaled[i].min())

In [48]:
# linear model
reg1 = LinearRegression().fit(Xtrain_scaled, train_y) # train a linear model
y_pred_linear=reg1.predict(Xtest_scaled) # predict using the linear model on test data
Accuracy_linear_full=np.sum(np.square(y_pred_linear-test_y))/len(test_y) # evaluate MSE on test data
Accuracy_linear_full, reg1.score(test_x, test_y)
reg1.score(test_x, test_y)

0.010207205115030482

In [49]:
# Lasso
Lasso_model=LassoCV(cv=5, random_state=0).fit(Xtrain_scaled, train_y) # train Lasso with cross validation
y_pred_lasso=Lasso_model.predict(Xtest_scaled)
Accuracy_linear_lasso=np.sum(np.square(y_pred_lasso-test_y))/len(test_y)
Accuracy_linear_lasso, Lasso_model.score(Xtest_scaled, test_y)
Lasso_model.score(test_x, test_y)

0.009396890633543542

In [None]:
# Ridge 
Ridge_model = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(Xtrain_scaled, train_y) # train ridge with cross validation
y_pred_ridge=Ridge_model.predict(Xtest_scaled)
Accuracy_linear_ridge=np.sum(np.square(y_pred_ridge-test_y))/len(test_y)
Accuracy_linear_ridge, Ridge_model.score(test_x, test_y)
Ridge_model.score(test_x, test_y)

In [None]:
# Elastic net
EN_model = ElasticNetCV(cv=5, random_state=2).fit(Xtrain_scaled, train_y) # train elastic net with cross validation
y_pred_EN=EN_model.predict(Xtest_scaled)
Accuracy_linear_EN=np.sum(np.square(y_pred_EN-test_y))/len(test_y)
Accuracy_linear_EN, EN_model.score(test_x, test_y)
EN_model.score(test_x, test_y)

##### EN model appears to have the best performance of R2 = 0.0117

## d.)

In [None]:
future = pd.read_csv("future_dataHW2.csv")

future = future[future['WRKLOSS']>=0]
future = future[future['EXPNS_DIF']>=0]
future = future[future['LIVQTR']>=0]
future = future[future['TSPNDPRPD']>=0]

future['food_expns'] = future['TSPNDPRPD']

future_X = future[['WRKLOSS', 'EXPNS_DIF', 'LIVQTR']]
future_y = future['food_expns']

future_X_reg = pd.get_dummies(future_X, columns=['EXPNS_DIF','LIVQTR'], drop_first=True) # create dummy variables.

In [None]:
# EN
EN_model = ElasticNetCV(cv=5, random_state=0).fit(future_X_reg, future_y) # train elastic net with cross validation
y_pred_EN=EN_model.predict(future_X_reg)

# save to csv file
# np.savetxt('/Users/ericwang/Desktop/ECON_148/HW2/prediction.csv',y_pred_EN, delimiter=',')