In [2]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import pairwise_distances
from causalml.match import NearestNeighborMatch
from causalml.dataset import *

In [3]:
df_Trump = pd.read_stata('trumpingnorms_clean_Low.dta')

In [4]:
df_Trump = df_Trump[df_Trump['period'] >= 5]

In [5]:
df_Trump = df_Trump[df_Trump['period'] <= 8]

In [6]:
df_Trump.dropna(subset=['hardcommit_avg', 'age', 'nonwhite'], inplace=True)

In [7]:
# Step 1: Fit a logistic regression model to estimate propensity scores
X = df_Trump[['age', 'nonwhite', 'liberal', 'UScitizen', 'nativespeaker', 'employed', 'female', 'partner_female', 'genrevealed']]
y = df_Trump['preelection']
propensity_model = LogisticRegression()
propensity_model.fit(X, y)

# Step 2: Calculate propensity scores
propensity_scores = propensity_model.predict_proba(X)[:, 1]

# Step 3: Match the data
treated_indices = df_Trump.index[df_Trump['preelection'] == 1]
control_indices = df_Trump.index[df_Trump['preelection'] == 0]

# Use nearest neighbor matching
nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
nn.fit(X.loc[control_indices])
distances, indices = nn.kneighbors(X.loc[treated_indices])
matched_control_indices = control_indices[indices.flatten()]

# Step 4: Assess balance and count match frequency
matched_data = df_Trump.loc[np.concatenate([treated_indices, matched_control_indices])]
match_frequency = pd.Series(matched_control_indices).value_counts()

In [8]:
propensity_scores

array([0.77160944, 0.77160944, 0.77160944, ..., 0.65291522, 0.66396267,
       0.66396267])

In [9]:
list_weights = []
#df_Trump.to_excel('test1.xlsx')
for i in df_Trump.index:
    try:
        list_weights.append(match_frequency[i])
    except:
        list_weights.append(0)

#df_Trump.index

In [10]:
df_Trump['_weights'] = list_weights

In [11]:
treatment_cols = ['preelection', 'age', 'nonwhite', 'liberal', 'UScitizen', 'nativespeaker', 'employed', 'female', 'partner_female', 'genrevealed']
X = df_Trump[treatment_cols]
y = df_Trump['hardcommit_avg']  # Assuming there's a treatment indicator variable

In [12]:
df_Trump['session_id'] = df_Trump.groupby('session').ngroup()

In [13]:
df_Trump.dropna(subset=['hardcommit_avg', 'age', 'nonwhite'], inplace=True)

In [14]:
treatment = 'preelection'
control_variables = ['hardcommit_avg','age', 'nonwhite', 'liberal', 'UScitizen', 'nativespeaker', 'employed', 'female', 'partner_female', 'genrevealed']

In [15]:
matcher = NearestNeighborMatch(replace=True, ratio=1)

In [16]:
matched_data = matcher.match(data=df_Trump, treatment_col= treatment, score_cols=control_variables)

In [17]:
model_C = smf.ols(formula='hardcommit_avg ~ postelection + age + nonwhite + liberal + UScitizen + nativespeaker + employed + female + partner_female + genrevealed + day + withindaytrend + period', data=df_Trump)
results_C = model_C.fit(cov_type='cluster', cov_kwds={'groups': df_Trump['session_id']})

# Print summary of results
print(results_C.summary())

                            OLS Regression Results                            
Dep. Variable:         hardcommit_avg   R-squared:                       0.071
Model:                            OLS   Adj. R-squared:                  0.062
Method:                 Least Squares   F-statistic:                     4.727
Date:                Fri, 24 May 2024   Prob (F-statistic):           0.000168
Time:                        01:53:03   Log-Likelihood:                -440.19
No. Observations:                1388   AIC:                             908.4
Df Residuals:                    1374   BIC:                             981.7
Df Model:                          13                                         
Covariance Type:              cluster                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.1628      0.122      1.

In [18]:
# Regression model with weighting
model_D = smf.wls(formula='hardcommit_avg ~ postelection + age + nonwhite + liberal + UScitizen + nativespeaker + employed + female + partner_female + genrevealed + day + withindaytrend + period', data=df_Trump, weights=df_Trump['_weights'])
results_D = model_D.fit(cov_type='cluster', cov_kwds={'groups': df_Trump['session_id']})

# Print summary of results
print(results_D.summary())

                            WLS Regression Results                            
Dep. Variable:         hardcommit_avg   R-squared:                       0.160
Model:                            WLS   Adj. R-squared:                  0.152
Method:                 Least Squares   F-statistic:                     137.9
Date:                Fri, 24 May 2024   Prob (F-statistic):           1.02e-23
Time:                        01:53:03   Log-Likelihood:                   -inf
No. Observations:                1388   AIC:                               inf
Df Residuals:                    1375   BIC:                               inf
Df Model:                          12                                         
Covariance Type:              cluster                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.1000      0.210      0.

In [19]:
df_Trump = df_Trump[df_Trump['female'] == 0]

In [20]:
df_Trump = df_Trump[df_Trump['partner_female'] == 1]

In [21]:
df_Trump = df_Trump[df_Trump['genrevealed'] == 1]

In [22]:
model_G = smf.ols(formula='hardcommit_avg ~ postelection + age + nonwhite + liberal + UScitizen + nativespeaker + employed + day + withindaytrend + period', data=df_Trump)
results_G = model_G.fit(cov_type='cluster', cov_kwds={'groups': df_Trump['session_id']})

# Print summary of results
print(results_G.summary())

                            OLS Regression Results                            
Dep. Variable:         hardcommit_avg   R-squared:                       0.145
Model:                            OLS   Adj. R-squared:                  0.104
Method:                 Least Squares   F-statistic:                     18.96
Date:                Fri, 24 May 2024   Prob (F-statistic):           5.01e-07
Time:                        01:53:03   Log-Likelihood:                -53.925
No. Observations:                 221   AIC:                             129.8
Df Residuals:                     210   BIC:                             167.2
Df Model:                          10                                         
Covariance Type:              cluster                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.0166      0.226      0.

In [23]:
treatment = 'preelection'
control_variables = ['hardcommit_avg','age', 'nonwhite', 'liberal', 'UScitizen', 'nativespeaker', 'employed']

In [24]:
matcher = NearestNeighborMatch(replace=True, ratio=1)

In [25]:
matched_data = matcher.match(data=df_Trump, treatment_col= treatment, score_cols=control_variables)

In [26]:
# Regression model with weighting
model_H = smf.wls(formula='hardcommit_avg ~ postelection + age + nonwhite + liberal + UScitizen + nativespeaker + employed + female + partner_female + genrevealed + day + withindaytrend + period', data=df_Trump, weights=df_Trump['_weights'])
results_H = model_H.fit(cov_type='cluster', cov_kwds={'groups': df_Trump['session_id']})

# Print summary of results
print(results_H.summary())

                            WLS Regression Results                            
Dep. Variable:         hardcommit_avg   R-squared:                       0.392
Model:                            WLS   Adj. R-squared:                  0.366
Method:                 Least Squares   F-statistic:                 1.999e+12
Date:                Fri, 24 May 2024   Prob (F-statistic):           4.51e-94
Time:                        01:53:03   Log-Likelihood:                   -inf
No. Observations:                 221   AIC:                               inf
Df Residuals:                     211   BIC:                               inf
Df Model:                           9                                         
Covariance Type:              cluster                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -0.5033      0.349     -1.