In [1]:
!pip install pandas
!pip install statsmodels pandas
!pip install python-docx

Collecting numpy>=1.17
  Downloading numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl (19.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
daal4py 2021.6.0 requires daal==2021.4.0, which is not installed.
numba 0.55.1 requires numpy<1.22,>=1.18, but you have numpy 1.24.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.24.4
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m10.3 MB/s[0m eta [36

In [3]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from docx import Document

In [15]:
# Load the replication dataset from its CSV export
csv_path = '/Users/nguyenthao/Documents/UNI_MATERIALS/PRL/Replication/DynamicsDiscrimination/Replication_package/DynamicsDiscrimination_new.csv'
data = pd.read_csv(csv_path)

# Preview the first rows as a quick sanity check of the import
print(data.head())

# Question x Gender interaction regressor (element-wise product of the two columns)
data['interaction_answ'] = data['Question'].mul(data['Gender'])

# Empty long-format results table: one row per (regression, coefficient) pair
result_columns = ['Regression', 'Variable', 'Coefficient', 'Standard Error']
results = pd.DataFrame(columns=result_columns)

# Function to add model results to DataFrame
def add_model_results(model, model_name):
    """Append one row per estimated parameter of `model` to the global `results`.

    Each appended row is [model_name, parameter name, coefficient, standard error],
    with the coefficient read from `model.params` and the standard error from
    `model.bse`.
    """
    estimates = model.params
    std_errors = model.bse
    for term in estimates.index:
        # len(results) is the next free integer label, so this appends a row
        next_row = len(results)
        results.loc[next_row] = [model_name, term, estimates[term], std_errors[term]]

# Estimation samples, named once so each specification reads as "formula on sample"
answers = data[data['Question'] == 0]
low_rep_questions = data[(data['Question'] == 1) & (data['HighRep'] == 0)]
low_rep = data[data['HighRep'] == 0]

# (formula, sample, label stored in `results`) for each regression, in table order
table1_specs = [
    ('ChangeRep ~ Gender', answers,
     'ChangeRep ~ Gender (Question==0)'),
    ('Updown ~ Gender', answers,
     'Updown ~ Gender (Question==0)'),
    ('ChangeRep ~ Gender', low_rep_questions,
     'ChangeRep ~ Gender (Question==1 & HighRep==0)'),
    ('Updown ~ Gender', low_rep_questions,
     'Updown ~ Gender (Question==1 & HighRep==0)'),
    ('ChangeRep ~ Gender + Question + interaction_answ', low_rep,
     'ChangeRep ~ Gender + Question + interaction_answ (HighRep==0)'),
    ('Updown ~ Gender + Question + interaction_answ', low_rep,
     'Updown ~ Gender + Question + interaction_answ (HighRep==0)'),
]

# Fit every OLS specification and record its coefficients / standard errors
fitted_models = []
for formula, sample, label in table1_specs:
    fit = smf.ols(formula, data=sample).fit()
    add_model_results(fit, label)
    fitted_models.append(fit)

# Keep the individual fitted models available under their usual names
model1, model2, model3, model4, model5, model6 = fitted_models

# Write the long-format results to CSV (without the index column)
results.to_csv('regression_results.csv', index=False)

# Build a Word table of the regression results: one row per variable,
# one column per regression, plus a leading column for the variable names.
doc = Document()
doc.add_heading('Regression Results', level=1)

# Row / column labels, in order of first appearance in `results`
variables = results['Variable'].unique()
regressions = results['Regression'].unique()
num_regressions = len(regressions)

# One extra row for the header and one extra column for the variable names.
# Without the extra column, the first regression's estimates are written into
# the same cells as the variable names and overwrite them.
table = doc.add_table(rows=len(variables) + 1, cols=num_regressions + 1)
table.style = 'Table Grid'

# Header row: corner label, then one regression per column (columns 1..n)
table.cell(0, 0).text = 'Variable'
for j, regression in enumerate(regressions, start=1):
    table.cell(0, j).text = regression

# Body: variable name in column 0, "coefficient\n(standard error)" in the rest.
# Cells stay empty when a regression does not include the variable.
for i, variable in enumerate(variables, start=1):
    table.cell(i, 0).text = variable
    for j, regression in enumerate(regressions, start=1):
        row = results[(results['Variable'] == variable) & (results['Regression'] == regression)]
        if not row.empty:
            coef = row['Coefficient'].values[0]
            se = row['Standard Error'].values[0]
            table.cell(i, j).text = f"{coef:.4f}\n({se:.4f})"

# Save the Word document
doc.save('Table1.docx')

# Display the results in the console
print(results)

  upvotes Downvotes  Updown  ChangeRep  HighRep  Gender  Question  Unnamed: 7  \
0       1         0     1.0        5.0      1.0     1.0       1.0         NaN   
1       5         0     5.0       25.0      1.0     0.0       1.0         NaN   
2       0         0     0.0        0.0      1.0     0.0       1.0         NaN   
3       1         0     1.0        5.0      1.0     1.0       1.0         NaN   
4       0         0     0.0        0.0      1.0     1.0       1.0         NaN   

   Unnamed: 8  Unnamed: 9                              Unnamed: 10  
0         NaN         NaN                                      NaN  
1         NaN         NaN                     Variable Definitions  
2         NaN         NaN                                      NaN  
3         NaN         NaN      Upvotes = Number of upvotes on post  
4         NaN         NaN  Downvotes = Number of downvotes on post  
                                           Regression          Variable  \
0                    Cha

In [16]:
# HighRep x Gender interaction regressor (element-wise product of the two columns)
data['interaction_rep'] = data['HighRep'].mul(data['Gender'])

# Force 'upvotes' to a numeric dtype; entries that cannot be parsed become NaN
data['upvotes'] = pd.to_numeric(data['upvotes'], errors='coerce')

# Indicator column: 1 where 'upvotes' is strictly positive, 0 everywhere else.
# NOTE(review): NaN upvotes compare as False and are therefore coded 0 rather
# than missing - confirm this is the intended treatment.
data['binary'] = (data['upvotes'] > 0).astype('int64')

# Fresh, empty long-format results table for this set of regressions
result_columns = ['Regression', 'Variable', 'Coefficient', 'Standard Error']
results = pd.DataFrame(columns=result_columns)

# Function to add model results to DataFrame
def add_model_results(model, model_name):
    """Append one row per estimated parameter of `model` to the global `results`.

    Each appended row is [model_name, parameter name, coefficient, standard error],
    with the coefficient read from `model.params` and the standard error from
    `model.bse`.
    """
    estimates = model.params
    std_errors = model.bse
    for term in estimates.index:
        # len(results) is the next free integer label, so this appends a row
        next_row = len(results)
        results.loc[next_row] = [model_name, term, estimates[term], std_errors[term]]

# Estimation samples, named once so each specification reads as "formula on sample"
high_rep_questions = data[(data['Question'] == 1) & (data['HighRep'] == 1)]
questions = data[data['Question'] == 1]

# (formula, sample, label stored in `results`) for each regression, in table order
table2_specs = [
    ('ChangeRep ~ Gender', high_rep_questions,
     'ChangeRep ~ Gender (Question==1 & HighRep==1)'),
    ('Updown ~ Gender', high_rep_questions,
     'Updown ~ Gender (Question==1 & HighRep==1)'),
    ('ChangeRep ~ Gender + HighRep + interaction_rep', questions,
     'ChangeRep ~ Gender + HighRep + interaction_rep (Question==1)'),
    ('Updown ~ Gender + HighRep + interaction_rep', questions,
     'Updown ~ Gender + HighRep + interaction_rep (Question==1)'),
    ('binary ~ Gender + HighRep + interaction_rep', questions,
     'binary ~ Gender + HighRep + interaction_rep (Question==1)'),
]

# Fit every OLS specification and record its coefficients / standard errors
fitted_models = []
for formula, sample, label in table2_specs:
    fit = smf.ols(formula, data=sample).fit()
    add_model_results(fit, label)
    fitted_models.append(fit)

# Keep the individual fitted models available under their usual names
model1, model2, model3, model4, model5 = fitted_models

# Write the long-format results to CSV (without the index column)
results.to_csv('regression_results_Table2.csv', index=False)

# Build a Word table of the regression results: one row per variable,
# one column per regression, plus a leading column for the variable names.
doc = Document()
doc.add_heading('Regression Results', level=1)

# Row / column labels, in order of first appearance in `results`
variables = results['Variable'].unique()
regressions = results['Regression'].unique()
num_regressions = len(regressions)

# One extra row for the header and one extra column for the variable names.
# Without the extra column, the first regression's estimates are written into
# the same cells as the variable names and overwrite them.
table = doc.add_table(rows=len(variables) + 1, cols=num_regressions + 1)
table.style = 'Table Grid'

# Header row: corner label, then one regression per column (columns 1..n)
table.cell(0, 0).text = 'Variable'
for j, regression in enumerate(regressions, start=1):
    table.cell(0, j).text = regression

# Body: variable name in column 0, "coefficient\n(standard error)" in the rest.
# Cells stay empty when a regression does not include the variable.
for i, variable in enumerate(variables, start=1):
    table.cell(i, 0).text = variable
    for j, regression in enumerate(regressions, start=1):
        row = results[(results['Variable'] == variable) & (results['Regression'] == regression)]
        if not row.empty:
            coef = row['Coefficient'].values[0]
            se = row['Standard Error'].values[0]
            table.cell(i, j).text = f"{coef:.4f}\n({se:.4f})"

# Save the Word document
doc.save('Table2.docx')

# Display the results in the console
print(results)

                                           Regression         Variable  \
0       ChangeRep ~ Gender (Question==1 & HighRep==1)        Intercept   
1       ChangeRep ~ Gender (Question==1 & HighRep==1)           Gender   
2          Updown ~ Gender (Question==1 & HighRep==1)        Intercept   
3          Updown ~ Gender (Question==1 & HighRep==1)           Gender   
4   ChangeRep ~ Gender + HighRep + interaction_rep...        Intercept   
5   ChangeRep ~ Gender + HighRep + interaction_rep...           Gender   
6   ChangeRep ~ Gender + HighRep + interaction_rep...          HighRep   
7   ChangeRep ~ Gender + HighRep + interaction_rep...  interaction_rep   
8   Updown ~ Gender + HighRep + interaction_rep (Q...        Intercept   
9   Updown ~ Gender + HighRep + interaction_rep (Q...           Gender   
10  Updown ~ Gender + HighRep + interaction_rep (Q...          HighRep   
11  Updown ~ Gender + HighRep + interaction_rep (Q...  interaction_rep   
12  binary ~ Gender + HighRep + intera