# Load necessary packages

In [1]:
# import of packages
from proj03 import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.stats.weightstats import ttest_ind
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

# autoreload for easier debugging
%load_ext autoreload
%autoreload 2

# Import raw data

In [2]:
# import data
# Read each raw Stata (.dta) file into its own DataFrame. The paths are relative,
# so the notebook must be run from the directory that contains the `data/` folder.
# Later cells work on copies or derived frames of some of these (e.g. `baseline`
# and `ms1ms2_pooled`).
baseline = pd.read_stata('data/baseline.dta')
bok_inflation = pd.read_stata('data/BOK_inflation.dta')
cleanpricedata_y1y2 = pd.read_stata('data/cleanPriceData_Y1Y2.dta')
intensity_obs_short = pd.read_stata('data/intensity_obs_short.dta')
lrfu_select_dataset = pd.read_stata('data/LRFU_select_dataset.dta')
ms1ms2_pooled = pd.read_stata('data/MS1MS2_pooled.dta')
repayment_datay1 = pd.read_stata('data/repayment_dataY1.dta')

# Creating the Tables

## Create Table 1

We start by cleaning the ms1ms2_pooled and baseline data.

In [3]:
# Clean ms1ms2_pooled: keep only the MS == 2 rows and the two columns needed,
# then collapse to one row per oafid (mean of treatMS1MS2) and rename the
# collapsed column to treat13. Row counts are printed before and after.
ms2_rows = ms1ms2_pooled.loc[ms1ms2_pooled['MS'] == 2, ['oafid', 'treatMS1MS2']]
print(len(ms2_rows))

ms1ms2_pooled_clean = (
    ms2_rows
    .groupby('oafid', as_index=False)
    .mean()
    .rename(columns={'treatMS1MS2': 'treat13'})
)
print(len(ms1ms2_pooled_clean))


2993
1019


For the baseline data, note that some of the columns have already been renamed with the suffix `_base`, so we need to account for this. We assume, however, that the data have not been altered in any other way compared to what is done in the Stata `do` file.

In [4]:
# Clean baseline data. The Stata code keeps 'businessprofitmonth' and 'delta', but in
# this file they have already been renamed to 'businessprofitmonth_base' and 'delta_base'.
base_cols = ['oafid', 'logtotcons_base', 'male', 'num_adults', 'num_schoolchildren', 'finished_primary',
                   'finished_secondary', 'cropland', 'num_rooms', 'schoolfees', 'totcons_base', 'logpercapcons_base',
                   'total_cash_savings_base', 'total_cash_savings_trimmed', 'has_savings_acct', 'taken_bank_loan',
                   'taken_informal_loan', 'liquidWealth', 'wagepay', 'businessprofitmonth_base', 'price_avg_diff_pct',
                   'price_expect_diff_pct', 'harvest2011', 'netrevenue2011', 'netseller2011', 'autarkic2011',
                   'maizelostpct2011', 'harvest2012', 'correct_interest', 'digit_recall', 'maizegiver', 'delta_base', 'treatment']
baseline_clean = baseline[base_cols].copy()

# TODO(review): are we sure about this? We assume 'delta_base' does not already hold
# the transformed value, so the flip (1 - delta) is applied here.
# It is applied to the working copy, NOT to the raw `baseline` frame: flipping the raw
# frame would leave `baseline_clean` unchanged and would toggle the raw data back and
# forth every time this cell is re-run.
baseline_clean['delta_base'] = 1 - baseline_clean['delta_base']

# Add the '_base' suffix to every column that does not already carry it,
# except the id column and the treatment label.
baseline_clean.columns = [col + '_base' if not col.endswith('_base') and col != 'oafid' and col != 'treatment' else col for col in baseline_clean.columns]
baseline_clean = baseline_clean.rename(columns={'treatment': 'treatment2012'})

# Generate treat12: 1.0 for T1/T2, 0.0 otherwise, NaN where the treatment label is empty.
# The indicator is built as float from the start so that writing NaN does not hit a
# bool column (which raises a dtype warning in recent pandas).
treatment2012 = baseline_clean['treatment2012']
baseline_clean['treat12'] = treatment2012.isin(['T1', 'T2']).astype(float)
baseline_clean.loc[treatment2012 == '', 'treat12'] = np.nan


  baseline_clean.loc[baseline_clean['treatment2012'] == '', 'treat12'] = np.nan


Now we can merge the two datasets.

In [5]:
# Left-join the collapsed MS1/MS2 treatment indicator onto the cleaned baseline
# data by oafid, so every baseline row is kept.
base_ms1ms2_pool = baseline_clean.merge(ms1ms2_pooled_clean, on='oafid', how='left')

# NOTE: the commented-out block below mirrors the remaining Stata steps. We believe it
# is not needed for Table 1 ('in_sample_Y2' appears to be used only for Table F.1), so
# a plain left join is sufficient here.
# base_ms1ms2_pool = pd.merge(baseline_clean, ms1ms2_pooled_clean, on='oafid', how='outer', indicator=True)

# # Drop rows that are only in the using dataset (equivalent to 'merge_base == 2' in Stata)
# base_ms1ms2_pool = base_ms1ms2_pool[base_ms1ms2_pool['_merge'] != 'right_only']

# # Generate bool in_sample_Y2
# base_ms1ms2_pool['in_sample_Y2'] = (base_ms1ms2_pool['_merge'] == 'both')

# # TODO(review): why do this? 'right_only' rows were dropped above, so newin13 would be all False.
# # Generate bool newin13
# base_ms1ms2_pool['newin13'] = (base_ms1ms2_pool['_merge'] == 'right_only')

# # Generate bool attrit13
# base_ms1ms2_pool['attrit13'] = (base_ms1ms2_pool['_merge'] == 'left_only')

# base_ms1ms2_pool.drop(columns=['_merge'], inplace=True)


Lastly we can create Table 1.

In [6]:
# Table 1 input: a new frame with school fees multiplied by 1000; the merged
# frame itself is left unchanged.
df_tab1 = base_ms1ms2_pool.assign(
    schoolfees_base=base_ms1ms2_pool['schoolfees_base'] * 1000
)

# Baseline variables reported in Table 1, in table order.
vars_list = [
    "male_base",
    "num_adults_base",
    "num_schoolchildren_base",
    "finished_primary_base",
    "finished_secondary_base",
    "cropland_base",
    "num_rooms_base",
    "schoolfees_base",
    "totcons_base",
    "logpercapcons_base",
    "total_cash_savings_base",
    "total_cash_savings_trimmed_base",
    "has_savings_acct_base",
    "taken_bank_loan_base",
    "taken_informal_loan_base",
    "liquidWealth_base",
    "wagepay_base",
    "businessprofitmonth_base",
    "price_avg_diff_pct_base",
    "price_expect_diff_pct_base",
    "harvest2011_base",
    "netrevenue2011_base",
    "netseller2011_base",
    "autarkic2011_base",
    "maizelostpct2011_base",
    "harvest2012_base",
    "correct_interest_base",
    "digit_recall_base",
    "maizegiver_base",
]

# NOTE: the Stata code additionally filters on newin13 here. That step is skipped
# because the left join above never adds right-only rows, so there is nothing to drop.
# df_tab1 = base_ms1ms2_pool[base_ms1ms2_pool['newin13'] != True]

# Function to perform t-tests
def t_test_by_group(df, var, group_var='treat12'):
    """Compare `var` between the rows where `group_var` is 0 and where it is 1.

    Missing values of `var` are dropped within each group before testing, and
    the two samples are compared with an unequal-variance (Welch) t-test.

    Parameters
    ----------
    df : DataFrame holding both `var` and `group_var`.
    var : name of the column to compare.
    group_var : name of the 0/1 grouping column (default 'treat12').

    Returns
    -------
    tuple
        (mean of group 0, mean of group 1, total non-missing observations,
        t statistic, p-value).
    """
    control, treated = (
        df.loc[df[group_var] == level, var].dropna() for level in (0, 1)
    )
    welch = stats.ttest_ind(control, treated, equal_var=False)
    n_obs = len(control) + len(treated)
    return control.mean(), treated.mean(), n_obs, welch.statistic, welch.pvalue

# Apply the t-test to every Table 1 variable and collect one result row per variable.
results = []
for var in vars_list:
    control_mean, treat_mean, obs, t_stat, p_val = t_test_by_group(df_tab1, var)

    # Standardised difference: difference in means divided by the pooled standard deviation.
    # Group sizes are taken AFTER dropping missing values so that they match the sample the
    # variances are computed on (the standard deviation skips NaN, whereas a plain len() on
    # the unfiltered slice would also count the NaN rows).
    control_vals = df_tab1.loc[df_tab1['treat12'] == 0, var].dropna()
    treat_vals = df_tab1.loc[df_tab1['treat12'] == 1, var].dropna()
    n_control, n_treat = len(control_vals), len(treat_vals)
    pooled_var = (
        (n_control - 1) * control_vals.var(ddof=1)
        + (n_treat - 1) * treat_vals.var(ddof=1)
    ) / (n_control + n_treat - 2)
    std_diff = (treat_mean - control_mean) / np.sqrt(pooled_var)

    results.append([var, treat_mean, control_mean, obs, std_diff, p_val])

# Convert results to a DataFrame for easier LaTeX conversion
results_df = pd.DataFrame(results, columns=['Variable', 'Treat Mean', 'Control Mean', 'Observations', 'Std Diff', 'P-value'])


latex_table1 = results_df.to_latex(index=False, float_format="%.3f")
print(latex_table1)



\begin{tabular}{lrrrrr}
\toprule
Variable & Treat Mean & Control Mean & Observations & Std Diff & P-value \\
\midrule
male_base & 0.296 & 0.334 & 1589 & -0.083 & 0.109 \\
num_adults_base & 3.004 & 3.196 & 1510 & -0.099 & 0.067 \\
num_schoolchildren_base & 2.998 & 3.072 & 1589 & -0.038 & 0.454 \\
finished_primary_base & 0.718 & 0.772 & 1490 & -0.122 & 0.019 \\
finished_secondary_base & 0.253 & 0.270 & 1490 & -0.039 & 0.460 \\
cropland_base & 2.441 & 2.398 & 1512 & 0.014 & 0.796 \\
num_rooms_base & 3.073 & 3.252 & 1511 & -0.072 & 0.219 \\
schoolfees_base & 27239.693 & 29813.631 & 1589 & -0.068 & 0.191 \\
totcons_base & 14970.862 & 15371.378 & 1437 & -0.032 & 0.550 \\
logpercapcons_base & 7.975 & 7.963 & 1434 & 0.019 & 0.721 \\
total_cash_savings_base & 5157.396 & 8021.499 & 1572 & -0.128 & 0.028 \\
total_cash_savings_trimmed_base & 4731.623 & 5389.836 & 1572 & -0.050 & 0.343 \\
has_savings_acct_base & 0.419 & 0.425 & 1589 & -0.012 & 0.815 \\
taken_bank_loan_base & 0.079 & 0.083 & 1589 & 

## Creating table 2, 3 and 4

In [7]:
# Work on a copy so the raw pooled data stay untouched.
ms1ms2_pooled_tab2 = ms1ms2_pooled.copy()

# Rows with MS == 2 get special treatment in the two recodes below.
is_ms2 = ms1ms2_pooled_tab2['MS'] == 2

# For MS == 2 rows, the strata id becomes groupstrata shifted by the current maximum
# of strata_group; all other rows keep their strata_group.
strata_offset = ms1ms2_pooled_tab2['strata_group'].max()
ms1ms2_pooled_tab2['strata_group'] = np.where(
    is_ms2,
    ms1ms2_pooled_tab2['groupstrata'] + strata_offset,
    ms1ms2_pooled_tab2['strata_group'],
)

# For MS == 2 rows, oafid is replaced by fr_id.
ms1ms2_pooled_tab2['oafid'] = np.where(
    is_ms2,
    ms1ms2_pooled_tab2['fr_id'],
    ms1ms2_pooled_tab2['oafid'],
)

# Drop the existing trimmed value columns, then run trim_quantiles
# (imported from proj03) on each of the four columns in turn.
ms1ms2_pooled_tab2 = ms1ms2_pooled_tab2.drop(columns=['purchaseval_trim', 'salesval_trim'])

for trim_col in ['purchaseval', 'salesval', 'purchasequant', 'salesquant']:
    ms1ms2_pooled_tab2 = trim_quantiles(ms1ms2_pooled_tab2, trim_col)
    

## Creating Table 7

In [8]:
# Inspect the `subloc` column of the pooled data; it is used below to filter rows
# and as the cluster variable in the Table 7 regressions.
print(ms1ms2_pooled['subloc'])


0       11.0
1       11.0
2       11.0
3       11.0
4       11.0
        ... 
8111     NaN
8112     NaN
8113     NaN
8114     NaN
8115     NaN
Name: subloc, Length: 8116, dtype: float32


In [9]:
# Table 7 sample: a copy of the pooled data restricted to rows with a non-missing subloc.
ms1ms2_pooled_tab7 = ms1ms2_pooled.copy().dropna(subset=['subloc'])

# Placeholder columns for the treatment indicator (z) and its interaction with hi (z_hi);
# both are filled in per regression further down.
for placeholder in ('z', 'z_hi'):
    ms1ms2_pooled_tab7[placeholder] = pd.NA

In [12]:
treatments = ['treat12', 'treat13', 'treatMS1MS2']
dependent_vars = ['inventory_trim', 'netrevenue_trim', 'logtotcons_trim']

# Round dummies added to the regression, keyed by treatment variable.
formulas = {
    'treat12': 'Y1round2 + Y1round3',
    'treat13': 'Y2round2 + Y2round3',
    'treatMS1MS2': 'Y1round2 + Y1round3 + Y2round1 + Y2round2 + Y2round3'
    }

# Run one regression per (dependent variable, treatment) pair.
for dv in dependent_vars:
    for treat in treatments:
        # Stata automatically omits rows with missing values in a regression; here that
        # has to be done by hand. dropna returns a new frame, so the Table 7 data are
        # not modified. z is the treatment indicator and z_hi its interaction with hi.
        df = (
            ms1ms2_pooled_tab7
            .dropna(subset=[dv, treat, 'hi', 'subloc', 'interviewdate'])
            .assign(
                z=lambda d: d[treat],
                z_hi=lambda d: d[treat] * d['hi'],
            )
        )

        # Regression formula; the round dummies depend on the treatment.
        formula = f'{dv} ~ z + hi + z_hi + interviewdate + {formulas[treat]}'

        # OLS with standard errors clustered on subloc.
        model = smf.ols(formula, data=df).fit(cov_type='cluster', cov_kwds={'groups': df['subloc']})
        print(model.summary())

        # TODO: test the hypothesis 'z + z_hi = 0'
        # hypothesis = 'z + z_hi = 0'
        # print(model.t_test(hypothesis))



                            OLS Regression Results                            
Dep. Variable:         inventory_trim   R-squared:                       0.346
Model:                            OLS   Adj. R-squared:                  0.345
Method:                 Least Squares   F-statistic:                     369.6
Date:                Sun, 14 Apr 2024   Prob (F-statistic):           3.08e-16
Time:                        14:25:29   Log-Likelihood:                -9672.6
No. Observations:                3836   AIC:                         1.936e+04
Df Residuals:                    3829   BIC:                         1.940e+04
Df Model:                           6                                         
Covariance Type:              cluster                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept       183.3689    114.725      1.598