In [45]:
# import of packages
from proj03 import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.stats.weightstats import ttest_ind
from scipy import stats

# autoreload for easier debugging
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
# import data
baseline = pd.read_stata('data/baseline.dta')
bok_inflation = pd.read_stata('data/BOK_inflation.dta')
cleanpricedata_y1y2 = pd.read_stata('data/cleanPriceData_Y1Y2.dta')
intensity_obs_short = pd.read_stata('data/intensity_obs_short.dta')
lrfu_select_dataset = pd.read_stata('data/LRFU_select_dataset.dta')
ms1ms2_pooled = pd.read_stata('data/MS1MS2_pooled.dta')
repayment_datay1 = pd.read_stata('data/repayment_dataY1.dta')

In [47]:
# clean ms1ms2_pooled (drop if MS !=2, keep columns oafid and treatMS1MS2, group by oafid and take mean and rename) 
ms1ms2_pooled_clean = ms1ms2_pooled[ms1ms2_pooled['MS']==2]
ms1ms2_pooled_clean = ms1ms2_pooled_clean[['oafid', 'treatMS1MS2']]
ms1ms2_pooled_clean = ms1ms2_pooled_clean.groupby('oafid', as_index=False).mean()
ms1ms2_pooled_clean.rename(columns={'treatMS1MS2': 'treat13'}, inplace=True)
print(ms1ms2_pooled_clean.shape[0])
print(ms1ms2_pooled_clean.shape[0])


1019
1019


In [48]:
# clean baseline data (we assume they have mad an)
baseline['delta_base'] # is this already 1 - delta_base or why is it not named 'delta' but 'delta_base'??

0       0.042857
1       0.214286
2       0.142857
3       0.428571
4       0.357143
          ...   
1811    0.142857
1812    0.171429
1813    0.142857
1814    0.000000
1815    0.485714
Name: delta_base, Length: 1816, dtype: float32

In [59]:
# clean baseline data (the stata code indicates that the variables columns 'businessprofitmonth' and 'delta' should be kept, however they have already been renamed to 'businessprofitmonth_base' and 'delta_base')
base_cols = ['oafid', 'logtotcons_base', 'male', 'num_adults', 'num_schoolchildren', 'finished_primary',
                   'finished_secondary', 'cropland', 'num_rooms', 'schoolfees', 'totcons_base', 'logpercapcons_base',
                   'total_cash_savings_base', 'total_cash_savings_trimmed', 'has_savings_acct', 'taken_bank_loan',
                   'taken_informal_loan', 'liquidWealth', 'wagepay', 'businessprofitmonth_base', 'price_avg_diff_pct',
                   'price_expect_diff_pct', 'harvest2011', 'netrevenue2011', 'netseller2011', 'autarkic2011',
                   'maizelostpct2011', 'harvest2012', 'correct_interest', 'digit_recall', 'maizegiver', 'delta_base', 'treatment']
baseline_clean = baseline[base_cols].copy()

""" WE SURE ABOUT THIS? """
# we assume that the variable 'delta_base' does not already have the correct value thus the following is not commented out
baseline['delta_base'] = 1 - baseline['delta_base']

# rename columns
baseline_clean.columns = [col + '_base' if not col.endswith('_base') and col != 'oafid' and col != 'treatment' else col for col in baseline_clean.columns]
baseline_clean.rename(columns={'treatment': 'treatment2012'}, inplace=True)

# generate treat12 as bool for treatment and control in 2012
baseline_clean['treat12'] = baseline_clean['treatment2012'].apply(lambda x: x in ['T1', 'T2'])
baseline_clean.loc[baseline_clean['treatment2012'] == '', 'treat12'] = np.nan


  baseline_clean.loc[baseline_clean['treatment2012'] == '', 'treat12'] = np.nan


In [60]:
# merge baseline_clean and ms1ms2_pooled_clean on oafid
base_ms1ms2_pool = pd.merge(baseline_clean, ms1ms2_pooled_clean, on='oafid', how='outer', indicator=True)

# Drop rows that are only in the using dataset (equivalent to 'merge_base == 2' in Stata)
base_ms1ms2_pool = base_ms1ms2_pool[base_ms1ms2_pool['_merge'] != 'right_only']

# Generate bool in_sample_Y2
base_ms1ms2_pool['in_sample_Y2'] = (base_ms1ms2_pool['_merge'] == 'both')

""" WHY DO THIS – RIGHT_ONLY IS REMOVED SO NO DATA??? """
# Generate bool newin13
base_ms1ms2_pool['newin13'] = (base_ms1ms2_pool['_merge'] == 'right_only')

# Generate bool attrit13
base_ms1ms2_pool['attrit13'] = (base_ms1ms2_pool['_merge'] == 'left_only')

base_ms1ms2_pool.drop(columns=['_merge'], inplace=True)


In [61]:
base_ms1ms2_pool['schoolfees_base'] = base_ms1ms2_pool['schoolfees_base']*1000


# Variables list
vars_list = [
    "male_base", "num_adults_base", "num_schoolchildren_base", "finished_primary_base",
    "finished_secondary_base", "cropland_base", "num_rooms_base", "schoolfees_base",
    "totcons_base", "logpercapcons_base", "total_cash_savings_base",
    "total_cash_savings_trimmed_base", "has_savings_acct_base", "taken_bank_loan_base",
    "taken_informal_loan_base", "liquidWealth_base", "wagepay_base",
    "businessprofitmonth_base", "price_avg_diff_pct_base",
    "price_expect_diff_pct_base", "harvest2011_base", "netrevenue2011_base",
    "netseller2011_base", "autarkic2011_base", "maizelostpct2011_base",
    "harvest2012_base", "correct_interest_base", "digit_recall_base",
    "maizegiver_base"
]

""" AGAIN SHOULD NOT BE NEEDED AS RIGHT_ONLY IS REMOVED """
# Filter the DataFrame
df_filtered = base_ms1ms2_pool[base_ms1ms2_pool['newin13'] != 1]

# Function to perform t-tests
def t_test_by_group(df, var, group_var='treat12'):
    group1 = df[df[group_var] == 0][var].dropna()
    group2 = df[df[group_var] == 1][var].dropna()
    t_stat, p_val = stats.ttest_ind(group1, group2, equal_var=False)
    return group1.mean(), group2.mean(), len(group1) + len(group2), t_stat, p_val

# Applying t-tests and collecting results
results = []
for var in vars_list:
    try:
        control_mean, treat_mean, obs, t_stat, p_val = t_test_by_group(df_filtered, var)
        std_diff = (treat_mean - control_mean) / np.sqrt(((len(df_filtered[df_filtered['treat12'] == 0][var].dropna()) - 1) * np.std(df_filtered[df_filtered['treat12'] == 0][var].dropna(), ddof=1) ** 2 + (len(df_filtered[df_filtered['treat12'] == 1][var].dropna()) - 1) * np.std(df_filtered[df_filtered['treat12'] == 1][var].dropna(), ddof=1) ** 2) / (len(df_filtered[df_filtered['treat12'] == 0][var].dropna()) + len(df_filtered[df_filtered['treat12'] == 1][var].dropna()) - 2))
        results.append([var, treat_mean, control_mean, obs, std_diff, p_val])
    except:
        results.append([var, np.nan, np.nan, np.nan, np.nan, np.nan])

# Convert results to DataFrame for easier LaTeX conversion
results_df = pd.DataFrame(results, columns=['Variable', 'Treat Mean', 'Control Mean', 'Observations', 'Std Diff', 'P-value'])

# Convert schoolfees_base back to original scale
base_ms1ms2_pool['schoolfees_base'] = base_ms1ms2_pool['schoolfees_base']/1000

latex_output = results_df.to_latex(index=False, float_format="%.3f")
print(latex_output)



\begin{tabular}{lrrrrr}
\toprule
Variable & Treat Mean & Control Mean & Observations & Std Diff & P-value \\
\midrule
male_base & 0.296 & 0.334 & 1589 & -0.083 & 0.109 \\
num_adults_base & 3.004 & 3.196 & 1510 & -0.099 & 0.067 \\
num_schoolchildren_base & 2.998 & 3.072 & 1589 & -0.038 & 0.454 \\
finished_primary_base & 0.718 & 0.772 & 1490 & -0.122 & 0.019 \\
finished_secondary_base & 0.253 & 0.270 & 1490 & -0.039 & 0.460 \\
cropland_base & 2.441 & 2.398 & 1512 & 0.014 & 0.796 \\
num_rooms_base & 3.073 & 3.252 & 1511 & -0.072 & 0.219 \\
schoolfees_base & 27239.693 & 29813.631 & 1589 & -0.068 & 0.191 \\
totcons_base & 14970.862 & 15371.378 & 1437 & -0.032 & 0.550 \\
logpercapcons_base & 7.975 & 7.963 & 1434 & 0.019 & 0.721 \\
total_cash_savings_base & 5157.396 & 8021.499 & 1572 & -0.128 & 0.028 \\
total_cash_savings_trimmed_base & 4731.623 & 5389.836 & 1572 & -0.050 & 0.343 \\
has_savings_acct_base & 0.419 & 0.425 & 1589 & -0.012 & 0.815 \\
taken_bank_loan_base & 0.079 & 0.083 & 1589 & 

In [57]:
baseline_clean['totcons_base_base']

0        9429.213867
1        5113.437500
2       36049.988281
3        3791.185547
4       12032.489258
            ...     
1811             NaN
1812    26982.691406
1813    13421.428711
1814    13213.939453
1815    22471.937500
Name: totcons_base_base, Length: 1816, dtype: float32