```{r}
# Setup: attach the packages used by this document and wire up Python support.
# `foreign` and `survey` are attached here; `reticulate` supplies the Python
# engine that is registered with knitr below.
library(foreign)
library(survey)
library(reticulate)
# NOTE(review): absolute path into one user's home directory — this is
# machine-specific and will need adjusting on any other machine.
options(rstudio.python.installationPath = "/Users/nathanielforde/mambaforge/envs")

# Route knitr `python` chunks through reticulate's engine.
knitr::knit_engines$set(python = reticulate::eng_python)
```

```{r}
library(SDAResources)
# Load the agstrat sample and attach a per-row stratum population size.
data(agstrat)

# Population size for each region code.
popsize_recode <- c("NC" = 1054, "NE" = 220, "S" = 1382, "W" = 422)

# Look the size up by region *label*. If `region` is a factor, indexing with it
# directly would use the integer level codes, which only gives the right answer
# when the vector above happens to be in level order; as.character() makes the
# lookup by name regardless of how `region` is stored.
agstrat$popsize <- popsize_recode[as.character(agstrat$region)]

# Export the augmented sample so the Python cells can read the same data.
write.csv(agstrat, "agstrat_py.csv")

head(agstrat)
```

```{r}

### Boxplots of acres92 (rescaled to millions) for each region of the sample
boxplot(
  acres92/10^6 ~ region,
  data = agstrat,
  ylab = "Millions of Acres",
  xlab = "Region"
)
```

```{r}

## Specify the stratified random sample design:
##   - id = ~1         : no clustering, elements are sampled directly
##   - strata = ~region: one stratum per region
##   - weights = ~strwt: sampling weight of each row
##   - fpc = ~popsize  : stratum population sizes, so that standard errors
##                       include the finite population correction
dstr <- svydesign(
  id = ~1,
  strata = ~region,
  weights = ~strwt,
  fpc = ~popsize,
  data = agstrat
)
summary(dstr)

```


### Survey Summary Statistics



```{r}
# Survey-weighted mean of acres92 under the design `dstr`.
smean <- svymean(x = ~acres92, design = dstr)
smean

```

```{r}

# Survey-weighted total of acres92 under the design `dstr`.
stotal <- svytotal(x = ~acres92, design = dstr)
stotal

```

```{r}
# Square root of the survey-weighted variance of acres92, i.e. a weighted
# standard deviation (the name `svar` is kept for compatibility).
# NOTE(review): sqrt() is applied to the whole svyvar result, so any SE printed
# next to the estimate presumably still refers to the variance — confirm.
svar <- sqrt(svyvar(~acres92, design = dstr, na.rm = TRUE))
svar

```

```{r}

# Weighted mean of acres92 within each region, keeping the variance estimates.
svyby(
  formula = ~acres92,
  by = ~region,
  design = dstr,
  FUN = svymean,
  keep.var = TRUE
)

```

```{r}
# Weighted variance of acres92 within each region, keeping the variance
# estimates of those statistics.
svyby(
  formula = ~acres92,
  by = ~region,
  design = dstr,
  FUN = svyvar,
  keep.var = TRUE
)

```

## Replicate Means and Standard Deviations in Python


In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Show floats with two decimal places when DataFrames are displayed.
pd.set_option('display.float_format', '{:.2f}'.format)

# Load the sample that the R chunk exported with write.csv.
agstrat_py = pd.read_csv('agstrat_py.csv')


def weighted_average(y, weights, axis=0):
    """Weighted mean of ``y`` along ``axis``, ignoring NaN entries of ``y``.

    The numerator sums ``y * weights`` with NaN products treated as zero; the
    denominator sums only the weights whose ``y`` value is not NaN, so missing
    observations contribute to neither.

    Parameters:
        y: values to average (array-like supporting elementwise arithmetic).
        weights: weights aligned with ``y``.
        axis: axis along which to reduce (default 0).

    Returns:
        ``sum(y * weights) / sum(weights)`` over the non-missing entries.
    """
    is_observed = ~np.isnan(y)
    weighted_sum = np.nansum(y * weights, axis=axis)
    observed_weight = (is_observed * weights).sum(axis=axis)
    return weighted_sum / observed_weight

def weighted_total(y, weights):
    """Weighted total of ``y``: the sum of ``y * weights``.

    NaN products are ignored, matching how ``weighted_average`` treats missing
    values. Using ``np.nansum`` (rather than ``np.sum``) makes this hold for
    NumPy arrays as well as pandas Series; ``np.sum`` would skip NaN for a
    Series but propagate it for an ndarray.

    Parameters:
        y: values to total (array-like supporting elementwise arithmetic).
        weights: weights aligned with ``y``.

    Returns:
        The scalar sum of the non-missing weighted values.
    """
    return np.nansum(y * weights)

def weighted_var(y, weights):
    """Weighted variance of ``y`` with an ``n / (n - 1)`` correction.

    Computes the weighted mean of the squared deviations from the weighted
    mean, scaled by ``n / (n - 1)``, where ``n`` is the number of non-missing
    observations in ``y``.

    Parameters:
        y: one-dimensional values (array-like supporting elementwise
            arithmetic).
        weights: weights aligned with ``y``.

    Returns:
        The weighted variance, or NaN when fewer than two non-missing
        observations are available (the correction is undefined there).
    """
    # Count only observed values: missing entries are already excluded from the
    # weighted averages, so they must not inflate n in the correction factor.
    n = int(np.count_nonzero(~np.isnan(y)))
    if n < 2:
        # n / (n - 1) would divide by zero (or be meaningless for n == 0).
        return np.nan
    yhat = weighted_average(y, weights)
    return weighted_average(((y - yhat) ** 2) * n / (n - 1), weights)

In [None]:
# One-row table of overall weighted statistics for acres92, using the strwt
# sampling weights.
summaries = pd.DataFrame({
    'weighted_mean': weighted_average(agstrat_py['acres92'], agstrat_py['strwt']),
    'weighted_total': weighted_total(agstrat_py['acres92'], agstrat_py['strwt']),
    # Standard deviation is the square root of the weighted variance; without
    # the sqrt this column would hold the variance despite its name.
    'weighted_std': np.sqrt(weighted_var(agstrat_py['acres92'], agstrat_py['strwt'])),
}, index = [1])


summaries

In [None]:
# Per-region weighted statistics of acres92, using the strwt sampling weights.
_by_region = agstrat_py.groupby('region')


def _per_region(stat):
    """Apply ``stat(acres92, strwt)`` to every region group."""
    return _by_region.apply(
        lambda grp: stat(grp['acres92'], grp['strwt']),
        include_groups=False,
    )


strata_mean = _per_region(weighted_average)
strata_total = _per_region(weighted_total)
strata_var = _per_region(weighted_var)

# Put the three per-region series side by side, one column per statistic.
strata_aggs = pd.concat([strata_mean, strata_total, strata_var], axis=1)
strata_aggs.columns = ['means', 'totals', 'var']
strata_aggs