# Reweighting Sample Data (with simulated data)

In [3]:
import pandas as pd
import numpy as np
from collections import Counter
from scipy import stats as stats
from scipy.stats import uniform, norm, expon


The code chunk below generates simulated data with 1000 rows.  
The population is composed of 4 groups (A, B, C, D).  
These groups are equally weighted in the population. (all proportions = 25%)  
The sample is generated with proportions: 28% A, 40% B, 12% C, 20% D.  

The groups differ in average age.  Mean ages: A = 30, B = 40, C = 60, D = 70.

In [4]:
# Simulation parameters.
# `pop` is each group's share of the population; `samp` is each group's
# (deliberately unrepresentative) share of the sample.
pop = {"A": .25, "B": .25, "C": .25, "D": .25}
avg_age = {"A": 30, "B": 40, "C": 60, "D": 70}
std_dev_age = 5

samp = {"A": .28, "B": .4, "C": .12, "D": .20}
samp_size = 1000
SEED = 42  # fixed seed so a fresh Restart & Run All reproduces the same draws

# Number of sampled rows per group, in the key order of `pop`.
group_sizes = [int(np.rint(samp[g] * samp_size)) for g in pop]

# One label and one mean age per row, laid out group by group.
group_labels = np.repeat(list(pop), group_sizes)
group_means = np.repeat([avg_age[g] for g in pop], group_sizes)

# Draw all ages in a single vectorized call: each row's age is normal around
# its own group's mean, with a common standard deviation.
ages = norm.rvs(loc=group_means, scale=std_dev_age,
                size=len(group_means), random_state=SEED)

samp_data = pd.DataFrame({'group': group_labels, 'age': ages})
t = ages.tolist()  # retained: the original cell exposed the drawn ages under this name

samp_data

Unnamed: 0,group,age
0,A,24.663602
1,A,27.692858
2,A,30.756850
3,A,40.641765
4,A,30.578077
...,...,...
995,D,63.110311
996,D,70.526809
997,D,64.118281
998,D,64.087023


The code chunk below creates a dataframe with rows summarizing the sample data.

In [9]:
# Summary table with one column per group and one row per quantity:
# population share, mean age, expected count, observed count, observed share.
n_sampled = len(samp_data)
observed_counts = Counter(samp_data['group'])

compare = pd.DataFrame(pop, index=['population_percent'])
compare.loc['avg_age'] = avg_age
# Count each group would have if the sample mirrored the population.
compare.loc['pop_expected'] = n_sampled * compare.loc['population_percent']
compare.loc['observed'] = observed_counts
compare.loc['observed_percent'] = compare.loc['observed'] / n_sampled
compare

Unnamed: 0,A,B,C,D
population_percent,0.25,0.25,0.25,0.25
avg_age,30.0,40.0,60.0,70.0
pop_expected,250.0,250.0,250.0,250.0
observed,280.0,400.0,120.0,200.0
observed_percent,0.28,0.4,0.12,0.2


Average age in population by group.

In [7]:
# Row of the comparison table holding each group's mean age.
compare.loc['avg_age', :]

A    30.0
B    40.0
C    60.0
D    70.0
Name: avg_age, dtype: float64

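Expected average age for the total population: the group mean ages weighted by each group's population share (with equal shares this is simply the mean of the group means).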

In [None]:
# Weight each group's mean age by its population share, so the result stays
# correct even if the shares in `population_percent` are not all equal.
np.average(compare.loc['avg_age'], weights=compare.loc['population_percent'])

50.0

Average age for sample by group.  
Note that group averages match population group averages closely.

In [None]:
# Mean sampled age within each group.
samp_data['age'].groupby(samp_data['group']).mean()

group
A    29.938735
B    40.429851
C    60.385834
D    70.579009
Name: age, dtype: float64

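Average age for total sample.  
Note that the average age is lower than the expected value of 50 for the population, because the younger groups (A and B) are over-represented in the sample and the older groups (C and D) are under-represented.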

In [None]:
# Unweighted mean age over every sampled row.
np.mean(samp_data['age'].to_numpy())

45.680701975224686

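Chi-Square test of goodness of fit, comparing the observed group counts in the sample with the counts expected from the population proportions. A very small p-value indicates that the sample's group mix differs from the population's.  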

In [None]:
# Goodness-of-fit test of observed group counts against the counts expected
# from the population shares; only the p-value is displayed.
gof_result = stats.chisquare(
    f_obs=compare.loc['observed'],
    f_exp=compare.loc['pop_expected'],
)
gof_result.pvalue

7.008082353145108e-37

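Calculate weights for the sample data: each group's weight is its population proportion divided by its observed proportion in the sample.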

In [None]:
# Weight per group = population share / observed sample share.
# Groups over-represented in the sample get a weight below 1; under-represented
# groups get a weight above 1.
weight_by_group = compare.loc['population_percent'] / compare.loc['observed_percent']

# Name the column explicitly instead of relying on how the DataFrame
# constructor treats an unnamed Series combined with `columns=`.
weights = weight_by_group.to_frame(name='weights')
weights

Unnamed: 0,weights
A,0.892857
B,0.625
C,2.083333
D,1.25


Adding weights to samp_data as new column.

In [None]:
# Attach each row's group weight as a `weights` column.
# Mapping through the group label (rather than joining) makes the cell safe to
# re-run: an existing `weights` column is replaced instead of causing a
# column-overlap error.
samp_data = samp_data.assign(weights=samp_data['group'].map(weights['weights']))
samp_data


Unnamed: 0,group,age,weights
0,A,37.370051,0.892857
1,A,28.966382,0.892857
2,A,31.378335,0.892857
3,A,34.652236,0.892857
4,A,29.980256,0.892857
...,...,...,...
995,D,67.038296,1.250000
996,D,66.682873,1.250000
997,D,77.017825,1.250000
998,D,73.254424,1.250000


Weighted average for total sample.  
Note that weighted average closely matches the expected value for the population.

In [None]:
# Mean age over the sample with every row scaled by its group weight.
np.average(samp_data['age'], weights=samp_data['weights'])

50.333357352286725

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>