# Population and Sample

In [2]:
# Import libraries
import pandas as pd 
import numpy as np

## Population

In [3]:
# Build the population: a single-column DataFrame holding 50 random integers.
# np.random.randint excludes the upper bound, so values fall in 1..999.
df = pd.DataFrame({'Population': np.random.randint(1, 1000, size=50)})

# Preview the first five rows.
df.head()

Unnamed: 0,Population
0,357
1,234
2,235
3,29
4,847


## Sampling 

In [5]:
# Draw two samples of size 10 from the population column.
#   replace=True  -> with replacement: the same row may be drawn more than once.
#   replace=False -> without replacement: each row is drawn at most once.
sample_with_replacement = df['Population'].sample(n=10, replace=True)
sample_without_replacement = df['Population'].sample(n=10, replace=False)

# Show both samples with one blank line between them.
print(sample_with_replacement, sample_without_replacement, sep="\n\n")

46    106
7     627
4     847
35    340
14    875
6     940
40    351
2     235
46    106
28    478
Name: Population, dtype: int32

24    798
44    936
34    222
2     235
16    731
14    875
41    699
45    643
11    693
28    478
Name: Population, dtype: int32


## Parameters and Statistics 

In [16]:
# Population parameters. ddof=0 makes var/std divide by N (not N-1),
# which is the definition used when the data is the entire population.
population_values = df['Population']
mean_population = population_values.mean()
var_population = population_values.var(ddof=0)
std_population = population_values.std(ddof=0)

# Report each parameter rounded to two decimals.
for label, value in (
    ("Mean", mean_population),
    ("Variance", var_population),
    ("Standard Deviation", std_population),
):
    print(f"{label}: {value:.2f}")

Mean: 497.28
Variance: 70956.44
Standard Deviation: 266.38


In [17]:
# Draw 50 values without replacement.
# NOTE: the population in this notebook is created with 50 values, so this draw
# is a reshuffle of the whole population; its mean therefore equals the
# population mean and only the ddof choice changes the variance / std.
sample_50 = df['Population'].sample(n=50, replace=False)

# Sample statistics. ddof=1 divides by n-1 (the usual sample estimator).
mean_sample, var_sample, std_sample = (
    sample_50.mean(),
    sample_50.var(ddof=1),
    sample_50.std(ddof=1),
)

# Report each statistic rounded to two decimals.
print(f"Sample Mean: {mean_sample:.2f}")
print(f"Sample Variance: {var_sample:.2f}")
print(f"Sample Standard Deviation: {std_sample:.2f}")

Sample Mean: 497.28
Sample Variance: 72404.53
Sample Standard Deviation: 269.08


## Demo why ddof = 1 for sampling 

In [23]:
# Why ddof=1 is used for samples:
# repeatedly draw samples of size 50 (with replacement) and compare the average
# of the variance computed with divisor n (ddof=0) against divisor n-1 (ddof=1).

sample_length = 1000  # number of repeated samples drawn for each estimator
population_values = df['Population']

sample_variance_collection0 = [
    population_values.sample(50, replace=True).var(ddof=0)
    for _ in range(sample_length)
]
sample_variance_collection1 = [
    population_values.sample(50, replace=True).var(ddof=1)
    for _ in range(sample_length)
]

# Average each collection of variance estimates.
average_var_n = pd.Series(sample_variance_collection0).mean()
average_var_n_minus_1 = pd.Series(sample_variance_collection1).mean()

print(f"Population Variance: {var_population:.2f}")
print(f"Average of sample variance with n is: {average_var_n:.2f}")
print(f"Average of sample variance with n-1 is: {average_var_n_minus_1:.2f}")

# Dividing by n systematically underestimates the population variance because
# the sample mean is itself estimated from the same data (one degree of freedom
# is lost). Dividing by n-1 corrects for this, so on average the ddof=1
# estimate is expected to land closer to the population variance.

Population Variance: 70956.44
Average of sample variance with n is: 69464.21
Average of sample variance with n-1 is: 71224.76


# Variation of Sample

In [26]:
# import libraries 
import pandas as pd 
import numpy as np 
from scipy.stats import norm 
import matplotlib.pyplot as plt
%matplotlib inline

## Sample mean and SD keep changing, but always within a certain range

In [35]:
# One sample of 50 draws from a normal distribution with mean 10 and
# standard deviation 5, stored as a single-column DataFrame (column label 0).
Fstsample = pd.DataFrame(np.random.normal(loc=10, scale=5, size=50))

# Sample statistics; ddof=1 gives the n-1 (sample) standard deviation.
first_column = Fstsample[0]
sample_mean, sample_std = first_column.mean(), first_column.std(ddof=1)

# Re-running this cell gives different values each time, since the draw is random.
print('Sample Mean:', sample_mean)
print('Sample Standard Deviation:', sample_std)

Sample Mean: 10.583994075050677
Sample Standard Deviation: 5.686576391912332


## Empirical Distribution of mean

In [None]:
# Empirical distribution of the sample mean: draw 10000 independent samples of
# size 30 from a normal distribution (mean 10, std 5) and keep each sample's mean.
meanlist = [
    pd.DataFrame(np.random.normal(10, 5, size=30))[0].mean()
    for _ in range(10000)
]

In [None]:
# Wrap the collected sample means in a DataFrame so they can be plotted.
collection = pd.DataFrame({'meanlist': meanlist})

In [None]:
# Histogram of the sample means, scaled so the bar areas integrate to 1.
# `density=True` replaces the old `normed=1` keyword, which matplotlib
# deprecated in 2.1 and removed in 3.1 (passing it now raises an error).
collection['meanlist'].hist(bins=100, density=True, figsize=(15,8));

## Sampling from arbitrary distribution

In [None]:
# Central limit theorem demo: when the sample size is large enough, the
# distribution of the sample mean is approximately normal, even though the
# population itself is not.
# apop is a non-normal population containing only 0s and 1s. Try changing
# sample_size and re-running to see how the shape of the distribution of the
# sample mean depends on it.
sample_size = 100
apop = pd.DataFrame([1, 0, 1, 0, 1])

# 10000 repetitions: draw sample_size values with replacement, keep the mean.
samplemeanlist = [
    apop[0].sample(sample_size, replace=True).mean()
    for _ in range(10000)
]

# Collect the sample means in a DataFrame and plot their histogram, scaled so
# the bar areas integrate to 1.
# `density=True` replaces the old `normed=1` keyword, which matplotlib
# deprecated in 2.1 and removed in 3.1 (passing it now raises an error).
acollec = pd.DataFrame()
acollec['meanlist'] = samplemeanlist
acollec.hist(bins=100, density=True, figsize=(15,8));