# Core Statistics Using Python
### Hana Choi, Simon Business School, University of Rochester


# Estimators: mean, var, sd, cov, cor in Python


## Topics covered

- Example 1: Survey
- Example 2: Consumers' willingness to pay
- Example 3: Stocks

## Here are the packages/modules we need for this notebook

In [None]:
import pandas as pd
import numpy as np

# Example 1: Survey

- Draws from a Bernoulli distribution with mean 0.5 (i.e. the probability of 1 is 0.5 and the probability of 0 is 0.5).
- The underlying population is "infinite".
- Population mean = p = 0.5, and population variance = p(1-p) = 0.5*0.5 = 0.25

## Data: Sample of 1000 observations

In [None]:
# Simulate the survey: 1000 draws, each either 0 or 1.
# np.random.choice picks uniformly when no probabilities are given, so each
# value has probability 0.5.

# Possible responses
values = [0, 1]

# Draw 1000 observations with replacement
survey_sample = np.random.choice(values, size=1000, replace=True)

# Display the first 20 observations.
# A slice excludes its end index, so [0:20] covers positions 0 through 19.
print(survey_sample[0:20])

## Sample mean

In [None]:
# Sample mean on subsamples of increasing size, then on the full sample.
# A slice excludes its end index: [0:10] is the first 10 observations and
# [10:20] is the next 10, so each label matches the number of observations used.
print(np.mean(survey_sample[0:10]), ": First 10 observations")
print(np.mean(survey_sample[10:20]), ": Second 10 observations")
print("----")
print(np.mean(survey_sample[0:50]), ": First 50 observations")
print(np.mean(survey_sample[50:100]), ": Second 50 observations")
print("----")
print(np.mean(survey_sample[0:200]), ": First 200 observations")
print(np.mean(survey_sample[0:400]), ": First 400 observations")
print(np.mean(survey_sample[0:750]), ": First 750 observations")
print("----")
print(np.mean(survey_sample), ": Full sample")

## Sample variance

In [None]:
# Sample variance on subsamples of increasing size, then on the full sample.
# ddof=1 divides by (n - 1), giving the sample-variance estimator; np.var's
# default (ddof=0) divides by n instead.
# A slice excludes its end index: [0:10] is the first 10 observations and
# [10:20] is the next 10, so each label matches the number of observations used.
print(np.var(survey_sample[0:10], ddof=1), ": First 10 observations")
print(np.var(survey_sample[10:20], ddof=1), ": Second 10 observations")
print("----")
print(np.var(survey_sample[0:50], ddof=1), ": First 50 observations")
print(np.var(survey_sample[50:100], ddof=1), ": Second 50 observations")
print("----")
print(np.var(survey_sample[0:200], ddof=1), ": First 200 observations")
print(np.var(survey_sample[0:400], ddof=1), ": First 400 observations")
print(np.var(survey_sample[0:750], ddof=1), ": First 750 observations")
print("----")
print(np.var(survey_sample, ddof=1), ": Full sample")

# Example 2: Consumers' Willingness to Pay (WTP)

- Drawn from a normal distribution with mean 100 and sd 10.
- The underlying population is "infinite".

## Data: Sample of 1000 observations

In [None]:
# Simulate willingness to pay: 1000 draws from N(100, 10^2)

# Mean and standard deviation of the normal distribution being sampled
mean = 100
std_dev = 10

# Generate 1000 random observations
wtp = np.random.normal(loc=mean, scale=std_dev, size=1000)

# Display the first 20 observations.
# A slice excludes its end index, so [0:20] covers positions 0 through 19.
print(wtp[0:20])

## Sample mean and standard deviation

In [None]:
# Sample mean and sample standard deviation of WTP on subsamples of
# increasing size, then on the full sample.
# ddof=1 divides by (n - 1), giving the sample standard deviation; np.std's
# default (ddof=0) divides by n instead.
# A slice excludes its end index, so [0:10] is exactly the first 10 observations.

# First 10 observations
print("First 10 observations")
print( np.mean(wtp[0:10]), np.std(wtp[0:10], ddof=1) )
print("----")

# First 20 observations
print("First 20 observations")
print( np.mean(wtp[0:20]), np.std(wtp[0:20], ddof=1) )
print("----")

# First 100 observations
print("First 100 observations")
print( np.mean(wtp[0:100]), np.std(wtp[0:100], ddof=1) )
print("----")

# Full sample
print("Full sample")
print( np.mean(wtp), np.std(wtp, ddof=1) )

# Example 3: Stocks

## Data: stockmarket.csv

In [None]:
# Three ways to load stockmarket.csv into a DataFrame; only Method 2 is active.

# Method 1: Keep the data file in your working directory and read it by name
# stocks = pd.read_csv('stockmarket.csv')

# Method 2: Give Python the full path to the data file "explicitly".
# The path below is the author's own; replace it with yours.
stocks_path = "/Users/hanachoi/Dropbox/teaching/core_statistics/Data/stockmarket.csv"
stocks = pd.read_csv(stocks_path)

# Method 3: Read the dataset straight from the web
# stocks = pd.read_csv("http://hanachoi.github.io/datasets/stockmarket.csv")

# Show the first few rows of the DataFrame
stocks.head()

## S&P 500

In [None]:
# Sample mean and standard deviation of the column named SP500,
# selected by its column name
sp500_col = stocks["SP500"]
sp500_mean = sp500_col.mean()
sp500_sd = sp500_col.std()

print(sp500_mean, sp500_sd)

## Nasdaq

In [None]:
# Sample mean and standard deviation of Nasdaq, selected by position.
# Note: positions start from 0, so position 2 is the third column.
nasdaq_col = stocks.iloc[:, 2]
nasdaq_mean = nasdaq_col.mean()
nasdaq_sd = nasdaq_col.std()

print(nasdaq_mean, nasdaq_sd)

## Covariance and correlation

In [None]:
# Variance-covariance matrix between SP500 and Nasdaq, printed twice to show
# two ways of selecting the columns.

# Selection by column name
pair_by_name = stocks[['SP500', 'Nasdaq']]
print(pair_by_name.cov())

# Selection by column position (positions 1 and 2)
pair_by_position = stocks.iloc[:, [1, 2]]
print(pair_by_position.cov())

In [None]:
# Correlation matrix between SP500 and Nasdaq, printed twice to show two ways
# of selecting the columns.

# Selection by column name
corr_by_name = stocks[['SP500', 'Nasdaq']].corr()
print(corr_by_name)

# Selection by column position (positions 1 and 2)
corr_by_position = stocks.iloc[:, [1, 2]].corr()
print(corr_by_position)