<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Review CLT, Confidence Intervals, and Hypothesis Testing


---

### Read in the housing data (code provided).

You can find the original data [here](https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data).

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [6]:
variable_descriptions = {
    "CRIM": "Per capita crime rate by town",
    "ZN": "Proportion of residential land zoned for lots over 25,000 sq.ft.",
    "INDUS": "Proportion of non-retail business acres per town",
    "CHAS": "Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)",
    "NOX": "Nitric oxides concentration (parts per 10 million)",
    "RM": "Average number of rooms per dwelling",
    "AGE": "Proportion of owner-occupied units built prior to 1940",
    "DIS": "Weighted distances to five Boston employment centres",
    "RAD": "Index of accessibility to radial highways",
    "TAX": "Full-value property-tax rate per $10,000",
    "PTRATIO": "Pupil-teacher ratio by town",
    "B": "1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town",
    "LSTAT": "% lower status of the population",
    "MEDV": "Median value of owner-occupied homes in $1000's"
}


In [83]:
#Importing Raw Data from the url
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
raw_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.00632,18.00,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3
1,396.90000,4.98,24.00,,,,,,,,
2,0.02731,0.00,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8
3,396.90000,9.14,21.60,,,,,,,,
4,0.02729,0.00,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8
...,...,...,...,...,...,...,...,...,...,...,...
1007,396.90000,5.64,23.90,,,,,,,,
1008,0.10959,0.00,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0
1009,393.45000,6.48,22.00,,,,,,,,
1010,0.04741,0.00,11.93,0.0,0.573,6.030,80.8,2.5050,1.0,273.0,21.0


In [82]:
#Realigning the data into 14 columns for the ODD numbered Rows
new_df = pd.DataFrame()

for i in range(len(raw_df)):
    if i % 2 == 1:
        previous_row = raw_df.iloc[i - 1:i :].reset_index(drop=True)
        current_row = raw_df.iloc[i:i + 1, :3].reset_index(drop=True)
        combined_row = pd.concat([previous_row, current_row], axis=1)
        new_df = pd.concat([new_df, combined_row], ignore_index=True)

new_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,0.1,1.1,2.1
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,396.9,4.98,24.0,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.00,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.00,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.00,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.00,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.00,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.00,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.00,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.00,21.0,393.45,6.48,22.0


In [98]:
#Renaming the columns into the appropriate Names based on the dictionary keys
titles = []
for key in variable_descriptions.keys():
    titles.append(key)
new_df.columns = titles
new_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,396.9,4.98,24.0,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.00,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.00,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.00,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.00,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.00,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.00,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.00,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.00,21.0,393.45,6.48,22.0


In [99]:
#For some reason row 0 data for RAD, TAX PTRATIO is wrong so will be directly reimporting it from the raw data
new_df.iloc[0:1, 0:11] = raw_df.iloc[0:1, 0:11]
new_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [101]:
#Setting variables for the question
NOX = new_df['NOX']
AGE = new_df['AGE']

### 1. Find the mean, standard deviation, and the standard error of the mean for variable `AGE`

In [126]:
# A:
from scipy import stats
mean_AGE = np.mean(AGE)
sd_AGE = np.std(AGE)
sem_AGE = stats.sem(AGE)
print("Mean of AGE Column: ", mean_AGE)
print("Standard Deviation of AGE Column: ", sd_AGE)
print("Standard Error of the Mean (SEM): ", sem_AGE)

Mean of AGE Column:  68.57490118577076
Standard Deviation of AGE Column:  28.121032570236885
Standard Error of the Mean (SEM):  1.2513695252583041


In [127]:
# scipy standard error function
from scipy.stats import sem
sem(AGE)

1.2513695252583041

### 2. Generate a 90%, 95%, and 99% confidence interval for `AGE`

You can use the `scipy.stats.t.interval` function to calculate confidence interval range.

```python
# Endpoints of the range that contains alpha percent of the distribution
stats.t.interval(confidence, df, loc=0, scale=1)	
```

Arguments:
- `confidence` = confidence level, between 0 and 1
- `df` = the degrees of freedom, will be the length of the vector -1.
- `loc` = the mean of the t-distribution (your point estimate - mean of the variable)
- `scale` = the standard deviation of the t-distribution (the standard error of your sample mean)

**Interpret the results from all three confidence intervals.**

In [128]:
from scipy.stats import t

In [129]:
# A: 
print('90 %: ',np.mean(AGE)+(stats.norm().ppf([0.05,0.95]))*sem_AGE)
print('95 %: ',np.mean(AGE)+(stats.norm().ppf([0.025,0.975]))*sem_AGE)
print('99 %: ',np.mean(AGE)+(stats.norm().ppf([0.005,0.995]))*sem_AGE)

90 %:  [66.51658148 70.63322089]
95 %:  [66.12226198 71.02754039]
99 %:  [65.35158689 71.79821548]


### 3. Did you rely on the Central Limit Theorem in question 2? Why or why not? Explain.

# A:
## We are using SEM and z scores to calculate the confidence interval which does not directly rely on the CLT. It relies on normal distribution. In our case the calculation assumes that the distribution of age is approximately normal and that the sample size is large enough for the normal approximation to be valid

### 4. For the variable `NOX`, generate a 95% confidence interval and interpret it.

In [131]:
# A:
mean_NOX = np.mean(NOX)
sd_NOX = np.std(NOX)
sem_NOX = sem(NOX)
print("Mean of NOX: ", mean_NOX)
print("SD of NOX: ", sd_NOX)
print("SEM of NOX: ", sem_NOX)


Mean of NOX:  0.5546950592885376
SD of NOX:  0.11576311540656153
SEM of NOX:  0.005151391024028489


In [132]:
print('95 %: ',mean_NOX+(stats.norm().ppf([0.025,0.975]))*sem_NOX)

95 %:  [0.54459852 0.5647916 ]


### 5. For the variable `NOX`, we are going to test the hypothesis that the (true) mean is equal to the median in the sample

In this case, we are performing the hypothesis test to test the mean based on a single sample.
These are the steps:
1. Define hypothesis
2. Set alpha (Let alpha = 0.05)
3. Calculate point estimate
4. Calculate test statistic
5. Find the p-value
6. Interpret results

In [133]:
# A:
## Step 1: Define hypotheses.
### H_0: mu_NOX = M_NOX
### H_A: mu_NOX != M_NOX

## Step 2: alpha = 0.05.
alpha = 0.05

## Step 3: Calculate point estimate.
sample_mean = NOX.mean()
sample_median = 0.54
sample_std = NOX.std()
sample_size = len(NOX)

## Step 4: Calculate test statistic.
t_statistic = (sample_mean - sample_median)/(sample_std/sample_size**0.5)

## Step 5: Find p-value.
## t.sf is survival function, which is 1-cdf at a given value 
## (proportion of values at least as extreme as...)
p_value = t.sf(np.abs(t_statistic), len(NOX)-1) * 2 


## Because our alternative hypothesis is != (rather than greater than or less than),
## we multiply our p-value by 2. (This is called a two-sided test.)
print("Our sample median is {:.4f}.".format(0.54))
print("Our sample mean is {:.4f}.".format(sample_mean))
print("Our t-statistic is {:.6f}.".format(t_statistic))
print("Our p-value is {:.6f}.".format(p_value))

if p_value < alpha:
    print("We reject our null hypothesis and conclude that the true mean NOX value is different from the median NOX value.")
elif p_value > alpha:
    print("We fail to reject our null hypothesis and cannot conclude that the true mean NOX value is different from the median .")
else:
    print("Our test is inconclusive.")

Our sample median is 0.5400.
Our sample mean is 0.5547.
Our t-statistic is 2.852639.
Our p-value is 0.004514.
We reject our null hypothesis and conclude that the true mean NOX value is different from the median NOX value.


**1-sample t-test**

To perform the t-test on a single sample, you can use `scipy.stats.ttest_1samp()`.

Try it out. Do you get the same values?

In [211]:
from scipy import stats
import random
sample_array = []
for i in range(0, 50):
    sample_array.append(NOX.iloc[random.randint(0, 505)])
experimental = np.array(sample_array)
#experimental = sample_array

control_array = []
for i in range(0, 50):
    control_array.append(np.mean(NOX))
#control = NOX.iloc[0:50]
control = np.array(control_array)
print(experimental)
print(control)
t_statistic, p_value = stats.ttest_ind(experimental, control)

alpha = 0.05  # Significance level

if p_value < alpha:
    print("Reject the null hypothesis (H0)")
    print("t-statistic:", t_statistic)
    print("p-value:", p_value)
else:
    print("Fail to reject the null hypothesis (H0)")
    print("t-statistic:", t_statistic)
    print("p-value:", p_value)

[0.532  0.647  0.515  0.693  0.524  0.493  0.624  0.544  0.693  0.52
 0.871  0.585  0.507  0.431  0.493  0.389  0.538  0.431  0.713  0.4161
 0.679  0.392  0.52   0.411  0.659  0.609  0.409  0.448  0.437  0.51
 0.614  0.624  0.573  0.52   0.538  0.4    0.597  0.413  0.445  0.581
 0.585  0.532  0.74   0.488  0.679  0.605  0.871  0.585  0.668  0.453 ]
[0.55469506 0.55469506 0.55469506 0.55469506 0.55469506 0.55469506
 0.55469506 0.55469506 0.55469506 0.55469506 0.55469506 0.55469506
 0.55469506 0.55469506 0.55469506 0.55469506 0.55469506 0.55469506
 0.55469506 0.55469506 0.55469506 0.55469506 0.55469506 0.55469506
 0.55469506 0.55469506 0.55469506 0.55469506 0.55469506 0.55469506
 0.55469506 0.55469506 0.55469506 0.55469506 0.55469506 0.55469506
 0.55469506 0.55469506 0.55469506 0.55469506 0.55469506 0.55469506
 0.55469506 0.55469506 0.55469506 0.55469506 0.55469506 0.55469506
 0.55469506 0.55469506]
Fail to reject the null hypothesis (H0)
t-statistic: 0.011508434963174979
p-value: 0.9908

  t_statistic, p_value = stats.ttest_ind(experimental, control)


## Running it multiple times you get different outcomes depending on what sample is taken

### 6. What do you notice about the results from Exercise 4 and Exercise 5? 

**If you were going to generalize this to the relationship between hypothesis tests and confidence intervals, what might you say? Be specific.**

## Confidence Intervals give a range of values which we can be confident that the true population parameter such as mean lies in

## Hypothesis test involve making specific claims about population parameter (null Hypothesis) and testing those claims based on sample data

## They are related concepts that can complement each other when making inferences about population parameters. The decision to reject or accept the null hypothesis in a test is whether the specific value falls within or outside the confidence interval