# Statistics Codebook

## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from scipy.stats import chi2
from scipy.stats import chi2_contingency # chi-squared test with similar proportions
from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from statsmodels.formula.api import ols
from statsmodels.stats import weightstats as stests # z-test code
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.power import TTestIndPower
from statsmodels.stats.proportion import proportions_ztest # proportions z-test

## Terminology

In [None]:
# TYPES OF ERRORS IN CALCULATIONS
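Type 1 errors (false positives) occur when we reject a null hypothesis that is actually true, i.e. we accept an alternative hypothesis which is actually false.
The alpha (α, the significance level) that we pick is the likelihood that we will get a type 1 error due to random chance.
Type 2 errors (false negatives) occur when we fail to reject a null hypothesis that is actually false, i.e. we reject an alternative hypothesis which is actually true.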

In [None]:
# STATING HYPOTHESIS SOLUTION
# H0 is rejected only when BOTH checks agree: the statistic exceeds the
# critical value and the p-value is below alpha.
shown_t = round(results[0], 2)
shown_p = np.round((results[1]), 10)
is_significant = (results[0]>t_crit) and (results[1]<alpha)
if not is_significant:
    print ("Fail to reject the Null hypothesis with t-value =", shown_t,
           ", critical t-value =", t_crit, "and p-value =", shown_p)
else:
    print ("Null hypothesis rejected. Results are statistically significant with t-value =", shown_t,
           "critical t-value =", t_crit, "and p-value =", shown_p)

In [None]:
#HYPOTHESIS TESTING - STEPS
*1. Write down the null and alternative hypothesis you are testing.* 
*2. Select the appropriate test and calculate the test statistic and P-values.*
*3. Determine the critical value for the 95% confidence interval.*
*4. Evaluate the test statistic against the critical value.*
*5. Determine if you reject or fail to reject the null hypothesis and write a sentence explaining the results of your hypothesis test.*  

## Methodology

In [None]:
# CLEAN DATA
# NOTE(review): the numeric codes below look like survey sentinel values
# (e.g. 88 = none, 77/99 = unknown/refused) -- confirm against the data dictionary.
# Every step mutates all_data in place, so the statements must run in this order.
all_data['PHYSHLTH'] = np.where((all_data.PHYSHLTH == 88),0,all_data.PHYSHLTH)  # recode 88 to 0
all_data.drop(all_data[all_data['PHYSHLTH'].isin([77,99])].index, inplace = True)  # drop rows coded 77 or 99
all_data.drop((all_data[all_data['SEX'] == 9].index), inplace = True)  # drop rows where SEX is 9
all_data['chronic'] = np.where(all_data['PHYSHLTH']>16, 1, 0)  # flag: 1 when PHYSHLTH > 16, else 0
all_data.dropna(subset=['SMOKE100', 'SMOKDAY2'], how='all', inplace=True)  # drop rows where BOTH smoking columns are missing
# Recode ALCDAY5 into DAYSDRNK:
#   101-107 -> (code - 100) * 4,  201-230 -> (code - 200),  888 -> 0,  anything else -> 99
conditions = [
((all_data['ALCDAY5'] > 100) & (all_data['ALCDAY5'] < 108)), 
((all_data['ALCDAY5'] > 200) & (all_data['ALCDAY5'] < 231)),
(all_data['ALCDAY5'] == 888)
]
choices = [ (all_data['ALCDAY5']-100)*4, (all_data['ALCDAY5']-200), 0 ]
all_data['DAYSDRNK'] = np.select(conditions, choices, default=99)  # first matching condition wins

In [None]:
# INSPECT DATA
# Quick-look helpers. A notebook cell only renders its LAST expression, so run
# each of these in its own cell (or wrap it in display()) to see the output.
all_data.head()
all_data.describe()
all_data['COLUMN'].value_counts()  # 'COLUMN' is a placeholder for a real column name

# Within each _STATE, the share of rows for each value of 'chronic'
chron_data = all_data.groupby('_STATE')['chronic'].value_counts(normalize=True)
# Keep only the entries whose 'chronic' index level equals 1
y_vals = chron_data.iloc[chron_data.index.get_level_values('chronic') == 1]

In [None]:
# PLOTTING
# Menu of plotting snippets: every call below draws onto the SAME axes, so pick
# the one(s) you need rather than running the whole cell as-is.
fig, ax = plt.subplots()
y_vals = list(all_data['PHYSHLTH'])

ax.boxplot(y_vals)           # box plot of the values
ax.hist(y_vals, bins=15)     # histogram with 15 bins
ax.bar(x_vals, y_vals)       # bar chart; x_vals must be defined elsewhere

#multiple hist
# histtype='step' draws outlines only; density=True normalises each histogram
ax.hist(x_conn, bins=15, histtype='step', density=True, label='Conn')
ax.hist(x_nj, bins=15, histtype='step', density=True, label='NJ')
ax.hist(x_ny, bins=15, histtype='step', density=True, label='NY')

#muliple bars
# NOTE(review): all three series share x_vals with no offset/bottom, so the bars are drawn over each other -- confirm intent
ax.bar(x_vals, not_at_all_vals)
ax.bar(x_vals, everyday_vals)
ax.bar(x_vals, somedays_vals)

ax.set_ylabel("# Days Sick")
ax.set_title("Title")

In [None]:
# Getting 95% confidence interval from data
# Research questions for the movie data set:
# - Is there a relationship between the number of Facebook likes for a cast and the box office gross of the movie?
# - Do foreign films perform differently at the box office than non-foreign films?
# - Of all movies created are 40% rated R?
# - Is there a relationship between the language of a film and the content rating (G, PG, PG-13, R) of that film?
# - Is there a relationship between the content rating of a film and its budget?

# Rows without a gross value cannot contribute to the interval
df.dropna(subset=['gross'], inplace=True)
# R-rated films released after 2000
df_r2k = df[(df['title_year'] > 2000) & (df['content_rating'] == 'R')]
r2k_mean = df_r2k['gross'].mean()
r2k_std = df_r2k['gross'].std()
r2k_n = df_r2k['gross'].count()
# Normal-based interval around the mean; scale is the standard error std / sqrt(n)
r2k_ci = stats.norm.interval(.95, loc=r2k_mean, scale=r2k_std/np.sqrt(r2k_n))

print(f'mean = {r2k_mean}')
print(f'std = {r2k_std}')
print(f'n = {r2k_n}')
print(f'The 95% confidence interval is {r2k_ci}')

### Determining the type of test to use & null/alt hypothesis

![img](img/choosing_test.png)

1. Is there a relationship between the number of Facebook likes for a cast and the box office gross of the movie?
* Simple Linear Regression
* H0: The number of FB likes for a cast and the box office gross of a movie are not related (beta = 0)
* Ha: The number of FB likes for a cast and the box office gross of a movie are related (beta != 0) 
<br><br>


2. Do foreign films perform differently at the box office than non-foreign films?
* Two sided t-test
* H0: Foreign films perform the same at the box office as non-foreign films
* Ha: There is a significant difference in performance at the box office between foreign and non-foreign films 
<br><br>


3. Of all movies created are 40% rated R?
* Two-sided one-sample z-test for a proportion
* H0: 40% of all movies created are rated R (P = .40)
* Ha: 40% of all movies created are not rated R (P != .40) 
<br><br>


4. Is there a relationship between the language of a film and the content rating (G, PG, PG-13, R) of that film?
* Chi-Squared test
* H0: Content rating is independent of the language of the film (the distribution of content ratings is the same for every language)
* Ha: Content rating is not independent of the language of the film (the distribution of content ratings differs by language) 
<br><br>


5. Is there a relationship between the content rating of a film and its budget? 
* ANOVA
* H0: There is no relationship between the content rating of a film and its budget (the mean budgets of all content ratings are equal)
* Ha: There is a relationship between the content rating of a film and its budget (at least one content rating's mean budget differs) 

## Z-score

#### By Hand

In [None]:
#ONE-SAMPLE PERCENTAGE DATA VERSUS POPULATION Z-TEST
# (sample proportion - population proportion) / standard error under the null
z_stat = (x1_mean - pop_mean) / np.sqrt((pop_mean*(1-pop_mean))/n_count) #(x_hat - mu) / (std)

#Z-score for when we are working with a sampling distribution:
z = (x_bar - mu)/(std/np.sqrt(n))

#Critical Value of the Z score (one-tailed)
z_critical_val = stats.norm.ppf(1-.05) # Where alpha is '.05'

#Crit vals for Z scores in a two-tailed test at 95% confidence (.025 in each tail)
zcrit_val1 = stats.norm.ppf(1-.025)   # upper critical value
zcrit_val2 = stats.norm.ppf(1-.975)   # lower critical value

#TWO-VARIABLE VERSUS EACH OTHER PERCENTAGE DATA Z-TEST
# p_ast is the pooled proportion across both groups (it is not a p-value)
p_ast = (male_count_chronic + female_count_chronic) / (male_count + female_count)
z_stat = np.round((male_mean - female_mean) / np.sqrt((p_ast*(1-p_ast)*(1/male_count + 1/female_count))),3)
# Cross-check with scipy's two-sample t-test; rounds both the statistic and the p-value
z_stat_calc = np.round(stats.ttest_ind(male_sample, female_sample, equal_var=True),3)

#### Python function one-sample Z-test from statsmodels

In [None]:
#Statsmodels
# One-sample proportion z-test against a hypothesised proportion of 0.10
z_stat, p_value = proportions_ztest(sample_count, n, value=0.10, alternative='two-sided')

#https://towardsdatascience.com/hypothesis-testing-in-machine-learning-using-python-a0dc89e169ce
# One-sample z-test of the mean of 'bp_before' against a hypothesised mean of 156
ztest, pval = stests.ztest(df['bp_before'], value=156)
print(float(pval))
print("reject null hypothesis" if pval<0.05 else "accept null hypothesis")

#### Python function two-sample Z-test from statsmodels

In [None]:
# https://towardsdatascience.com/hypothesis-testing-in-machine-learning-using-python-a0dc89e169ce
# Two-sample z-test: is the difference between the two means equal to 0?
ztest ,pval1 = stests.ztest(df['bp_before'], x2=df['bp_after'], value=0,alternative='two-sided')
print(float(pval1))
# Decide on pval1 (this test's p-value), not on a pval left over from another cell
if pval1<0.05:
    print("reject null hypothesis")
else:
    print("accept null hypothesis")

#### Getting Percentile & Probability (hypothesis testing)

In [None]:
# norm.cdf gives the percentile: the probability of getting the given z score OR LOWER
print("Percentile = ", stats.norm.cdf(z)) # can use for t-value also

# The survival function (sf = 1 - cdf) gives the probability of getting the given z score OR HIGHER
print("Probability = ", stats.norm.sf(z)) # can use for t-value also

## T-score

* Comparing 2 means to see if they are equal or not equal. Correlation does not equal causation however.

#### T-critical value for 1 tailed test (hypothesis testing)

In [None]:
# CALCULATING T-CRITICAL VALUE
# ppf() takes a cumulative probability (1 - alpha for an upper-tail test), NOT a p-value.
# The three assignments are alternatives -- keep the one that matches your test.
t_critical = stats.norm.ppf(1-0.05) # normal approximation, one-tailed, alpha = .05
t_critical = stats.t.ppf(1-0.05, df=sample_num-1) #one-sample vs population t-test, one-tailed
t_critical = stats.t.ppf(1-0.025, (male_count+female_count-2)) # two-sample, two-tailed (alpha/2 per tail)

# P-VALUE FROM A Z-SCORE (pdf is a density, not a probability, so it is never a p-value)
p_value = stats.norm.sf(z_score)  # upper tail: P(Z >= z_score)
p_value = stats.norm.cdf(z_score) # lower tail: P(Z <= z_score)

In [None]:
# Using Python to get the t-statistic & P-value for a 1 sample t-test:
# The first argument is the array of sample observations (not their standard
# deviation); the second is the hypothesised population mean.
stats.ttest_1samp(sample_data, mu) #(sample observations, population mean)

#### T-critical value for 2 tailed test (hypothesis testing)

In [None]:
# TWO-TAILED ONE-SAMPLE VERSUS POPULATION T-TEST
# By hand: (sample mean - hypothesised mean) / standard error; 4 is the hypothesised mean used here
t_stat = (sample_mean - 4)/((sample_std/np.sqrt(sample_num)))
# Same test via scipy, run on the raw column against the same hypothesised mean
results = stats.ttest_1samp(a=df['column_name'], popmean=4)
 
# TWO-SAMPLE CONTINUOUS DATA VERSUS EACH OTHER T-TEST
# By hand: difference of means over the pooled standard error (pool_var = pooled variance, computed elsewhere)
t_statistic = (x1_mean - x2_mean) / (np.sqrt(pool_var*((1/x1_count)+(1/x2_count))))
# Same test via scipy, assuming equal variances
t_stat, p_val = stats.ttest_ind(x1_sample, x2_sample, equal_var=True)

# TWO-SAMPLE CONTINUOUS DATA BEFORE AND AFTER PAIRED T-TEST (same size)
# Both samples must have the same length, paired observation by observation
stats.ttest_rel(x1_sample, x2_sample)

In [None]:
# In Python to get the t-statistic & P-value
# This is a two-sided test for the null hypothesis that 2 independent samples have identical average (expected) values. 
# This test assumes that the populations have identical variances by default.
# equal_var=False runs Welch's t-test instead, which does not assume equal variances
# (use it when the two groups' variances, and typically sizes, differ).
stats.ttest_ind(sample_data_1, sample_data_2, equal_var=False, nan_policy='omit') # nan_policy will omit the nan values in test.

In [None]:
# This is a two-sided test for the null hypothesis that two independent samples have identical average (expected) values.
# T-test for means of two independent samples from descriptive statistics.
# T-test from data provided to get the statistics and p-value
# Use this when only the summary numbers (mean, std, n) of each group are available, not the raw data.
# nobs = number of observations aka n                                                                                
stats.ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2, equal_var=True, alternative='two-sided')

In [None]:
# Calculate our t-critical values for a 2 tailed test (.025 & .975) = 95% confidence
print(stats.t.ppf(0.025, n-1)) # lower critical value; the degrees of freedom is (n-1)
print(stats.t.ppf(0.975, n-1)) # upper critical value; the significance level (alpha) is 5%

# OR

# Two-sample version: degrees of freedom are (n1 + n2) - 2
stats.t.ppf(1-(.05/2), (n1 + n2)-2) # in one line - alpha divided by 2

In [None]:
# Two sided t-test from data frame
# Assuming an alpha of .05
film_alpha = .05

# Drop rows with no country or no gross first; otherwise a missing country
# compares as != 'USA' and is silently counted as a foreign film.
films = df.dropna(subset=['country', 'gross'])

domestic = films[films['country'] == 'USA']['gross']
foreign = films[films['country'] != 'USA']['gross']
domestic_mean = domestic.mean()
foreign_mean = foreign.mean()
domestic_std = domestic.std()
foreign_std = foreign.std()
domestic_n = domestic.count()
foreign_n = foreign.count()

film_ttest = stats.ttest_ind(foreign, domestic, equal_var=True, nan_policy='omit')
print(film_ttest)
# State the conclusion from the computed p-value instead of assuming the outcome
if film_ttest.pvalue < film_alpha:
    print("As the P_val is less than the alpha of .05, we reject the null hypothesis that foreign and domestic films perform the same at the box office")
else:
    print("As the P_val is not less than the alpha of .05, we fail to reject the null hypothesis that foreign and domestic films perform the same at the box office")

#### Confidence Interval for Normally Distributed Data (margin of error)

In [None]:
# CONFIDENCE INTERVAL FOR VALUE-based DATA
# Interval is the sample mean plus/minus 1.96 standard errors
standard_error = sample_std / np.sqrt(sample_num)
margin = 1.96 * standard_error
lower_limit, upper_limit = sample_mean - margin, sample_mean + margin

In [None]:
# CONFIDENCE INTERVAL FOR PERCENT-based DATA
# Standard error of a proportion: sqrt(p * (1 - p) / n)
standard_error = np.sqrt((data_proportion*(1-data_proportion))/n_count)
# z_val is the critical z for the chosen confidence level (defined elsewhere)
lower_limit = data_proportion - (z_val*standard_error)
upper_limit = data_proportion + (z_val*standard_error)

In [None]:
# Margin of error: standard error (std / sqrt(n)) times the critical value
pop_std / np.sqrt(n) * z # or (t)

#### Confidence Intervals for Non-Normally Distributed Data

In [None]:
# Confidence interval for a mean using the t-distribution.
# The three values below are EXAMPLE placeholders -- replace them with your own.
n = 25       # sample size
mean = 50.0  # sample mean
std = 10.0   # sample standard deviation
# A two-sided 95% interval leaves 2.5% in each tail, so the critical value is
# the 97.5th percentile; n-1 is the degrees of freedom.
t_value = stats.t.ppf(0.975, n-1)
margin_error = std / (np.sqrt(n)) * t_value
confidence_interval = (mean - margin_error, mean + margin_error)

#### Confidence Intervals for Binomial Distribution

left endpt.: $\hat{p} - z\times\sqrt{\frac{\hat{p}(1 - \hat{p})}{n}}$ <br/>
right endpt.: $\hat{p} + z\times\sqrt{\frac{\hat{p}(1 - \hat{p})}{n}}$

In [None]:
# Confidence interval for a proportion (normal approximation).
# The two values below are EXAMPLE placeholders -- replace them with your own.
p_hat = 0.5  # sample proportion
n = 100      # sample size
z = stats.norm.ppf(0.975)  # critical z for a two-sided 95% interval
step = z * np.sqrt(p_hat * (1-p_hat) / n)  # margin of error

confidence_interval = (p_hat - step, p_hat + step)

## ANOVA - the $F$ test

$F = \frac{s^2_{between}}{s^2_{within}}$

In [None]:
# Two reproducible samples of 100 normal draws each: same spread (std 3),
# centred on 0 and 1 respectively. Arguments are (center, std, n-points).
np.random.seed(42)
one, two = [np.random.normal(center, 3, 100) for center in (0, 1)]

In [None]:
# The "one-way" just means that there is a single input variable.
# Runs the F test on the two samples named one and two.

stats.f_oneway(one, two) # Can insert more than two samples.

In [None]:
# Identical p_values
# With exactly two groups, the equal-variance t-test and one-way ANOVA are the same test.

t = stats.ttest_ind(one, two, equal_var=True) # t-statistic squared = f-statistic, Variance should be the same (P-value)

In [None]:
# The square of the two-sample t-stat = the F-stat
# t is the ttest_ind result; compare this value with the f_oneway statistic
t.statistic**2

In [None]:
#ONE-WAY CONTINUOUS DATA MULTIPLE VARIABLE F-TEST
# Build {group id: values of 'column_name' for that group}. The key must be the
# loop variable x; n is the iterable of group ids (defined elsewhere).
f_data = {x: sample_populations['column_name'][sample_populations['subset_name'] == x] for x in n}
# One-way ANOVA across the three groups keyed 9, 34 and 36
f_stat, p_val = stats.f_oneway(f_data[9], f_data[34], f_data[36])

## Proportions

* Formula to get the test statistic is: 
$z = \frac{\hat{p} - p_0}{\sqrt{p_0(1 - p_0)/n}}$

#### By Hand

In [None]:
#ONE-SAMPLE PERCENTAGE DATA VERSUS POPULATION Z-TEST
# (sample proportion - population proportion) / sqrt(p0 * (1 - p0) / n), with p0 = pop_mean
z_stat = (x1_mean - pop_mean) / np.sqrt((pop_mean*(1-pop_mean))/n_count)

In [None]:
#TWO-VARIABLE VERSUS EACH OTHER PERCENTAGE DATA Z-TEST
# p_ast is the pooled proportion across both groups (it is not a p-value)
p_ast = (male_count_chronic + female_count_chronic) / (male_count + female_count)
# Difference of proportions over the pooled standard error, rounded to 3 places
z_stat = np.round((male_mean - female_mean) / np.sqrt((p_ast*(1-p_ast)*(1/male_count + 1/female_count))),3)
# Cross-check with scipy's two-sample t-test; rounds both the statistic and the p-value
z_stat_calc = np.round(stats.ttest_ind(male_sample, female_sample, equal_var=True),3)

In [None]:
# Sample proportion
p_hat = x/n # Where x is the number of successes in the sample and n is the total count of data

In [None]:
# 1 sample z prop test
# if expected proportion = p1
# x_hat holds the sample observations (presumably 0/1 indicators -- confirm)
p0 = x_hat.mean()  # observed sample proportion
# Standard error under the null uses the EXPECTED proportion and the sample size
st_error = ((p1 * (1-p1)) / len(x_hat))**.5 # can use np.sqrt(...) for square root
# (observed - expected) / standard error
z_stat = (p0 - p1) / st_error
z_stat

# One-tailed critical value at alpha = .05
zcrit_val = stats.norm.ppf(1-.05)

In [None]:
plan_null = 'H0: There is no statistically significant difference in the percentage of men and women who have a healthcare plan.'
plan_alt = 'H1: There is a statistically significant difference in the percentage of men and women who have a healthcaare plan.'

# 2 z prop test
# NOTE(review): SEX 1/2 are taken to be male/female and HLTHPLN1 == 1 to mean
# "has a plan" -- confirm against the data dictionary.
male_plan = df[df['SEX']==1]['HLTHPLN1']
female_plan = df[df['SEX']==2]['HLTHPLN1']

# Rows answering 1, taken from the plan columns selected above
yesplan_m = male_plan[male_plan==1]
yesplan_f = female_plan[female_plan==1]

# Pooled proportion across both groups
prob = (len(yesplan_m) + len(yesplan_f)) / (len(male_plan) + len(female_plan))
p_hat_plan_male = len(yesplan_m)/len(male_plan)
p_hat_plan_female = len(yesplan_f)/len(female_plan)
den = (1/(len(male_plan)) + (1/len(female_plan)))
num_plan = p_hat_plan_male - p_hat_plan_female
den_plan =  (prob * (1-prob) * den)**.5  # pooled standard error

#z statistic 
plan_z_2samp = num_plan / den_plan

#crit val (two-tailed, alpha = .05)
plan_zcrit_val1 = stats.norm.ppf(1-.025)  # upper critical value
plan_zcrit_val2 = stats.norm.ppf(1-.975)  # lower critical value

# Two-tailed decision: reject only when |z| exceeds the upper critical value
if abs(plan_z_2samp) > plan_zcrit_val1:
    plan_z2sampprop_conclusion = 'Because the absolute value of the z-statistic is greater than the critical value, we reject the null hypothesis.'
else:
    plan_z2sampprop_conclusion = 'Because the absolute value of the z-statistic is not greater than the critical value, we fail to reject the null hypothesis.'
plan_z2sampprop_conclusion

#### Python one-sample proportion Z test from statsmodels

In [None]:
# Z test for proportions - https://www.statsmodels.org/stable/generated/statsmodels.stats.proportion.proportions_ztest.html
# See url for examples
# count = number of successes, nobs = number of observations, value = hypothesised proportion
z_stat, p_value = proportions_ztest(count=sample_count, nobs=n, value=0.10, alternative='two-sided')

#### Python two-sample proportion Z test from statsmodels

In [None]:
# NOTE(review): a two-sided p-value is normally compared against the full alpha
# (e.g. 0.05); 0.025 is a stricter threshold -- confirm this is intended.
significance = 0.025
# our samples - 82% are good in one (410/500), and ~95% are good in the other (379/400)
# note - the samples do not need to be the same size
sample_success_a, sample_size_a = (410, 500)
sample_success_b, sample_size_b = (379, 400)
# check our sample against Ho for Ha != Ho
successes = np.array([sample_success_a, sample_success_b])
samples = np.array([sample_size_a, sample_size_b])
# note, no need for a Ho value here - it's derived from the other parameters
stat, p_value = proportions_ztest(count=successes, nobs=samples,  alternative='two-sided')
# report
print('z_stat: %0.3f, p_value: %0.3f' % (stat, p_value))
if p_value > significance:
   print ("Fail to reject the null hypothesis - we have nothing else to say")
else:
   print ("Reject the null hypothesis - suggest the alternative hypothesis is true")

## Chi-Squared Test

In [None]:
#MULTIPLE VARIABLE VERSUS EACH OTHER PERCENTAGE DATA CHI-SQUARE TEST
# Observed counts: rows are NOALC (0, 1), columns are the EVERMARRIED categories
married_levels = all_data['EVERMARRIED'].value_counts().index
table = np.zeros((2, 3))
for col, level in enumerate(married_levels):
    is_level = all_data['EVERMARRIED'] == level
    for row in (0, 1):
        table[row, col] = len(all_data[(all_data['NOALC'] == row) & is_level])
chisq_test = stats.contingency.chi2_contingency(table)
# Element 3 of the result is the table of expected counts; recompute the statistic from it
expected = chisq_test[3]
manual_chisq = (((table - expected)**2) / expected).sum() #test of chi-square

## Effect Size & Power

In [None]:
# EFFECT SIZE
def Cohen_d(group1, group2):
    """Return Cohen's d: the difference of the group means in pooled-std units.

    Both groups must provide ``.mean()``, ``.var(ddof=1)`` and ``len()``.
    The pooled variance weights each sample variance by its degrees of
    freedom (n - 1).
    """
    sizes = (len(group1), len(group2))
    variances = (group1.var(ddof=1), group2.var(ddof=1))
    weighted_var = sum((size - 1) * var for size, var in zip(sizes, variances))
    pooled_var = weighted_var / (sum(sizes) - 2)
    return (group1.mean() - group2.mean()) / np.sqrt(pooled_var)

# POWER
# solve_power() solves for whichever ONE of effect_size / nobs1 / alpha / power
# is left out, so the four cannot all be supplied at once.
test = TTestIndPower()
# Power achieved with the observed effect size and per-group sample size
achieved_power = test.solve_power(effect_size=d, nobs1=n_default, alpha=alpha_default)
# Per-group sample size needed to reach 80% power
test.solve_power(effect_size=d, alpha=alpha_default, power=0.8)

## Simple Linear Regression

#### Covariance

For two random variables $X$ and $Y$, each with $n$ values:

$\sigma_{XY} = \frac{\Sigma^n_{i = 1}(x_i - \mu_x)(y_i - \mu_y)}{n}$ <br/>

#### Correlation

Pearson Correlation: A correlation of -1 means that X and Y are perfectly negatively correlated, and a correlation of 1 means that X and Y are perfectly positively correlated. <br/>$\ r_P = \frac{\Sigma^n_{i = 1}(x_i - \mu_x)(y_i - \mu_y)}{\sqrt{\Sigma^n_{i = 1}(x_i - \mu_x)^2\Sigma^n_{i = 1}(y_i -\mu_y)^2}}$

#### Numpy Covariance

In [None]:
X = [1, 3, 5]
Y = [2, 9, 10]

# Covariance by hand:
# mean of X is 3, mean of Y is 7; divide by n = 3 (population covariance)
((1-3) * (2-7) + (3-3) * (9-7) + (5-3) * (10-7)) / 3

# Better yet: With NumPy:
# ddof=0 divides by n to match the hand calculation; [0, 1] picks cov(X, Y)
np.cov(X, Y, ddof=0)[0, 1]

# Full 2x2 covariance matrix
np.cov(X, Y, ddof=0)

#### Numpy Correlation

In [None]:
# 2x2 correlation matrix of X and Y
np.corrcoef(X, Y)
# The same correlation worked out by hand for the X and Y above
4 / np.sqrt(19)
# Correlation = covariance / (std_X * std_Y). Compare with a tolerance: the two
# routes round differently, so exact float equality is not reliable.
np.isclose(np.corrcoef(X, Y)[0, 1], (np.cov(X, Y, ddof=0) / (np.std(X) * np.std(Y)))[0, 1])

In [None]:
# Scipy function for Correlation
# [0] takes the first element of the pearsonr result
stats.pearsonr(X, Y)[0]

### Regression Equation

The solution for a simple regression best-fit line is as follows:

- slope: <br/>$ m = r_P\frac{\sigma_y}{\sigma_x} = \frac{cov(X, Y)}{var(X)}$

- y-intercept:<br/> $ b = \mu_y - m\mu_x$

#### Regression Without Error in `statsmodels`

In [None]:
# Formula "y ~ x" reads as: y (target / dependent variable) modelled as a
# function of x (predictor / independent variable).
simple_fit = ols("y ~ x", data=test_df).fit()
simple_fit.summary()

#### Regression with Error in `statsmodels`

In [None]:
# Synthetic data: the line y = 3x + 5 plus Gaussian noise (mean 0, std 5).
# A seeded generator keeps the fitted coefficients reproducible on re-run.
rng = np.random.default_rng(42)
x = np.arange(20)
y = 3*x + 5 + rng.normal(loc=0, scale=5, size=x.size)

df2 = pd.DataFrame({'x': x, 'y': y})

# Ordinary least squares of y on x
model = sm.formula.ols(formula='y~x', data=df2).fit()

model.summary()

## Multiple Linear Regression

In [None]:
# Selecting features (independent variables) and target (dependent variable) from dataframe subset
# Built as one chain so model_prep is a new frame: an in-place dropna or column
# assignment on a slice of df triggers SettingWithCopyWarning.
model_prep = (
    df[['gross', 'budget', 'title_year', 'genres', 'imdb_score', 'actor_1_facebook_likes',
        'cast_total_facebook_likes', 'content_rating', 'language']]
    .dropna(subset=['title_year'])
    # age of the film in years, measured from 2021
    .assign(years_old=lambda films: 2021 - films['title_year'].astype(int))
)
model_prep.head()

In [None]:
# Checking correlation
# model_prep still holds text columns (genres, content_rating, language);
# numeric_only=True restricts the matrix to numeric columns so corr() does not
# fail on them in recent pandas versions.
model_prep.corr(numeric_only=True)

In [None]:
# Creating dummy variables with pd.get_dummies
# One 0/1 column per content_rating value; the PG-13 column is then dropped
summary_ols = pd.get_dummies(model_prep, columns=['content_rating']).drop(columns='content_rating_PG-13')

# Or

# Encode every remaining column of comma_use after removing the RespondentID column
pd.get_dummies(comma_use.drop('RespondentID', axis=1))

In [None]:
# Creating dummy variables with OneHotEncoder:
# drop='first' omits the first category of each feature
ohe = OneHotEncoder(drop='first')
# Fit and encode every column of comma_use except RespondentID
comma_trans = ohe.fit_transform(comma_use.drop('RespondentID', axis=1))

In [None]:
# Creating summary table
# OLS of gross on cast likes, budget, film age and the G / PG / R rating dummies
film_lr = ols(formula='gross~cast_total_facebook_likes+budget+years_old+content_rating_G+content_rating_PG+content_rating_R', data=summary_ols).fit()
film_lr.summary()

### Building Model from Scratch

In [None]:
# Load the seaborn diamonds data and drop the cut / color / clarity columns
data = sns.load_dataset('diamonds').drop(['cut', 'color', 'clarity'], axis = 1)
# Predictors are every remaining column except price; price is the target
X, y = data.drop('price', axis=1), data['price']
# NOTE(review): X is passed without sm.add_constant(), so no intercept column is added here -- confirm intent
model2 = sm.OLS(y, X).fit()
model2.summary()

#### Plotting Model

In [None]:
# Regression diagnostic plots for the 'carat' predictor of model2; trailing ; hides the repr
sm.graphics.plot_regress_exog(model2, 'carat', fig=plt.figure(figsize=(12, 8)));

In [None]:
# Histogram of the target y; trailing ; hides the repr
y.hist();

#### Build model with log-scaled target

In [None]:
# Natural log of the target, then its histogram
y_scld = np.log(y)
y_scld.hist();

In [None]:
# Same predictors as model2, fitted against the log-scaled target
model3 = sm.OLS(y_scld, X).fit()
model3.summary()

In [None]:
# Plotting log-scaled target
# Same 'carat' diagnostic plots as before, now for model3
sm.graphics.plot_regress_exog(model3, 'carat', fig=plt.figure(figsize=(12, 8)));

**Remember that $R^2$ can be negative!**

In [None]:
# Two constant predictions: the mean of y, and the mean of y plus 1000
bad_pred = np.mean(y) * np.ones(len(y))
worse_pred = (np.mean(y) + 1000) * np.ones(len(y))

# R^2 of each constant prediction against the true values of y
print(metrics.r2_score(y, bad_pred))
print(metrics.r2_score(y, worse_pred))

### Running the Regression

First, we'll separate the data into our predictors (X) and target (y)

In [None]:
# Predictors: every column of wine except 'quality'
wine_preds = wine.drop('quality', axis=1)
# Target: the 'quality' column
wine_target = wine['quality']
wine_preds.head()

#### Creating constant term/y-intercept

In [None]:
# Use sm.add_constant() to add constant term/y-intercept
# sm.OLS does not add an intercept column itself, so it is added to the predictors here
predictors = sm.add_constant(wine_preds)
predictors

In [None]:
# Summary on data with new constant
# Fit quality on the predictors that include the constant column
model = sm.OLS(wine_target, predictors).fit()
model.summary()

### Scaling Z-scores

In [None]:
# We'll include all the columns for now.
# Z score calculation: (x - mean of column) / std of column
# Use the DataFrame's own column-wise mean/std: np.mean / np.std applied to a
# whole DataFrame behave differently across pandas versions. ddof=0 is the
# population std, the same default np.std uses.
wine_preds_scaled = (wine_preds - wine_preds.mean()) / wine_preds.std(ddof=0)

predictors = sm.add_constant(wine_preds_scaled)
model = sm.OLS(wine_target, predictors).fit()
model.summary()

### Multiple Linear Regression in Scikit Learn

In [None]:
# A StandardScaler object does the scaling for us.
ss = StandardScaler()

# fit_transform() is .fit() followed by .transform() on the same data: it fits
# the scaler on wine_preds and returns the scaled values in one call.
wine_preds_st_scaled = ss.fit_transform(wine_preds)

In [None]:
# Check that the scaling worked about the same as when we did it by hand
# True when the scaler output and the hand-scaled values match within floating-point tolerance
np.allclose(wine_preds_st_scaled, wine_preds_scaled)

In [None]:
# Checking the values add up to original pandas df
# First five rows, all columns, of the scaled array
wine_preds_st_scaled[:5, :]

In [None]:
# Now we can fit a LinearRegression object to our training data!
# Fitted on the scaled predictors against the quality target
lr = LinearRegression()
lr.fit(wine_preds_st_scaled, wine_target)

In [None]:
# We can use the .coef_ attribute to recover the results
# of the regression.
print(lr.coef_)                                      # one coefficient per predictor column
print(lr.intercept_)                                 # fitted intercept
print(lr.score(wine_preds_st_scaled, wine_target))   # model score on the data it was fitted on
print(lr.predict(wine_preds_st_scaled))              # predictions for every row

### SKlearn Metrics

The metrics module in sklearn has a number of metrics that we can use to measure the accuracy of our model, 
including the $R^2$ score, the mean absolute error and the mean squared error. 
Note that the default 'score' on our model object is the $R^2$ score. Let's go back to our wine dataset:

In [None]:
# R^2 of the model's predictions against the true quality values
metrics.r2_score(wine_target, lr.predict(wine_preds_st_scaled))

Let's make sure this metric is properly calibrated. If we put simply $\bar{y}$ as our prediction, then we should get an $R^2$ score of *0*. And if we predict, say, $\bar{y} + 1$, then we should get a *negative* $R^2$ score.

In [None]:
# Only the LAST expression of a cell is displayed; run these in separate cells
# (or wrap them in print/display) to see every value.
avg_quality = np.mean(wine_target)
num = len(wine_target)

# R^2 when predicting the mean for every row
metrics.r2_score(wine_target, avg_quality * np.ones(num))

# R^2 when predicting mean + 1 for every row
metrics.r2_score(wine_target, (avg_quality + 1) * np.ones(num))

# Mean absolute error of the fitted model's predictions
metrics.mean_absolute_error(wine_target, lr.predict(wine_preds_st_scaled))

# Mean squared error of the fitted model's predictions
metrics.mean_squared_error(wine_target, lr.predict(wine_preds_st_scaled))

## Bayes Theorem

$$P(A|B) = \frac{P(B|A)}{P(B)}\ P(A) $$

$$P(A|B) =  \frac{P(B|A)*P(A)}{P(B)}$$

$$P(A|B) =  \frac{P(B|A)*P(A)}{P(B|A)*P(A) + P(B|\neg A)*P(\neg A)}$$

Each part (note, depending how you approach this, you might group different parts together):

- $P(A)$: *prior*
- $P(A|B)$: *posterior*
- $P(B|A)$: *likelihood*
- $\frac{1}{P(B)}$: *normalization*

An advertising executive is studying television viewing habits of married men and women during prime time hours. Based on the past viewing records he has determined that during prime time wives are watching television 60% of the time. It has also been determined that when the wife is watching television, 40% of the time the husband is also watching. When the wife is not watching the television, 30% of the time the husband is watching the television. Find the probability that if the husband is watching the television, the wife is also watching the television.

In [None]:
# Bayes Theorem from above problem
# A = wife is watching tv, B = husband is watching tv
P_a, P_not_a = .6, .4              # P(A), P(not A)
P_b_given_a, P_b_not_a = .4, .3    # P(B|A), P(B|not A)

joint_a_b = P_a * P_b_given_a                  # numerator: P(B|A) * P(A)
evidence_b = joint_a_b + P_not_a * P_b_not_a   # denominator: total probability of B
P_a_given_b = joint_a_b / evidence_b

print(f'Probability that the wife is watching tv given the husband is watching tv is: {P_a_given_b}')