In [88]:
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
from math import sqrt
from pydataset import data
np.random.seed(123)

### Simulations

Selects random options from a list : np.random.choice

In [None]:
#dice 
#pay five dollars and roll three dice, if the sum is greater than 12, you get 15 dollars
n_trials = nrows = 10_000
n_dice = ncols = 3

rolls = np.random.choice([1, 2, 3, 4, 5, 6], n_trials * n_dice).reshape(nrows, ncols)
sum_by_trial = rolls.sum(axis=1)
wins = sums_by_trial > 12
win_rate = wins.astype(int).mean()

expected_winnings = win_rate * 15
cost = 5 
expected_profit = expected_winnings - cost

In [7]:
#dice 
#at least one three in three dice rolls 
n_sim = nrows = 10**5 
n_dice = ncols = 3
rolls = np.random.choice([1, 2, 3, 4, 5, 6], nrows * ncols).reshape(nrows, ncols)

(pd.DataFrame(rolls)
.apply(lambda row: 3 in row.values, axis=1).mean())


0.42177

In [10]:
# dice 
#roll two heads 
#data frame method 
n_trials = 10_000 
n_dice = 2
rolls = np.random.choice([x for x in range(1,7)], size=(n_trials, n_dice))
rolls = pd.DataFrame(rolls, columns = ['die1', 'die2'])
(rolls.die1 == rolls.die2).mean()

0.1665

In [12]:
#flip 8 coins, probability of exactly three heads 
n_flip = 10_000 
coins = 8 
flips = np.random.choice([x for x in range(2)], n_flip*coins)
flips = flips.reshape(n_flip, coins)
heads_per_flip = flips.sum(axis=1)
(heads_per_flip == 3).mean()

0.2249

In [13]:
#Flip 8 coins, probability of more than three heads
(heads_per_flip > 3).mean()

0.6298

Generates numbers between a given lower and upper bound: np.random.uniform

Generates numbers between 0 and 1: np.random.random

In [None]:
#30% chance my son takes a nap on any given weekend day. What is the chance that he takes a nap at least one day this weekend? What is the probability that he doesn't nap at all? 
p_nap = .3
ndays = ncols = 2
n_simulated_weekends = nrows = 10**5 
data = np.random.random((nrows, ncols))
naps = data < p_nap 
naps.sum(axis = 1)
#1 nap
(naps.sum(axis=1) >= 1).mean()
# 0 naps 
(naps.sum(axis=1) >= 0).mean()

Generates numbers from the standard normal distribution: np.random.randn

Generates numbers from a normal distribution with a specified mean and standard deviation: np.random.normal

In [14]:
#poptarts: students buy 3 poptarts with std of 1.5 per day from snack machine. If the machine is restocked on Monday, how likely is it that I will be able to buy poptarts on Friday afternoon?
days_per_week = 5
weeks_to_simulate = 10_000
simulations = np.random.normal(3, 1.5, (weeks_to_simulate, days_per_week)).round()
(simulations.sum(axis=1) < 17).mean()

0.6706

### Probability Distributions

In [None]:
#Poisson 

In [21]:
#AVG number cars waiting during noon hour at bank follows poisson distribution with mean 2 
#what is the probability that no cars drive up in the noon hour? 
mean = 2 
bank_drive = stats.poisson(mean)
bank_drive.pmf(0)

0.1353352832366127

In [22]:
#Probability 3 or more cars drive through 
bank_drive.sf(2)

0.32332358381693654

In [23]:
#Probability at least 1 car drives through 
bank_drive.sf(0)

0.8646647167633873

In [None]:
#Normal Distribution 

In [None]:
#State University Graduates: mean of 3.0 and standard deviation of 0.3

In [25]:
#Setup
mu = 3.0 
std = 0.3 
grade_distribution = stats.norm(mu, std)

In [26]:
#What GPA is required to be in the top 5% 
grade_distribution.isf(0.05)

3.4934560880854417

In [27]:
#What GPA constitutes the bottom 15% of the class? 
grade_distribution.ppf(0.15)

2.689069983151863

In [29]:
#Scholarship for students in the third decile 
third_decile = [{grade_distribution.ppf(0.2)}, {grade_distribution.ppf(0.3)}]
third_decile

[{2.7475136299281258}, {2.842679846187588}]

In [30]:
#if i have a GPA of 3.5, what percentile am I in? 
grade_distribution.cdf(3.5)

0.9522096477271853

In [None]:
#Binomial Distribution 

In [33]:
#Website has an average click-through rate of 2%. One day: 4326 visitors/97 click throughs
#What is the probability of 97 people or more clicking through?
n_visitors = 4326 
p_clickthrough = 0.02
click_distribution = stats.binom(n_visitors, p_clickthrough)
click_distribution.sf(96)

0.13975823631416445

In [34]:
#Homework 100 questions where answers are probability rounded to hundreths
#Probability that 1 of your first 60 is correct 
#setup 
answers_distribution = stats.binom(60, 0.01)
answers_distribution.sf(0)

0.45284335760923855

In [36]:
#Uniform Distribution 
#Dice 
die_distribution = stats.randint(1, 7)

In [37]:
#Probability rolling a 3 
die_distribution.pmf(3)

0.16666666666666666

In [38]:
#Probability rolling a 3 or lower? 
die_distribution.cdf(3)

0.5

In [40]:
#Given probability, what is the value? 
die_distribution.ppf(5/6)

5.0

In [41]:
#Probability roll higher than 4 
die_distribution.sf(4)

0.33333333333333337

In [None]:
# there is a 1/3 chance a dice roll will be higher than what value?
die_distribution.isf(1/3)

### Hypothesis Testing 

### T-TEST

In [43]:
df = pd.read_csv('telco_churn.csv')
df.head()

Unnamed: 0,customer_id,gender,is_senior_citizen,partner,dependents,phone_service,internet_service,contract_type,payment_type,monthly_charges,total_charges,churn,tenure
0,0002-ORFBO,Female,0,Yes,Yes,1,1,1,Mailed check,65.6,593.3,No,9.0
1,0003-MKNFE,Male,0,No,No,2,1,0,Mailed check,59.9,542.4,No,9.1
2,0004-TLHLJ,Male,0,No,No,1,2,0,Electronic check,73.9,280.85,Yes,3.8
3,0011-IGKFF,Male,1,Yes,No,1,2,0,Electronic check,98.0,1237.85,Yes,12.6
4,0013-EXCHZ,Female,1,Yes,No,1,2,0,Mailed check,83.9,267.4,Yes,3.2


### One Sample T-Test

In [44]:
#H0: Mean of monthly charges of churned customers <= Mean of monthly charges of all customers 
#HA: Mean of monthly charges of churned customers > Mean of monthly charges of all customers 
alpha = 0.05 
#comfirm sample is large enough
df.churn.value_counts()
#compute test statistic and probability (t-statistic & p-value)
churn_sample = df[df.churn == 'Yes'].monthly_charges
overall_mean = df.monthly_charges.mean()
t, p = stats.ttest_1samp(churn_sample, overall_mean)
print(t, p/2, alpha)

16.94463703366894 2.5278893572028665e-60 0.05


In [45]:
if p/2 > alpha:
    print("We fail to reject $H_{0}$")
elif t < 0:
    print("We fail to reject $H_{0}$")
else:
    print("We reject $H_{0}$")

We reject $H_{0}$


In [47]:
#H0: Mean of monthly charges of churned customers = mean of monthly charges of all customers
#HA: mean of monthly charges of churned customers != mean of monthly charges of all customers
#set alpha
alpha = 0.05 
#check if sample is large enough 
df.monthly_charges.value_counts()
#compute statistics 
t, p = stats.ttest_1samp(churn_sample, overall_mean)
print(t, p, alpha)

16.94463703366894 5.055778714405733e-60 0.05


In [90]:
if p > alpha:
    print("We fail to reject $H_{0}$")
else:
    print("We reject $H_{0}$")

We reject $H_{0}$


In [52]:
#H0: Mean of monthly charges of churned customers >= Mean of monthly charges of all customers
#HA: Mean of monthly charges of churned customers < mean of monthly charges of all customers 
#set alpha
alpha = 0.05 
#check to see if sample size is large enough 
df.monthly_charges.value_counts()
#compute test statistics 
t, p = stats.ttest_1samp(churn_sample, overall_mean)
print(t, p/2, alpha)

16.94463703366894 2.5278893572028665e-60 0.05


In [53]:
if p/2 > alpha:
    print("We fail to reject $H_{0}$")
elif t < 0:
    print("We fail to reject $H_{0}$")
else:
    print("We reject $H_{0}$")

We reject $H_{0}$


### Independent T-Test 
### Two Sample T-Test 
#Compare mean of group a to mean of group b 

In [57]:
no_churn_sample = df[df.churn == 'No'].monthly_charges

In [60]:
#H0:Mean of monthly charges of churned customers <= mean of monthly charges of customers who haven't churned 
#HA: Mean of monthly charges of churned customers > mean of monthly charges of customers who haven't churned
#set alpha 
alpha = 0.05 
#I really don't understand how to test if variances are equal 
print(churn_sample.var())
print(no_churn_sample.var())
#compute the test statistic 
t, p = stats.ttest_ind(churn_sample, no_churn_sample, equal_var=False)
print(t, p/2, alpha)


609.121189260892
967.3313950133302
18.38622417075037 6.175772779771316e-73 0.05


In [61]:
if p / 2 > alpha:
    print("We fail to reject $H_{0}$")
elif t < 0:
    print("We fail to reject $H_{0}$")
else:
    print("We reject $H_{0}$")

We reject $H_{0}$


In [62]:
#H0: charges of customers who churn equals that of those who don't churn 
#HA: charges of customers who churn is not equal to that of those who don't churn 
#set alpha 
alpha = 0.05
#compute statistics 
t, p = stats.ttest_ind(churn_sample, no_churn_sample, equal_var=False)
print(t, p, alpha)

18.38622417075037 1.2351545559542633e-72 0.05


In [63]:
if p  > alpha:
    print("We fail to reject $H_{0}$")
else:
    print("We reject $H_{0}$")

We reject $H_{0}$


In [65]:
#H0: Charges of customers who churn is equal or greater than that of those who don't churn 
#HA: Charges of customers who churn is less than that of those who don't churn 
#Set Alpha 
alpha = 0.05 
#Check for variances 
#compute statistics 
t, p = stats.ttest_ind(churn_sample, no_churn_sample, equal_var=False)
print(t, p/2, alpha)

18.38622417075037 6.175772779771316e-73 0.05


In [66]:
if p / 2 > alpha:
    print("We fail to reject $H_{0}$")
elif t < 0:
    print("We fail to reject $H_{0}$")
else:
    print("We reject $H_{0}$")

We reject $H_{0}$


### ANOVA Analysis of Variance 
#Compare means of groups a, b, & c 

In [74]:
df = sns.load_dataset('iris')
df.species.value_counts()

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [76]:
versicolor_sepal_length = df[df.species == 'versicolor'].sepal_length
virginica_sepal_length = df[df.species == 'virginica'].sepal_length
setosa_sepal_length = df[df.species == 'setosa'].sepal_length

In [77]:
#H0: population means of the sepal length for the three species, versicolor, virginica, & setosa, are all equal.
#HA: population means of the sepal length for the three species, versicolor, virginica, & setosa, are NOT all equal.
#Set significance 
alpha = 0.05 
#check for variance 
print(versicolor_sepal_length.var())
print(virginica_sepal_length.var())
print(setosa_sepal_length.var())
#compute test statistic 
f, p = stats.f_oneway(versicolor_sepal_length, virginica_sepal_length, setosa_sepal_length)
print(f, p, alpha)
    

0.2664326530612246
0.40434285714285706
0.12424897959183666
119.26450218450472 1.6696691907693648e-31 0.05


In [78]:
if p < alpha:
    print("We reject $H_{0}$")
else:
    print("We fail to reject $H_{0}$")

We reject $H_{0}$


In [83]:
#import dataset 
df = sns.load_dataset('mpg')
#check sample size 
df.origin.value_counts()
#drop null values 
df.dropna(inplace=True)
#subset dataframe 
usa_hp = df[df.origin == 'usa'].horsepower
japan_hp = df[df.origin == 'japan'].horsepower
eu_hp = df[df.origin == 'europe'].horsepower
#set hypothesis 
#H0: HP is the same across all origins 
#HA: HP is not the same across all origins
#set alpha
alpha = 0.05 
#check variances 
print(usa_hp.var())
print(japan_hp.var())
print(eu_hp.var())


1591.8336567413864
317.5238558909445
406.3397717295875


Looking at the variances, they are very different, so I will move to a 2-sample, independent t-test comparing usa made cars vs. non-usa made cars.

In [84]:
#H0: usa origin cars' hp equals non-usa origin cars' hp 
#HA: usa origin cars' hp does not equal non-usa origin cars' hp
#set alpha 
alpha = 0.05 
#variance check 
usa_hp = df[df.origin == 'usa'].horsepower
non_usa_hp = df[df.origin != 'usa'].horsepower
print(usa_hp.var())
print(non_usa_hp.var())
#set equal variance to false because variance values are far off 
t, p = stats.ttest_ind(usa_hp, non_usa_hp, equal_var=False)
print(t, p, alpha)

1591.8336567413864
356.2380020501352
13.017135027378012 3.384733024288272e-32 0.05


In [91]:
if p > alpha:
    print("We fail to reject $H_{0}$")
else:
    print("We reject $H_{0}$")

We reject $H_{0}$


In [None]:
#Kruskal-Wallis Test, 

In [86]:
stats.kruskal(usa_hp, japan_hp, eu_hp)

KruskalResult(statistic=105.59475799843663, pvalue=1.1759521262123952e-23)

In [87]:
#Correlation 
#Correlation tests are used to check if two samples are related. They are often used for feature selection and multivariate analysis in data preprocessing and exploration


# Chi Square 

In [92]:
#load dataset 
mpg = data('mpg')
mpg['transmission'] = mpg.trans.str[:-4]
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class,transmission
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,auto
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,manual
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,manual
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,auto
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,auto


In [None]:
#is the drive independent of transmission type? 
#H0: drive is independent of transmission type 
#HA: drive is not independent of transmission type 

In [93]:
#first calculate the proportions for the transmission type: 
n = mpg.shape[0]
transmission_proportions = mpg.transmission.value_counts() / n
transmission_proportions

auto      0.67094
manual    0.32906
Name: transmission, dtype: float64

In [None]:
Automatic transmissions: 67%
Manual transmissions 33%

In [94]:
#Second calculate the proportions for the drive types: 
drive_proportions = mpg.drv.value_counts() / n
drive_proportions

f    0.452991
4    0.440171
r    0.106838
Name: drv, dtype: float64

In [None]:
#To find the overall proportions, we multiply all the combinations of proportions together 
#To find the proportion of automatic drive cars with 4 wheel drive, we multiply those two proportions together
.67 * .44 = .2984
#Therefore we would expect 29.84% of the total cars to be automatic and 4-wheel drive 

In [95]:
#to do this in a dataframe you would write this code: 
expected = pd.DataFrame()

for transmission_group, t_prop in transmission_proportions.iteritems():
    for drive_group, d_prop in drive_proportions.iteritems(): 
        expected.loc[drive_group, transmission_group] = t_prop * d_prop

expected.sort_index(inplace=True)
expected

Unnamed: 0,auto,manual
4,0.295328,0.144843
f,0.30393,0.149061
r,0.071682,0.035156


In [96]:
#if we want to convert these proportions to expected numbers of values, we can multiply by the total number of observations
expected *= n 
expected

Unnamed: 0,auto,manual
4,69.106838,33.893162
f,71.119658,34.880342
r,16.773504,8.226496


In [None]:
#easy way 

In [None]:
#data.isnull().sum()
#dropna(inplace=True)