In [8]:
import pandas as pd
import numpy as np
from scipy import stats
from pydataset import data
import env
url = env.get_db_url('employees')

1) Answer with the type of stats test you would use (assume normal distribution):
    a) Do students get better test grades if they have a rubber duck on their desk?
        - single sample T-test with one tail
    
    b)Does smoking affect when (whether) or not someone has lung cancer?
        - discreet vs discreet; chi-squared test
    
    c)Is gender independent of a person’s blood type?
        - ANOVA 
    
    d)A farming company wants to know if a new fertilizer has improved crop yield or not
        - single sample, single tail T-test
    
    e)Does the length of time of the lecture correlate with a students grade?
        - two sample correlation test
    
    f)Do people with dogs live in apartments more than people with cats?
        - single sample t test

#### 2) Use the following contingency table to help answer the question of whether using a macbook and being a codeup student are independent of each other.

In [7]:
computer_breakdown =  pd.DataFrame(np.array([[49, 20],[1,30]]),columns=['CodeUp','not_CodeUp'], index=['Mac_le_More','Mac_le_Less'])

In [8]:
computer_breakdown

Unnamed: 0,CodeUp,not_CodeUp
Mac_le_More,49,20
Mac_le_Less,1,30


In [11]:
chi2, p, degf, expected = stats.chi2_contingency(computer_breakdown)

In [12]:
print('Observed')
print(computer_breakdown.values)
print('\nExpected')
print(expected.astype(int))
print('\n----')
print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')

Observed
[[49 20]
 [ 1 30]]

Expected
[[34 34]
 [15 15]]

----
chi^2 = 36.6526
p     = 0.0000


### from this, we can conlude there is a strong correlation between students being in codeup and having a mac computer


#### 3) Choose another 2 categorical variables from the mpg dataset and perform a chi^2 contingency table test with them. Be sure to state your null and alternative hypotheses.

In [15]:
mpg = data('mpg')

In [16]:
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [18]:
mpg.nunique()

manufacturer    15
model           38
displ           35
year             2
cyl              4
trans           10
drv              3
cty             21
hwy             27
fl               5
class            7
dtype: int64

In [19]:
# drive options vs. year made will result in a 2x3 contingency table
# drive options: front, rear, all
# year options: 1999, 2008

### hypotheses: 
$H_0$: there is no correlation between the year a car was made and the type of drive it has
\
$H_A$: there is a correlation between the year a car was made and the type of drive it has

In [21]:
observed = pd.crosstab(mpg.year, mpg.drv)
observed

drv,4,f,r
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1999,49,57,11
2008,54,49,14


In [22]:
chi2, p, degf, expected = stats.chi2_contingency(observed)


In [23]:
print('Observed')
print(observed.values)
print('\nExpected')
print(expected.astype(int))
print('\n----')
print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')

Observed
[[49 57 11]
 [54 49 14]]

Expected
[[51 53 12]
 [51 53 12]]

----
chi^2 = 1.2065
p     = 0.5470


#### #4 Use the data from the employees database to answer these questions:

a) Is an employee's gender independent of whether an employee works in sales or marketing? (only look at current employees)

In [15]:
alpha = 0.05

In [2]:
query = ''' 
        select emp_no, gender, dept_name
        from employees
            join dept_emp
                using (emp_no)
            join departments
                using (dept_no)
        where to_date = '9999-01-01'
        and dept_name in ('Marketing','Sales')
        '''

In [9]:
df = pd.read_sql(query, url)
df


Unnamed: 0,emp_no,gender,dept_name
0,10017,F,Marketing
1,10058,M,Marketing
2,10140,F,Marketing
3,10228,F,Marketing
4,10239,F,Marketing
...,...,...,...
52538,499966,F,Sales
52539,499976,M,Sales
52540,499980,M,Sales
52541,499986,F,Sales


In [10]:
df.gender.value_counts()
df.dept_name.value_counts()

Sales        37701
Marketing    14842
Name: dept_name, dtype: int64

In [11]:
observed = pd.crosstab(df.gender, df.dept_name)
observed

dept_name,Marketing,Sales
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,5864,14999
M,8978,22702


In [12]:
chi2, p, degf, expected = stats.chi2_contingency(observed)

In [13]:
print('Observed')
print(observed.values)
print('\nExpected')
print(expected.astype(int))
print('\n----')
print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')

Observed
[[ 5864 14999]
 [ 8978 22702]]

Expected
[[ 5893 14969]
 [ 8948 22731]]

----
chi^2 = 0.3240
p     = 0.5692


In [16]:
if p < alpha:
    print('We reject the null hypothesis')
else:
    print('We fail to reject the null hypothesis')


We fail to reject the null hypothesis


B) is an employee's gender independednt of whether or not they are or have been a manager?

In [17]:
query = '''
        select emp_no, gender, dept_no
        from employees
            left join dept_manager
                using (emp_no)
        '''

In [18]:
df = pd.read_sql(query, url)
df

Unnamed: 0,emp_no,gender,dept_no
0,10001,M,
1,10002,F,
2,10003,M,
3,10004,M,
4,10005,M,
...,...,...,...
300019,499995,F,
300020,499996,M,
300021,499997,M,
300022,499998,M,


In [19]:
df.dept_no.value_counts(dropna=False)


None    300000
d004         4
d006         4
d009         4
d001         2
d002         2
d003         2
d005         2
d007         2
d008         2
Name: dept_no, dtype: int64

In [20]:
df.dept_no = df.dept_no.fillna('not manager')

In [21]:
df.dept_no = np.where( df.dept_no != 'not manager', 
                      'manager', 'not manager')

In [22]:
observed = pd.crosstab(df.gender, df.dept_no)
observed

dept_no,manager,not manager
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,13,120038
M,11,179962


In [23]:
chi2, p, degf, expected = stats.chi2_contingency(observed)


In [24]:
print('Observed')
print(observed.values)
print('\nExpected')
print(expected.astype(int))
print('\n----')
print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')

Observed
[[    13 120038]
 [    11 179962]]

Expected
[[     9 120041]
 [    14 179958]]

----
chi^2 = 1.4567
p     = 0.2275


In [25]:
if p < alpha:
    print('We reject the null hypothesis')
else:
    print("we fail to reject the null hypothesis")

we fail to reject the null hypothesis
