In [9]:
import numpy as np, pandas as pd
import scipy.stats

# First hypothesis - Are first-class roads more deadly than highways?

### Prepare data

1. Extract needed columns from accidents.pkl.gz
2. Create new columns for contingency table

In [10]:
# Load only the two columns this test needs: p13a (compared against 0 to flag
# a fatality) and p36 (road-type code).
accidents_roads = pd.read_pickle("accidents.pkl.gz")[['p13a', 'p36']]

# Keep the two road types being compared. p36 == 1 is treated as a first-class
# road below; code 0 is presumably the highway -- confirm against the data
# dictionary.
# The explicit .copy() makes df_fat an independent frame, so adding columns
# below does not raise SettingWithCopyWarning.
df_fat = accidents_roads[accidents_roads['p36'].isin([0, 1])].copy()

# Boolean flags used as the two axes of the contingency table.
df_fat['fatality'] = df_fat['p13a'] > 0
df_fat['first_class'] = df_fat['p36'] == 1

### Make contingency table

In [11]:
# Contingency table: rows = is the road first class, columns = did the
# accident involve a fatality.
ct = pd.crosstab(
    index=df_fat['first_class'],
    columns=df_fat['fatality'],
)
ct

fatality,False,True
first_class,Unnamed: 1_level_1,Unnamed: 2_level_1
False,24293,166
True,78618,911


### Compare the p-value from the $\chi^2$ test with alpha

In [12]:
# Chi-squared test of independence between road type and fatality.
# stat[1] is the p-value of the test.
stat = scipy.stats.chi2_contingency(ct)
alpha = 0.05

# Share of fatal accidents per road type (row-wise: fatal / all accidents).
# The chi-squared test only says that the shares differ, not which one is
# larger, so the direction has to be checked separately.
fatal_share = ct[True] / ct.sum(axis=1)
first_class_is_deadlier = fatal_share.loc[True] > fatal_share.loc[False]

print(f"p-value: {stat[1]}")
# The hypothesis is supported only when independence is rejected
# (p-value below alpha) AND first-class roads have the higher fatality share.
if stat[1] < alpha and first_class_is_deadlier:
    print("Hypothesis is confirmed")
else:
    print("Hypothesis is declined")

p-value: 3.6067450279444316e-10
Hypothesis is confirmed


# Second hypothesis - Do Škoda cars sustain less damage than Audi cars when involved in an accident?
### Prepare data for Welch's t-test

In [13]:
# Load the column used to pick the car make (p45a) and the column that is
# compared between the two makes (p53).
df_dmg = pd.read_pickle("accidents.pkl.gz").loc[:, ['p45a', 'p53']]

# Split by make code; 39 and 2 are the codes this notebook uses for
# Škoda and Audi respectively.
df_skoda = df_dmg.loc[df_dmg['p45a'].eq(39)]
df_audi = df_dmg.loc[df_dmg['p45a'].eq(2)]

### Calculate p-value with Welch's t-test
This test is used because it tests for a significant difference between the means of two unrelated groups without assuming that the groups have equal variances.

In [14]:
# Welch's t-test (equal_var=False): Škoda sample first, Audi sample second.
# stat[0] is the t-statistic, stat[1] the two-sided p-value.
stat = scipy.stats.ttest_ind(
    df_skoda['p53'],
    df_audi['p53'],
    equal_var=False,
)

print(f"p-value: {stat[1]}")

p-value: 1.2215657690775337e-120


### Calculate degrees of freedom for Welch's t-test
Taken from [https://pythonfordatascienceorg.wordpress.com/welch-t-test-python-pandas/](https://pythonfordatascienceorg.wordpress.com/welch-t-test-python-pandas/)

In [15]:
# Welch–Satterthwaite approximation of the degrees of freedom.
x = df_skoda['p53']
y = df_audi['p53']

# Squared standard error of each sample mean: variance / sample size.
se2_x = x.var() / x.size
se2_y = y.var() / y.size

dof = (se2_x + se2_y)**2 / (
    se2_x**2 / (x.size - 1)
    + se2_y**2 / (y.size - 1)
)
dof

12065.653204689757

### Compare values with alpha
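The two-sided p-value is divided by 2 so that a one-tailed test can be performed. The null hypothesis $H_0$ is that the mean damage of Škoda cars is greater than or equal to that of Audi cars. To confirm our hypothesis we need to reject $H_0$, so the halved p-value must be less than alpha. Halving is only valid when the t-statistic is negative, i.e. when the Škoda sample mean is the lower one.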

In [16]:
alpha = 0.05

# One-tailed decision built from the two-sided Welch result in `stat`
# (stat[0] = t-statistic, stat[1] = two-sided p-value; Škoda was passed as
# the first sample).
# Halving the p-value is only valid when the statistic has the sign the
# alternative hypothesis predicts: a negative t means the Škoda mean is
# below the Audi mean. Without the sign check, a strong effect in the
# opposite direction would also be reported as a confirmation.
skoda_mean_is_lower = stat[0] < 0

if skoda_mean_is_lower and stat[1] / 2 < alpha:
    print("Hypothesis is confirmed")
else:
    print("Hypothesis is declined")

Hypothesis is confirmed
