In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import norm, ttest_1samp
from statsmodels.stats.weightstats import ztest

from bioinfokit.analys import stat
## You can install bioinfokit using !pip install bioinfokit

In [14]:
# Worked example: right-tailed one-sample z-test with known population parameters.
mu, std = 1800, 100   # population mean and standard deviation
n = 5                 # sample size
obs = 1900            # observed sample mean

# Standard error of the mean, then the z-score of the observation.
standard_error = std / np.sqrt(n)
z = (obs - mu) / standard_error

# Upper-tail probability of seeing a z-score at least this large.
p_value = 1 - norm.cdf(z)
print(p_value)

0.0126736593387341


### Population Average and standard deviation

In [47]:
df = pd.read_csv("Sachin_ODI.csv")

# The full "runs" column is treated as the population for the tests that follow.
# NOTE(review): pandas' .std() defaults to the sample formula (ddof=1) — confirm
# whether ddof=0 is wanted for a population standard deviation.
runs = df["runs"]
mu, sigma = runs.mean(), runs.std()

print("Mean: ", mu, " Std: ", sigma)

Mean:  43.24166666666667  Std:  42.18245206052442


In [17]:
# Average runs per innings number (the value is displayed as the cell output).
df.groupby("Inns")["runs"].mean()

Inns
1    46.670588
2    40.173684
Name: runs, dtype: float64

### India batting first - Does Sachin do better here?

H0: When India bats first, Sachin does not score more than his overall average.

Ha: When India bats first, Sachin scores more than his overall average.

In [34]:
# Rows where Inns == 1; selecting ["runs"] as a list keeps a one-column DataFrame.
first_innings_runs = df.loc[df["Inns"] == 1, ["runs"]]
data = first_innings_runs

In [36]:
# Right-tailed one-sample z-test of the mean of `data` against the overall mean `mu`,
# using the overall standard deviation `sigma`.
standard_error = sigma / np.sqrt(len(data))
z = (data.mean() - mu) / standard_error
p = 1 - norm.cdf(z)

print("p-value: ", p)

# Decide at the 5% significance level.
reject_h0 = p < 0.05
if reject_h0:
    print("Reject H0")
else:
    print("Stick to your null hypothesis")
    print("No difference between first innings and overall mean")

p-value:  [0.14460321]
Stick to your null hypothesis
No difference between first innings and overall mean


### India Winning the match - Does Sachin score higher if India wins?

H0: Sachin's mean score when India wins is not higher than his overall mean score.

H1: Sachin scores higher when India wins.

In [26]:
# Average runs split by the Won flag (the value is displayed as the cell output).
df.groupby("Won")["runs"].mean()

Won
False    35.130682
True     51.000000
Name: runs, dtype: float64

In [42]:
# Runs from the matches India won; [["runs"]] keeps a one-column DataFrame.
all_victory_runs = df[df["Won"]==True][["runs"]]
data = all_victory_runs

# Right-tailed one-sample z-test: is the mean in wins greater than the overall
# mean `mu`, given the overall standard deviation `sigma`?
z = (data.mean() - mu) / (sigma / np.sqrt(len(data)))
p = 1 - norm.cdf(z)

print("n: ", len(data))
print("p-value: ", p)

# Decide at the 5% significance level.
if p < 0.05:
    print("Reject H0")
else:
    print("Stick to your null hypothesis")
    # This message previously referred to "first innings", copied from the earlier test.
    print("No difference between runs in India's wins and overall mean")

n:  184
p-value:  [0.00630041]
Reject H0


### What to do if population standard deviation is not known?
Practically speaking, the population standard deviation is almost never known exactly. Hence, we estimate it from the sample and switch from the z-test to the t-test. However, if "n" is large, the t-test and the z-test give approximately the same results.

In [59]:
# Runs from the matches India won, kept as a one-column DataFrame.
all_victory_runs = df.loc[df["Won"] == True, ["runs"]]
data = all_victory_runs

In [60]:
# One-sample z-test of `data` against `mu`. No population sigma is passed here,
# so the spread presumably comes from `data` itself — verify against the statsmodels docs.
ztest(data, value=mu, alternative="larger") # Right-tailed test; displays (z_stat, p_value)

(array([2.35695059]), array([0.00921285]))

In [61]:
# One-sample t-test of `data` against `mu`, for comparison with the z-test result.
ttest_1samp(data, popmean=mu, alternative="greater") # Right-tailed test; displays (t_stat, p_value)

Ttest_1sampResult(statistic=array([2.35695059]), pvalue=array([0.00974129]))

In [49]:
# Number of observations in `data` (the sample size behind the tests on it).
len(data)

184