# Principais Funções para Testes de Hipóteses

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from scipy import stats as st

### Trazendo os Dados

In [3]:
# Load the A/B-test click data.
# The relative path uses forward slashes so the cell runs on Windows, Linux
# and macOS alike (backslash separators only resolve on Windows).
dados_ab = pd.read_csv("../13-Teste-Hipoteses/Dados/exampleDataABtest.csv")
dados_ab.head()

Unnamed: 0,group,time,clickedTrue
0,A,2016-06-02 02:17:53,0
1,A,2016-06-02 03:03:54,0
2,A,2016-06-02 03:18:56,1
3,B,2016-06-02 03:23:43,0
4,A,2016-06-02 04:04:00,0


In [4]:
# Load the NPS survey data (semicolon-delimited file).
# The relative path uses forward slashes so the cell runs on Windows, Linux
# and macOS alike (backslash separators only resolve on Windows).
dados_nps = pd.read_csv("../13-Teste-Hipoteses/Dados/nps_example.csv", sep=";")
dados_nps.head()

Unnamed: 0,id,response_status,how_long_listening,age,nps_score,gender
0,11706300,Complete,Less than 6 months,25-34,10.0,Female
1,11706302,Complete,1 year to less than 3 years,25-34,10.0,Female
2,11706307,Complete,6 months to less than a year,35-44,10.0,Female
3,11706312,Complete,Less than 6 months,35-44,10.0,Female
4,11706316,Complete,6 months to less than a year,25-34,10.0,Male


# Exemplo Teste t

Comparação do número de cliques quanto aos sites A e B

- Comparar os Grupos

In [5]:
# Per-group summary of the click indicator: mean, standard deviation and
# number of non-null observations.
(
    dados_ab
    .groupby('group')
    .agg(
        media_cliques=('clickedTrue', 'mean'),
        dp_cliques=('clickedTrue', 'std'),
        n=('clickedTrue', 'count'),
    )
)

Unnamed: 0_level_0,media_cliques,dp_cliques,n
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.04,0.196155,500
B,0.08,0.271565,500


In [7]:
# Click indicator of each group, selected in a single .loc step (row mask + column).
grA = dados_ab.loc[dados_ab['group'] == 'A', 'clickedTrue']
grB = dados_ab.loc[dados_ab['group'] == 'B', 'clickedTrue']


In [9]:
# Two-sided independent-samples t-test on the click indicator of groups A and B.
st.ttest_ind(
    grA,
    grB,
    alternative='two-sided',
)

TtestResult(statistic=np.float64(-2.669938469060931), pvalue=np.float64(0.007709783987515948), df=np.float64(998.0))

- se o p-valor for menor que 0.05, rejeito a hipótese nula
    - p_value -> 0.0077 (< 0.05), logo rejeito a hipótese nula de que as médias de cliques de A e B são iguais


In [None]:
# Pingouin: same grA vs grB comparison, two-sided, with confidence=0.95.
# NOTE(review): `pg` is not imported in any visible cell, and the recorded
# output of this cell is a NameError. Presumably `import pingouin as pg` is
# intended -- confirm and add it to the imports cell at the top.
pg.ttest(grA, grB, alternative='two-sided', confidence=0.95)

NameError: name 'pg' is not defined

# Exemplo Teste F

Comparar os valores de NPS quanto às faixas etárias

- Verificar questionários respondidos completamente

In [12]:
# Number of questionnaires per response status, as a two-column frame
# (response_status, n).
(
    dados_nps
    .groupby('response_status')
    .size()
    .reset_index(name='n')
)

Unnamed: 0,response_status,n
0,Complete,2281
1,Incomplete,265
2,Terminated,33


In [15]:
# Rows whose NPS score is missing.
dados_nps.loc[dados_nps['nps_score'].isna()]

Unnamed: 0,id,response_status,how_long_listening,age,nps_score,gender
17,11706467,Incomplete,Less than 6 months,18-24,,
31,11706938,Incomplete,1 year to less than 3 years,25-34,,
32,11706979,Incomplete,6 months to less than a year,25-34,,
43,11707426,Incomplete,6 months to less than a year,25-34,,
48,11707719,Incomplete,3 years to less than 5 years,35-44,,
...,...,...,...,...,...,...
2546,13093216,Incomplete,6 months to less than a year,35-44,,
2556,13278063,Incomplete,3 years to less than 5 years,18-24,,
2570,13565327,Complete,1 year to less than 3 years,45-54,,Female
2572,13601847,Incomplete,3 years to less than 5 years,25-34,,


In [17]:
# Keep only questionnaires marked 'Complete' that also carry an NPS score
# (the null check above shows a 'Complete' row with a missing score).
dados_nps_filtrados = dados_nps.loc[
    dados_nps['response_status'].eq('Complete')
    & dados_nps['nps_score'].notna()
]
dados_nps_filtrados.head()

Unnamed: 0,id,response_status,how_long_listening,age,nps_score,gender
0,11706300,Complete,Less than 6 months,25-34,10.0,Female
1,11706302,Complete,1 year to less than 3 years,25-34,10.0,Female
2,11706307,Complete,6 months to less than a year,35-44,10.0,Female
3,11706312,Complete,Less than 6 months,35-44,10.0,Female
4,11706316,Complete,6 months to less than a year,25-34,10.0,Male


In [18]:
# NPS summary per age bracket: mean, standard deviation and group size.
(
    dados_nps_filtrados
    .groupby('age')
    .agg(
        media_nps=('nps_score', 'mean'),
        dp_nps=('nps_score', 'std'),
        n=('nps_score', 'size'),
    )
    .reset_index()
)

Unnamed: 0,age,media_nps,dp_nps,n
0,18-24,9.464539,1.116275,282
1,25-34,9.694828,0.957639,580
2,35-44,9.707612,0.979501,578
3,45-54,9.719039,0.928254,541
4,55-64,9.733871,0.92302,248
5,65-74,9.423077,1.36156,26
6,75+,8.0,0.0,2


In [19]:
# Drop the '75+' bracket (the summary above shows only 2 respondents in it).
dados_nps_filtrados_aj = dados_nps_filtrados.loc[dados_nps_filtrados['age'].ne('75+')]

In [20]:
# Same per-age NPS summary (mean, standard deviation, group size), now computed
# on the frame without the '75+' bracket.
(
    dados_nps_filtrados_aj
    .groupby('age')
    .agg(
        media_nps=('nps_score', 'mean'),
        dp_nps=('nps_score', 'std'),
        n=('nps_score', 'size'),
    )
    .reset_index()
)

Unnamed: 0,age,media_nps,dp_nps,n
0,18-24,9.464539,1.116275,282
1,25-34,9.694828,0.957639,580
2,35-44,9.707612,0.979501,578
3,45-54,9.719039,0.928254,541
4,55-64,9.733871,0.92302,248
5,65-74,9.423077,1.36156,26


- teste scipy

In [21]:
# One NPS-score Series per age bracket. The generator yields the Series in the
# same order as the target names on the left-hand side.
(
    dados_18_24,
    dados_25_34,
    dados_35_44,
    dados_45_54,
    dados_55_64,
    dados_65_74,
) = (
    dados_nps_filtrados_aj.loc[dados_nps_filtrados_aj['age'] == faixa, 'nps_score']
    for faixa in ('18-24', '25-34', '35-44', '45-54', '55-64', '65-74')
)

In [22]:
# One-way ANOVA (F test) comparing the NPS scores across the six age brackets.
amostras_por_faixa = [
    dados_18_24,
    dados_25_34,
    dados_35_44,
    dados_45_54,
    dados_55_64,
    dados_65_74,
]
st.f_oneway(*amostras_por_faixa)

F_onewayResult(statistic=np.float64(3.522166098104082), pvalue=np.float64(0.0035606861304276695))

- se o p-valor for menor que 0.05, rejeito a hipótese nula
    - p_value -> 0.0036 (< 0.05), logo rejeito a hipótese nula de que as médias de NPS são iguais entre as faixas etárias
