In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib
import matplotlib.pyplot as plt
from sklearn import datasets
import seaborn as sns

import scipy.stats as stats
from scipy.stats import uniform, binom, norm, kstest, shapiro 

import statsmodels.api as sm
warnings.filterwarnings('ignore')


### Experiments (t-tests) were conducted at gas stations with __standard equipment__ and with __special equipment__ (audio, video, etc.)

# T-test based on average check

In [4]:
dfpilot = pd.read_sql('''
select org_ksss, round(sum(volume)/count(distinct cheque_id),2) as average_check
from 
(select * from pet.pet_0822
union all
select * from pet.pet_0922) as pet 
where org_ksss in ('159202','159203','3179','157202','66202','1274','1273','3374','136813','8729','134201','136603','136602','3359') --pilot
and volume is not null and discount_card_info is not null
group by org_ksss
''', connection)

In [5]:
dfcontr = pd.read_sql('''
select org_ksss, round(sum(volume)/count(distinct cheque_id),2) as average_check
from 
(select * from pet.pet_0822
union all
select * from pet.pet_0922) as pet 
where org_ksss in ('3748','136922','125638','118601','133719','136802','133696','3375','3363','104801','62403','63604','153810',
'3171','136938','125622','125609','94005','3357','64404','133676','9753','133642','6416','62405','158401','5099','3217','136811',
'136921','7330','9748','127001','3355','125627','127801','3364','5063','62201','92402','3177','5037','60604','5088')     --kontr
and volume is not null and discount_card_info is not null
group by org_ksss
''', connection)

In [7]:
dfpilot['label']=1
dfcontr['label']=0
data = pd.concat([dfpilot,dfcontr], axis = 0, ignore_index = False)
data = data.reset_index(drop = True)

In [3]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
groups = sns.histplot( data = data,
                    x = 'average_check',
                    hue = 'label',
                    palette = ['r', 'b'],
                    alpha=0.5,
                    kde=True)
plt.show()

In [15]:
kstest(dfpilot['average_check'], 'norm')

KstestResult(statistic=1.0, pvalue=0.0)

In [16]:
kstest(dfcontr['average_check'], 'norm')

KstestResult(statistic=1.0, pvalue=0.0)

In [17]:
res = sm.stats.ttest_ind(dfcontr['average_check'], dfpilot['average_check'])
print("Two Sample t-test")
print("t =", res[0])
print("p-value =", res[1])
print("df = ", res[2])

if res[1] > 0.05:
    print('There is a statistical significance. There is a difference between average checks in 2 expirements')
else: 
    print('There is no statistical significance. There is no difference between average checks in 2 expirements')

Two Sample t-test
t = -1.8649365672733214
p-value = 0.06743332952540156
df =  56.0


# T-test based on conversion

In [18]:
dfpilotcof = pd.read_sql('''
select org_ksss, round(sum(coffee)/sum(fuel),3) as conv
from
(
select 
    org_ksss, 
    count (distinct case when volume is not null then cheque_id else null end) as fuel,
    count(distinct case when quantity is not null and gds_ksss in ('1784732', '1684653', '1745331', '3023264', '3023268', '3023267', '3180431','3387693','3387696',
'3387702',
'3388224',
'3180439',
'3180436',
'3388235') 
            then cheque_id else null end) as coffee
from 
(select * from pet.pet_0822
union all 
select * from pet.pet_0922) pet
where discount_card_info is not null 
and org_ksss in ('159202','159203','3179','157202','66202','1274','1273','3374','136813','8729','134201','136603','136602','3359') --pilot
group by org_ksss
) as a
group by org_ksss
''', connection)

In [20]:
dfcontrcof = pd.read_sql('''
select org_ksss, round(sum(coffee)/sum(fuel),3) as conv
from
(
select 
    org_ksss, 
    count (distinct case when volume is not null then cheque_id else null end) as fuel,
    count(distinct case when quantity is not null and gds_ksss in ('1784732', '1684653', '1745331', '3023264', '3023268', '3023267', '3180431','3387693','3387696',
'3387702',
'3388224',
'3180439',
'3180436',
'3388235') then cheque_id else null end) as coffee
from 
(select * from pet.pet_0822
union all 
select * from pet.pet_0922) pet
where discount_card_info is not null 
and org_ksss in ('3748','136922','125638','118601','133719','136802','133696','3375','3363','104801','62403','63604','153810',
'3171','136938','125622','125609','94005','3357','64404','133676','9753','133642','6416','62405','158401','5099','3217','136811',
'136921','7330','9748','127001','3355','125627','127801','3364','5063','62201','92402','3177','5037','60604','5088')     --kontr
group by org_ksss
) as a
group by org_ksss
''', connection)

In [21]:
dfpilotcof['label']=1
dfcontrcof['label']=0
data2 = pd.concat([dfpilotcof,dfcontrcof], axis = 0, ignore_index = True)

In [4]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
groups = sns.histplot( data = data2,
                    x = 'conv',
                    hue = 'label',
                    palette = ['r', 'b'],
                    alpha=0.5,
                    kde=True)
plt.show()

In [24]:
kstest(dfpilotcof['conv'], 'norm')

KstestResult(statistic=0.5087760222306617, pvalue=0.0006946833309473491)

In [25]:
kstest(dfcontrcof['conv'], 'norm')

KstestResult(statistic=0.5079783137169019, pvalue=4.761441751187536e-11)

In [26]:
shapiro(dfpilotcof['conv'])

ShapiroResult(statistic=0.911939799785614, pvalue=0.16814209520816803)

In [27]:
shapiro(dfcontrcof['conv'])

ShapiroResult(statistic=0.924034059047699, pvalue=0.00650955131277442)

In [28]:
res = sm.stats.ttest_ind(dfcontrcof['conv'], dfpilotcof['conv'])
print("Two Sample t-test")
print("t =", res[0])
print("p-value =", res[1])
print("df = ", res[2])

if res[1] > 0.05:
    print('There is a statistical significance. There is a difference between conversion in 2 expirements')
else: 
    print('There is no statistical significance. There is no difference between conversion in 2 expirements')

Two Sample t-test
t = -0.17165726001689416
p-value = 0.8643257963787294
df =  56.0


# T-test based on number of checks

In [29]:
dfpilotch = pd.read_sql('''
select org_ksss, count(distinct cheque_id) as chk
from 
(select * from pet.pet_0822
union all 
select * from pet.pet_0922) pet
where org_ksss in ('159202','159203','3179','157202','66202','1274','1273','3374','136813','8729','134201','136603','136602','3359') --pilot
group by org_ksss
''', connection)                    

In [30]:
dfcontrch = pd.read_sql('''
select org_ksss, count(distinct cheque_id) as chk
from 
(select * from pet.pet_0822
union all 
select * from pet.pet_0922) pet
where org_ksss in ('3748','136922','125638','118601','133719','136802','133696','3375','3363','104801','62403','63604','153810',
'3171','136938','125622','125609','94005','3357','64404','133676','9753','133642','6416','62405','158401','5099','3217','136811',
'136921','7330','9748','127001','3355','125627','127801','3364','5063','62201','92402','3177','5037','60604','5088')     --kontr
group by org_ksss
''', connection)     

In [31]:
dfpilotch['label']=1
dfcontrch['label']=0
data3 = pd.concat([dfpilotch,dfcontrch], axis=0, ignore_index=True)

In [5]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
groups = sns.histplot( data = data3,
                    x = 'chk',
                    hue = 'label',
                    palette = ['r', 'b'],
                    alpha=0.5,
                    kde=True)
plt.show()

In [34]:
kstest(dfpilotch['chk'], 'norm')

KstestResult(statistic=1.0, pvalue=0.0)

In [35]:
kstest(dfcontrch['chk'], 'norm')

KstestResult(statistic=1.0, pvalue=0.0)

In [36]:
res = sm.stats.ttest_ind(dfcontrch['chk'], dfpilotch['chk'])
print("Two Sample t-test")
print("t =", res[0])
print("p-value =", res[1])
print("df = ", res[2])

if res[1] > 0.05:
    print('There is a statistical significance. There is a difference between amount of checks sold in 2 expirements')
else: 
    print('There is no statistical significance. There is no difference between amount of checks sold in 2 expirements')

Two Sample t-test
t = -1.8395266861062054
p-value = 0.07114071462802875
df =  56.0


# T-test based on number of checks with coffee

In [38]:
dfpilot_ch_cof = pd.read_sql('''
select 
    org_ksss, 
    count(distinct case when quantity is not null and gds_ksss in ('1784732', '1684653', '1745331', '3023264', '3023268', '3023267', '3180431','3387693','3387696',
'3387702',
'3388224',
'3180439',
'3180436',
'3388235')
            then cheque_id else null end) as coffee
from
(select * from pet.pet_0822
union all 
select * from pet.pet_0922) pet
where org_ksss in ('159202','159203','3179','157202','66202','1274','1273','3374','136813','8729','134201','136603','136602','3359') --pilot
group by org_ksss
''', connection)

In [39]:
dfcontr_ch_cof = pd.read_sql('''
select 
    org_ksss, 
    count(distinct case when quantity is not null and gds_ksss in ('1784732', '1684653', '1745331', '3023264', '3023268', '3023267', '3180431','3387693','3387696',
'3387702',
'3388224',
'3180439',
'3180436',
'3388235') then cheque_id else null end) as coffee
from
(select * from pet.pet_0822
union all 
select * from pet.pet_0922) pet
where org_ksss in ('3748','136922','125638','118601','133719','136802','133696','3375','3363','104801','62403','63604','153810',
'3171','136938','125622','125609','94005','3357','64404','133676','9753','133642','6416','62405','158401','5099','3217','136811',
'136921','7330','9748','127001','3355','125627','127801','3364','5063','62201','92402','3177','5037','60604','5088')     --kontr
group by org_ksss
''',connection)

In [40]:
dfpilot_ch_cof['label']=1
dfcontr_ch_cof['label']=0
data_coffee = pd.concat([dfpilot_ch_cof,dfcontr_ch_cof], axis=0, ignore_index=True)

In [6]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
groups = sns.histplot( data = data_coffee,
                    x = 'coffee',
                    hue = 'label',
                    palette = ['r', 'b'],
                    alpha=0.5,
                    kde=True)
plt.show()

In [46]:
kstest(dfpilot_ch_cof['coffee'], 'norm')

KstestResult(statistic=1.0, pvalue=0.0)

In [47]:
kstest(dfcontr_ch_cof['coffee'], 'norm')

KstestResult(statistic=1.0, pvalue=0.0)

In [48]:
res = sm.stats.ttest_ind(dfcontrch['chk'], dfpilotch['chk'])
print("Two Sample t-test")
print("t =", res[0])
print("p-value =", res[1])
print("df = ", res[2])

if res[1] > 0.05:
    print('There is a statistical significance. There is a difference between amount of checks sold with coffee in 2 expirements')
else: 
    print('There is no statistical significance. There is no difference between amount of checks sold with coffee in 2 expirements')

Two Sample t-test
t = -1.8395266861062054
p-value = 0.07114071462802875
df =  56.0


# T-test based on AZS rating and amount of rates

## 08, 09

In [68]:
dfpilot_azsrate = pd.read_sql('''

select  station_name, a.amount as amount, round(sum(a.sum)/sum(a.amount),3) as rate
from
(
select 
    station_name,
    count (rate) as amount,
    sum (rate) as sum
from 
public.azs_rating
where  date_created >= '2022-08-01' and  date_created < '2022-10-01'
and station_name in ('АЗС №50624','АЗС №36712','АЗС № 36712','АЗС № 23284','АЗС №23284','АЗС №23043','АЗС № 23043','АЗС №01159','АЗС № 01159','АЗС №23006','АЗС № 23006','АЗС №77511','АЗС № 77511','АЗС №77624','АЗС № 77624','АЗС №50623','АЗС № 26078','АЗС №26078','АЗС № 23373','АЗС №23373','АЗС № 23044','АЗС №23044','АЗС №77621','АЗС № 77621','АЗС №23374','АЗС № 23374')
group by station_name
) as a
group by station_name, amount, sum
''', connection)

In [71]:
dfcontr_azsrate = pd.read_sql('''

select  station_name, a.amount as amount, round(sum(a.sum)/sum(a.amount),3) as rate
from
(
select 
    station_name,
    count (rate) as amount,
    sum (rate) as sum
from 
public.azs_rating
where  date_created >= '2022-08-01' and  date_created < '2022-10-01'
and station_name in ('АЗС № 50407','АЗС №50407','АЗС №77617','АЗС № 77617','АЗС №34450','АЗС № 34450','АЗС №50264','АЗС № 50264','АЗС №26077','АЗС № 26077','АЗС №61315','АЗС № 61315','АЗС № 34510','АЗС №34510','АЗС № 50244','АЗС №50244',
'АЗС №77607','АЗС № 77607','АЗС №61361','АЗС № 61361','АЗС № 77571','АЗС №77571','АЗС №69472','АЗС № 61326','АЗС №61326','АЗС № 34488',
'АЗС №34488','АЗС № 61297','АЗС №61297','АЗС №77620','АЗС № 77620','АЗС №36720','АЗС № 36720','АЗС №50427','АЗС № 50427','АЗС № 23275',
'АЗС №23275','АЗС № 77516','АЗС №77516','АЗС №26099','АЗС № 26099','АЗС № 77618','АЗС №77618','АЗС №23282','АЗС № 23282','АЗС №23170','АЗС № 23170',
'АЗС № 36722','АЗС №36722','АЗС №23017','АЗС № 23017','АЗС №77596','АЗС № 77596','АЗС №08546','АЗС № 08546','АЗС № 23223','АЗС №23223',
'АЗC №36730','АЗC № 36730','АЗС № 26359','АЗС №26359','АЗС № 77544','АЗС №77544','АЗС №36716','АЗС № 36716','АЗС № 61310',
'АЗС №61310','АЗС № 69471','АЗС №69471','АЗС № 23010','АЗС №23010','АЗС №23045','АЗС № 23045','АЗС №77502','АЗС № 77502','АЗС №36703','АЗС № 36703','АЗС № 77619','АЗС №77619','АЗС №23261','АЗС № 23070','АЗС №23070','АЗС №50488','АЗС № 23229','АЗС №23229')
 group by station_name
) as a
group by station_name, amount, sum

''', connection)

In [54]:
dfpilot_azsrate['label']=1
dfcontr_azsrate['label']=0
dataazs=pd.concat([dfpilot_azsrate,dfcontr_azsrate], axis=0, ignore_index=True)

### AZS rating 08, 09

In [7]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
groups = sns.histplot( data = dataazs,
                    x = 'rate',
                    hue = 'label',
                    palette = ['r', 'b'],
                    alpha=0.5,
                    kde=True)
plt.show()

In [58]:
kstest(dfpilot_azsrate['rate'], 'norm')

KstestResult(statistic=0.9999992929251458, pvalue=1.5615125997642016e-86)

In [59]:
kstest(dfcontr_azsrate['rate'], 'norm')

KstestResult(statistic=0.9999993173347475, pvalue=1.0143156022306871e-271)

In [60]:
res = sm.stats.ttest_ind(dfcontr_azsrate['rate'], dfpilot_azsrate['rate'])
print("Two Sample t-test")
print("t =", res[0])
print("p-value =", res[1])
print("df = ", res[2])

if res[1] > 0.05:
    print('There is a statistical significance. There is a difference between AZS rating in 2 expirements')
else: 
    print('There is no statistical significance. There is no difference between AZS rating in 2 expirements')

Two Sample t-test
t = 1.030744823966293
p-value = 0.3070918118671506
df =  56.0


### Amount of rates 08,09 

In [8]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
groups = sns.histplot( data = dataazs,
                    x = 'amount',
                    hue = 'label',
                    palette = ['r', 'b'],
                    alpha=0.5,
                    kde=True)
plt.show()

In [62]:
kstest(dfpilot_azsrate['amount'], 'norm')

KstestResult(statistic=1.0, pvalue=0.0)

In [63]:
kstest(dfcontr_azsrate['amount'], 'norm')

KstestResult(statistic=1.0, pvalue=0.0)

In [65]:
res = sm.stats.ttest_ind(dfcontr_azsrate['amount'], dfpilot_azsrate['amount'])
print("Two Sample t-test")
print("t =", res[0])
print("p-value =", res[1])
print("df = ", res[2])

if res[1] > 0.05:
    print('There is a statistical significance. There is a difference between amount of rates in 2 expirements')
else: 
    print('There is no statistical significance. There is no difference between amount of rates in 2 expirements')

Two Sample t-test
t = -1.0888557681261295
p-value = 0.28088128182315436
df =  56.0


##  06, 07 

In [75]:
dfpilot_azsrate_67 = pd.read_sql('''

select  station_name, a.amount as amount, round(sum(a.sum)/sum(a.amount),3) as rate
from
(
select 
    station_name,
    count (rate) as amount,
    sum (rate) as sum
from 
public.azs_rating
where  date_created >= '2022-06-01' and  date_created < '2022-08-01'
and station_name in ('АЗС №50624','АЗС №36712','АЗС № 36712','АЗС № 23284','АЗС №23284','АЗС №23043','АЗС № 23043','АЗС №01159','АЗС № 01159','АЗС №23006','АЗС № 23006','АЗС №77511','АЗС № 77511','АЗС №77624','АЗС № 77624','АЗС №50623','АЗС № 26078','АЗС №26078','АЗС № 23373','АЗС №23373','АЗС № 23044','АЗС №23044','АЗС №77621','АЗС № 77621','АЗС №23374','АЗС № 23374')
group by station_name
) as a
group by station_name, amount, sum
''', connection)

In [76]:
dfcontr_azsrate_67 = pd.read_sql('''

select  station_name, a.amount as amount, round(sum(a.sum)/sum(a.amount),3) as rate
from
(
select 
    station_name,
    count (rate) as amount,
    sum (rate) as sum
from 
public.azs_rating
where  date_created >= '2022-06-01' and  date_created < '2022-08-01'
and station_name in ('АЗС № 50407','АЗС №50407','АЗС №77617','АЗС № 77617','АЗС №34450','АЗС № 34450','АЗС №50264','АЗС № 50264','АЗС №26077','АЗС № 26077','АЗС №61315','АЗС № 61315','АЗС № 34510','АЗС №34510','АЗС № 50244','АЗС №50244',
'АЗС №77607','АЗС № 77607','АЗС №61361','АЗС № 61361','АЗС № 77571','АЗС №77571','АЗС №69472','АЗС № 61326','АЗС №61326','АЗС № 34488',
'АЗС №34488','АЗС № 61297','АЗС №61297','АЗС №77620','АЗС № 77620','АЗС №36720','АЗС № 36720','АЗС №50427','АЗС № 50427','АЗС № 23275',
'АЗС №23275','АЗС № 77516','АЗС №77516','АЗС №26099','АЗС № 26099','АЗС № 77618','АЗС №77618','АЗС №23282','АЗС № 23282','АЗС №23170','АЗС № 23170',
'АЗС № 36722','АЗС №36722','АЗС №23017','АЗС № 23017','АЗС №77596','АЗС № 77596','АЗС №08546','АЗС № 08546','АЗС № 23223','АЗС №23223',
'АЗC №36730','АЗC № 36730','АЗС № 26359','АЗС №26359','АЗС № 77544','АЗС №77544','АЗС №36716','АЗС № 36716','АЗС № 61310',
'АЗС №61310','АЗС № 69471','АЗС №69471','АЗС № 23010','АЗС №23010','АЗС №23045','АЗС № 23045','АЗС №77502','АЗС № 77502','АЗС №36703','АЗС № 36703','АЗС № 77619','АЗС №77619','АЗС №23261','АЗС № 23070','АЗС №23070','АЗС №50488','АЗС № 23229','АЗС №23229')
 group by station_name
) as a
group by station_name, amount, sum

''', connection)

In [81]:
dfpilot_azsrate_67['label']=1
dfcontr_azsrate_67['label']=0
dataazs_67=pd.concat([dfpilot_azsrate_67,dfcontr_azsrate_67], axis=0, ignore_index=True)

### Azs rating 06, 07 

In [9]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
groups = sns.histplot( data = dataazs_67,
                    x = 'rate',
                    hue = 'label',
                    palette = ['r', 'b'],
                    alpha=0.5,
                    kde=True)
plt.show()

In [85]:
kstest(dfpilot_azsrate_67['rate'], 'norm')

KstestResult(statistic=0.9999994448258, pvalue=5.284937443405945e-88)

In [86]:
kstest(dfcontr_azsrate_67['rate'], 'norm')

KstestResult(statistic=0.9999989677202402, pvalue=8.093122746754876e-264)

In [87]:
res = sm.stats.ttest_ind(dfcontr_azsrate_67['rate'], dfpilot_azsrate_67['rate'])
print("Two Sample t-test")
print("t =", res[0])
print("p-value =", res[1])
print("df = ", res[2])

if res[1] < 0.05:
    print('There is a statistical significance. There is a difference between AZS rating in 2 expirements')
else: 
    print('There is no statistical significance. There is no difference between AZS rating in 2 expirements')

Two Sample t-test
t = 0.4364251518317759
p-value = 0.6642063391026083
df =  56.0


### Amount of rates 06,07

In [10]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
groups = sns.histplot( data = dataazs_67,
                    x = 'amount',
                    hue = 'label',
                    palette = ['r', 'b'],
                    alpha=0.5,
                    kde=True)
plt.show()

In [89]:
kstest(dfpilot_azsrate_67['amount'], 'norm')

KstestResult(statistic=1.0, pvalue=0.0)

In [90]:
kstest(dfcontr_azsrate_67['amount'], 'norm')

KstestResult(statistic=1.0, pvalue=0.0)

In [91]:
res = sm.stats.ttest_ind(dfcontr_azsrate_67['amount'], dfpilot_azsrate_67['amount'])
print("Two Sample t-test")
print("t =", res[0])
print("p-value =", res[1])
print("df = ", res[2])

if res[1] < 0.05:
    print('There is a statistical significance. There is a difference between amount of rates in 2 expirements')
else: 
    print('There is no statistical significance. There is no difference between amount of rates in 2 expirements')

Two Sample t-test
t = -1.8004469940467165
p-value = 0.07717652394085134
df =  56.0
