In [2]:
import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import ttest_1samp

In [3]:
# reading data from MySQL
import pymysql
from sqlalchemy import create_engine
import getpass  # To get the password without showing the input
password = getpass.getpass()


········


In [4]:
connection_string = 'mysql+pymysql://root:' + password + '@localhost/galicia_tourism'
engine = create_engine(connection_string)

In [5]:
query = '''SELECT destination, come_from AS origin ,year,month, travellers,nights,ROUND(avg_stay,2)AS avg_stay FROM gal_prov
WHERE destination != 'Spain' AND destination != 'Galicia';'''
df1 = pd.read_sql(query,engine)

In [6]:
df1

Unnamed: 0,destination,origin,year,month,travellers,nights,avg_stay
0,A Coruña,Spain,2001,Jan,41905.0,110996.0,2.65
1,A Coruña,Spain,2001,Feb,43097.0,107170.0,2.49
2,A Coruña,Spain,2001,Mar,52000.0,126981.0,2.44
3,A Coruña,Spain,2001,Apr,62697.0,155462.0,2.48
4,A Coruña,Spain,2001,May,61807.0,148981.0,2.41
...,...,...,...,...,...,...,...
3115,Pontevedra,All,2022,Apr,113910.0,232308.0,2.04
3116,Pontevedra,All,2022,May,130950.0,252678.0,1.93
3117,Pontevedra,All,2022,Jun,155563.0,337605.0,2.17
3118,Pontevedra,All,2022,Jul,225693.0,580363.0,2.57


In [7]:
query2 = '''SELECT g.accommodation_type AS accommodation, g.year, g.month, galicia_travelers AS travellers, galicia_nights AS nights, ROUND(galicia_avg_stay,2) AS avg_stay FROM spain_galicia_per_acc g
LEFT JOIN trav_nights_per_acc t ON
g.galicia_travelers = t.total_travellers AND g.accommodation_type = t.accommodation_type;'''
df2 = pd.read_sql(query2,engine)

In [8]:
df2

Unnamed: 0,accommodation,year,month,travellers,nights,avg_stay
0,Hotels,2001,Jan,112149.0,243320.0,2.20
1,Hotels,2001,Feb,131003.0,269836.0,2.10
2,Hotels,2001,Mar,153081.0,333034.0,2.20
3,Hotels,2001,Apr,192698.0,432250.0,2.20
4,Hotels,2001,May,190637.0,415208.0,2.20
...,...,...,...,...,...,...
3201,Rural Tourism,2022,Apr,5647.0,7273.0,1.29
3202,Rural Tourism,2022,May,9716.0,11792.0,1.21
3203,Rural Tourism,2022,Jun,7503.0,9626.0,1.28
3204,Rural Tourism,2022,Jul,8461.0,16004.0,1.89


In [9]:
df3 = pd.read_sql('spent_country',engine)

In [10]:
df3

Unnamed: 0,come_from,year,trimester,avg_person_day,avg_person,percentage_Galicia,percentage_Spain
0,Germany,2022,1,54.36,194.03,4.68,0.92
1,Germany,2021,4,57.08,195.54,6.19,1.31
2,Germany,2021,3,66.91,206.77,8.06,2.02
3,Germany,2021,2,75.35,171.47,5.64,1.02
4,Germany,2021,1,53.29,148.39,4.55,1.37
...,...,...,...,...,...,...,...
573,Other,2019,1,123.33,318.48,1.98,1.19
574,Other,2018,4,130.81,423.65,2.40,2.03
575,Other,2018,3,101.37,430.98,1.64,1.51
576,Other,2018,2,110.11,358.48,1.87,1.79


In [90]:
query4 = '''SELECT come_from AS 'origin', year, ROUND(AVG(avg_person_day),2) AS avg_person_day_per_year, ROUND(AVG(avg_person),2) AS avg_person_per_year FROM spent_country
GROUP BY come_from, year;'''
spent_country = pd.read_sql(query4,engine)

In [91]:
spent_country

Unnamed: 0,origin,year,avg_person_day_per_year,avg_person_per_year
0,Germany,2022,54.36,194.03
1,Germany,2021,63.16,180.54
2,Germany,2020,49.82,146.59
3,Germany,2019,91.40,301.34
4,Germany,2018,89.89,317.48
...,...,...,...,...
165,Other,2022,73.04,273.90
166,Other,2021,94.76,269.39
167,Other,2020,105.07,295.93
168,Other,2019,117.15,381.71


we select a value for alpha of 0.05 (p-value threshold, significance level)

One-sided test:

Null hypothesis or H0: tourism in Galicia is <=  than before covid (2020)

Alternative hyp or H1: tourism in Galicia is >  than before covid (2020)




I'm going to check: 
1. tourism in Galicia related to amount of travellers, meaning there are or not more travellers than before covid(2020)
2. tourism in Galicia related to amount of nights spent, meaning there are or not more nights spent than before covid(2020)
3. tourism in Galicia related to amount of money spent per person/day, meaning is this amount higher or not than before covid(2020)


# 1. tourism in Galicia related to amount of travellers, meaning there are or not more travellers than before covid(2020)

In [26]:
query1 = '''SELECT destination, come_from,year,ROUND(AVG(travellers),2) AS avg_travellers_per_year FROM gal_prov
WHERE destination != 'Spain' AND destination != 'Galicia' AND come_from = 'All'
GROUP BY destination, year, come_from;'''

provinces = pd.read_sql(query1,engine)

In [27]:
provinces

Unnamed: 0,destination,come_from,year,avg_travellers_per_year
0,A Coruña,All,2001,86054.58
1,A Coruña,All,2002,87426.00
2,A Coruña,All,2003,110291.58
3,A Coruña,All,2004,138141.50
4,A Coruña,All,2005,134659.83
...,...,...,...,...
83,Pontevedra,All,2018,120753.42
84,Pontevedra,All,2019,129828.17
85,Pontevedra,All,2020,50623.67
86,Pontevedra,All,2021,89725.67


In [28]:
query3 = '''SELECT destination, come_from,year,ROUND(AVG(travellers),2) AS avg_travellers_per_year FROM gal_prov
WHERE destination = 'Galicia' AND come_from = 'All'
GROUP BY destination, year, come_from;'''
galicia = pd.read_sql(query3, engine)

In [29]:
galicia

Unnamed: 0,destination,come_from,year,avg_travellers_per_year
0,Galicia,All,2001,218338.08
1,Galicia,All,2002,227660.58
2,Galicia,All,2003,264903.17
3,Galicia,All,2004,319926.42
4,Galicia,All,2005,306592.83
5,Galicia,All,2006,312927.67
6,Galicia,All,2007,325734.83
7,Galicia,All,2008,308036.5
8,Galicia,All,2009,298033.08
9,Galicia,All,2010,332703.0


Null hypothesis or H0: tourism in Galicia pre-covid is >  than post-covid (2020)

Alternative hyp or H1: tourism in Galicia pre-covid is <  than post-covid (2020)



In [81]:
def pre_post(df):
    pre = df[~df['year'].isin([2020,2021,2022])]
    post = df[df['year'].isin([2021,2022])]
    return pre, post

In [82]:
galicia_pre, galicia_post = pre_post(galicia)

In [77]:
galicia_pre = galicia[~galicia['year'].isin([2020,2021,2022])]
galicia_post = galicia[galicia['year'].isin([2021])]

In [78]:
galicia_post['avg_travellers_per_year'].mean()

275917.42

In [83]:
stat, pval = ttest_1samp(galicia_pre['avg_travellers_per_year'],galicia_post['avg_travellers_per_year'].mean())

In [84]:
print('stat is  ', stat)
print('pvalue for the one-tailed test is ', pval/2)

stat is   -2.6914486855780178
pvalue for the one-tailed test is  0.00746003427590564


#### --->  Galicia had more travellers post-covid than pre-covid  <---

# 2. tourism in Galicia related to amount of nights spent, meaning there are or not more nights spent than before covid(2020)

Null hypothesis or H0: tourism in Galicia pre-covid is >  than post-covid (2020)

Alternative hyp or H1: tourism in Galicia pre-covid is <  than post-covid (2020)



In [102]:
query5 = '''SELECT destination, come_from,year,ROUND(AVG(nights),2) AS avg_nights_per_year FROM gal_prov
WHERE destination = 'Galicia' AND come_from = 'All'
GROUP BY destination, year, come_from;'''
galicia_nights = pd.read_sql(query5, engine)

In [103]:
galicia_nights

Unnamed: 0,destination,come_from,year,avg_nights_per_year
0,Galicia,All,2001,506496.67
1,Galicia,All,2002,545486.17
2,Galicia,All,2003,563458.75
3,Galicia,All,2004,684873.5
4,Galicia,All,2005,661080.75
5,Galicia,All,2006,675858.08
6,Galicia,All,2007,700084.25
7,Galicia,All,2008,679548.42
8,Galicia,All,2009,651138.92
9,Galicia,All,2010,713624.5


In [104]:
galicia_pre_n, galicia_post_n = pre_post(galicia_nights)

In [108]:
galicia_post_n['avg_nights_per_year'].mean()

691675.02

In [109]:
stat, pval = ttest_1samp(galicia_pre_n['avg_nights_per_year'],galicia_post_n['avg_nights_per_year'].mean())

In [110]:
print('stat is  ', stat)
print('pvalue for the one-tailed test is ', pval/2)

stat is   -1.9944775026166222
pvalue for the one-tailed test is  0.03073519338847708


#### --->  People have spent more nights in Galicia post-covid than pre-covid  <---

# 3. tourism in Galicia related to amount of money spent per person/day, meaning  is this amount higher or not than before covid(2020)

Null hypothesis or H0: tourism in Galicia pre-covid is >  than post-covid (2020)

Alternative hyp or H1: tourism in Galicia pre-covid is <  than post-covid (2020)



In [85]:
# data = df3[df3['year'].isin([2019])]
# variable = df3[df3['year'].isin([2021])]

In [92]:
spent_pre, spent_post = pre_post(spent_country)

In [94]:
spent_post

Unnamed: 0,origin,year,avg_person_day_per_year,avg_person_per_year
0,Germany,2022,54.36,194.03
1,Germany,2021,63.16,180.54
5,Andorra,2022,82.18,304.10
6,Andorra,2021,72.51,205.71
10,Austria,2022,44.62,163.73
...,...,...,...,...
156,Japan,2021,51.94,148.69
160,Turkey,2022,53.97,197.80
161,Turkey,2021,49.29,144.24
165,Other,2022,73.04,273.90


In [98]:
stat, pval = ttest_1samp(spent_pre['avg_person_day_per_year'], spent_post['avg_person_day_per_year'].mean())

In [100]:
spent_post['avg_person_day_per_year'].mean()

65.25058823529413

In [101]:
print('stat is  ', stat)
print('pvalue for the one-tailed test is ', pval/2)

stat is   4.156959172030338
pvalue for the one-tailed test is  4.680443440019786e-05


#### --->  People from foreign countries had spent more money per person and day during pre-covid than post-covid in their trips to Galicia  <---

# I'd like to check for the Foreign travellers only now

## 1. tourism in Galicia related to amount of FOREIGN travellers, meaning there are or not more travellers than before covid(2020)

In [111]:
query6 = '''SELECT destination, come_from,year,ROUND(AVG(travellers),2) AS avg_travellers_per_year FROM gal_prov
WHERE destination = 'Galicia' AND come_from = 'Foreign'
GROUP BY destination, year, come_from;'''
galicia_F = pd.read_sql(query6, engine)

In [112]:
galicia_F

Unnamed: 0,destination,come_from,year,avg_travellers_per_year
0,Galicia,Foreign,2001,38663.25
1,Galicia,Foreign,2002,40326.33
2,Galicia,Foreign,2003,45202.0
3,Galicia,Foreign,2004,50003.5
4,Galicia,Foreign,2005,53273.17
5,Galicia,Foreign,2006,55870.58
6,Galicia,Foreign,2007,62560.17
7,Galicia,Foreign,2008,59324.58
8,Galicia,Foreign,2009,55095.25
9,Galicia,Foreign,2010,63163.33


Null hypothesis or H0: tourism in Galicia pre-covid is >  than post-covid (2020)

Alternative hyp or H1: tourism in Galicia pre-covid is <  than post-covid (2020)



In [113]:
galicia_F_pre, galicia_F_post = pre_post(galicia_F)

In [114]:
galicia_F_post['avg_travellers_per_year'].mean()

75298.14499999999

In [137]:
galicia_F_pre['avg_travellers_per_year'].mean()

70297.86789473686

In [115]:
stat, pval = ttest_1samp(galicia_pre['avg_travellers_per_year'],galicia_post['avg_travellers_per_year'].mean())

In [116]:
print('stat is  ', stat)
print('pvalue for the one-tailed test is ', pval/2)

stat is   -2.6914486855780178
pvalue for the one-tailed test is  0.00746003427590564


#### --->  Galicia had also more Foreign travellers post-covid than pre-covid  <---

## 2. tourism in Galicia related to amount of nights spentof FOREIGN travellers, meaning there are or not more nights spent than before covid(2020)

In [117]:
query7 = '''SELECT destination, come_from,year,ROUND(AVG(nights),2) AS avg_nights_per_year FROM gal_prov
WHERE destination = 'Galicia' AND come_from = 'Foreign'
GROUP BY destination, year, come_from;'''
galicia_F_nights = pd.read_sql(query7, engine)

In [118]:
galicia_F_nights

Unnamed: 0,destination,come_from,year,avg_nights_per_year
0,Galicia,Foreign,2001,75503.75
1,Galicia,Foreign,2002,81615.92
2,Galicia,Foreign,2003,85108.25
3,Galicia,Foreign,2004,97071.33
4,Galicia,Foreign,2005,102447.42
5,Galicia,Foreign,2006,105550.92
6,Galicia,Foreign,2007,121875.83
7,Galicia,Foreign,2008,116158.67
8,Galicia,Foreign,2009,103907.17
9,Galicia,Foreign,2010,117638.0


Null hypothesis or H0: tourism in Galicia pre-covid is >=  than post-covid (2020)

Alternative hyp or H1: tourism in Galicia pre-covid is <  than post-covid (2020)



In [119]:
galicia_F_pre_n, galicia_F_post_n = pre_post(galicia_F_nights)

In [136]:
galicia_F_pre_n.mean()

  galicia_F_pre_n.mean()


year                     2010.000000
avg_nights_per_year    124685.803684
dtype: float64

In [120]:
galicia_F_post_n['avg_nights_per_year'].mean()

129897.01999999999

In [131]:
galicia_F_pre_n['avg_nights_per_year'].mean()

124685.8036842105

In [121]:
stat, pval = ttest_1samp(galicia_F_pre_n['avg_nights_per_year'],galicia_F_post_n['avg_nights_per_year'].mean())

In [122]:
print('stat is  ', stat)
print('pvalue for the one-tailed test is ', pval/2)

stat is   -0.6451109630156628
pvalue for the one-tailed test is  0.26349657084646705


#### --->  In this case, we can see that there is no significant difference between the nights spent pre-covid and post-covid  <---
#### so I'm going to check the average stay

In [125]:
query8 = '''SELECT destination, come_from,year,ROUND(AVG(avg_stay),2) AS avg_stay_per_year FROM gal_prov
WHERE destination = 'Galicia' AND come_from = 'Foreign'
GROUP BY destination, year, come_from;'''
galicia_F_stay = pd.read_sql(query8, engine)

In [126]:
galicia_F_stay

Unnamed: 0,destination,come_from,year,avg_stay_per_year
0,Galicia,Foreign,2001,1.94
1,Galicia,Foreign,2002,2.03
2,Galicia,Foreign,2003,1.92
3,Galicia,Foreign,2004,1.96
4,Galicia,Foreign,2005,1.94
5,Galicia,Foreign,2006,1.92
6,Galicia,Foreign,2007,2.03
7,Galicia,Foreign,2008,2.02
8,Galicia,Foreign,2009,1.98
9,Galicia,Foreign,2010,1.98


In [127]:
galicia_F_pre_s, galicia_F_post_s = pre_post(galicia_F_stay)

In [128]:
galicia_F_post_s['avg_stay_per_year'].mean()

2.025

In [132]:
galicia_F_pre_s['avg_stay_per_year'].mean()

1.8910526315789478

In [129]:
stat, pval = ttest_1samp(galicia_F_pre_s['avg_stay_per_year'],galicia_F_post_s['avg_stay_per_year'].mean())

In [130]:
print('stat is  ', stat)
print('pvalue for the one-tailed test is ', pval/2)

stat is   -5.2341195643104585
pvalue for the one-tailed test is  2.8045471054816013e-05


#### --->  In this case, the data tells us that the average stay per person per year is higher post-covid than pre-covid  <---
