#**Final project**
- Group 26    : Kyle Crosby (acrosby2), Leyang Li (lli27)
- Topic       : World Tourism Data vs World Development Indicators
- Goal        : Provide plentiful information for user to make informed decisions about travel plans
- Main Data source : http://data.un.org/Explorer.aspx
- Project Website : https://leoreoreo.github.io/EGcomp-Final-Project-Web/

In [1]:
# Setup
!pip install pycountry

import requests, sys, json, csv, math, pycountry
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.io as pio
def get_country_code(country_name):
    try:
        return pycountry.countries.search_fuzzy(country_name)[0].alpha_3
    except LookupError:
        return None

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycountry
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: pycountry
  Building wheel for pycountry (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pycountry: filename=pycountry-22.3.5-py2.py3-none-any.whl size=10681847 sha256=9a260f5836444ca3f4e993a0813843c6b27f85ed2fb802eb9303f473a6d4dd0c
  Stored in directory: /root/.cache/pip/wheels/03/57/cc/290c5252ec97a6d78d36479a3c5e5ecc76318afcb241ad9dbe
Successfully built pycountry
Installing collected packages: pycountry
Successfully installed pycountry-22.3.5


**Insight 1**
- Visualizing world tourism using bed spaces density (bed spaces / land area).
- This can provide insight to how suitable the country is to travel.
- Data sources:
- - http://data.un.org/DocumentData.aspx?id=482
- - https://data.worldbank.org/indicator/AG.LND.TOTL.K2?end=2020&start=2020&view=chart

In [7]:
# read populations
pop = open('API_AG.LND.TOTL.K2_DS2_en_csv_v2_5174925.csv')
pop_reader = csv.reader(pop)
pop_data = list(pop_reader)[5:]
pop.close()

popdic = {}   
for r in pop_data:
  cur = []
  n = r[39]
  if n == '':   # clean data
    cur.append(None)
  else:
    cur.append(float(n))
  for i in range(40, 66):   # get population of intended years
    n = r[i]
    if n == '':
      cur.append(r[i-1])    # assume the population stays the same as last year
    else:
      cur.append(float(n)) 

  popdic[r[1]] = cur
#print(popdic)

{'ABW': [180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, 180.0, '180'], 'AFE': [14571611.0, 14571611.0, 14571611.0, 14571611.0, 14571611.0, 14571611.0, 14571611.0, 14571611.0, 14571611.0, 14720190.922, 14720236.092, 14720228.252, 14720274.665, 14720236.89, 14720269.284, 14720957.055, 14721240.05, 14845170.142, 14845132.326, 14845085.467, 14845139.128, 14845146.968, 14845135.053, 14845150.143, 14845162.404, 14845123.094, '14845123.094'], 'AFG': [652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, 652230.0, '652230'], 'AFW': [9045780.0, 9045780.0, 9045780.0, 9045780.0, 9045780.0, 9045780.0, 9045780.0, 9045780.0, 9045780.0, 9045780.0, 9045780.0, 9045780.0, 9045780.0, 90

In [8]:
# read from csv file Tourism Industries.csv
tour_indus = open('Tourism Industries.csv')
tour_indus_reader = csv.reader(tour_indus)
tour_indus_data = list(tour_indus_reader)
tour_indus.close()

countries = []
bed_places = []
years = []
count_code = []
pop = []
for i in range(1, len(tour_indus_data) - 3):  # doesn't need the last 3 rows
  if tour_indus_data[i][0] != '':             # starting row of one country
    cur1 = tour_indus_data[i][0]              # get this country's name
    cur2 = get_country_code(cur1)             # get country code
    for j in range(1995, 2022):   # add rows of year 1995-2021
      countries.append(cur1)        # add country name
      count_code.append(cur2)       # add country code
      if cur2 in popdic: pop.append(popdic[cur2][j - 1995])   # add population
      else: pop.append(None)
  if tour_indus_data[i][2] == 'Number of bed-places':   # row of the country's bed place
    bed_places.append(tour_indus_data[i][7: -1])        # add data of intended years

bed = []    # transform bed datas from 2D list to 1D list, and clean data
for i in range(len(bed_places)):
  for j in range(len(bed_places[i])):
    cur = None if bed_places[i][j] == '..' else int(bed_places[i][j].replace(',', ''))
    bed.append(cur)               # get bed places list
for i in range(len(bed)):         # calculate bed places density
  if bed[i] and pop[i]:
    bed[i] /= float(pop[i])
  else:
    bed[i] = None
for i in range(len(countries) // (2022 - 1995)):      # get years list
  for j in range(1995, 2022):
    years.append(j)

df1 = pd.DataFrame(countries, columns = ['Countries']) # create dataframe
df1['Years'] = years
df1['Density of bed places'] = bed
df1['Country code'] = count_code
df1['Population'] = pop
#print(df1)

In [9]:
# draw choropleth
fig = px.choropleth(df1, locations='Country code', color='Density of bed places',
                    hover_name='Countries', animation_frame='Years', range_color = [0, 0.2])
fig.show()

In [10]:
fh = open('data_viz_div_1.html', 'w') # this just creates a blank file, so that the write_html() function can write to this file later
fh.close()
pio.write_html(fig, file='data_viz_div_1.html', auto_open=True, full_html=False)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

**Insight 2**
- Comparing cost of business-startup to domestic tourism industries.
- This can provide insight into how easily residents of a country can build tourism-related businesses. 
- Data sources: 
-- http://data.un.org/Data.aspx?d=WDI&f=Indicator_Code%3aIC.REG.COST.PC.ZS
-- http://data.un.org/DocumentData.aspx?id=481 

In [11]:
# read from csv file Inbound Tourism.csv
inb_tour = open('Inbound Tourism.csv')
inb_tour_reader = csv.reader(inb_tour)
inb_tour_data = list(inb_tour_reader)
inb_tour.close()

countries = []
arrivals = []
years = []
#count_code = []

for i in range(1, len(inb_tour_data) - 9):
  if inb_tour_data[i][0] != '':
    cur1 = inb_tour_data[i][0]
    #cur2 = get_country_code(cur1)
    for j in range(2003, 2020):
      countries.append(cur1)      # get country list
      #count_code.append(cur2)
      years.append(j)
  if inb_tour_data[i][2] == 'Total arrivals':
    for num in inb_tour_data[i][16: 33]:
      arrivals.append(None) if num == '..' else arrivals.append(float(num.replace(',', '')))

df2 = pd.DataFrame(countries, columns = ['Countries']) # create dataframe
df2['Years'] = years
df2['Total arrivals'] = arrivals
#df2['Country_codes'] = count_code
print(df2)

        Countries  Years  Total arrivals
0     AFGHANISTAN   2003             NaN
1     AFGHANISTAN   2004             NaN
2     AFGHANISTAN   2005             NaN
3     AFGHANISTAN   2006             NaN
4     AFGHANISTAN   2007             NaN
...           ...    ...             ...
3786     ZIMBABWE   2015          2057.0
3787     ZIMBABWE   2016          2168.0
3788     ZIMBABWE   2017          2423.0
3789     ZIMBABWE   2018          2580.0
3790     ZIMBABWE   2019          2294.0

[3791 rows x 3 columns]


In [12]:
# read from csv file UNdata_Export_20230404_195857867.csv
startup_cost = open('UNdata_Export_20230404_195857867.csv')
startup_cost_reader = csv.reader(startup_cost)
startup_cost_data = list(startup_cost_reader)
startup_cost.close()

countries = []
years = []
cost = []

for i in range(1, len(startup_cost_data)):
  if (len(startup_cost_data[i]) < 3): continue
  countries.append(startup_cost_data[i][0].upper())
  years.append(int(startup_cost_data[i][1]))
  cost.append(float(startup_cost_data[i][2]))

for i in range(len(countries)): # add column
    df2.loc[(df2['Countries'] == countries[i]) & (df2['Years'] == years[i]), 'Business-startup Cost'] = cost[i]
#print(df2)

In [13]:
fig = px.scatter(df2, x='Business-startup Cost', y='Total arrivals', log_x = True, log_y = True,
                  hover_name='Countries', animation_frame='Years')
fig.show()

In [None]:
fh = open('data_viz_div_2.html', 'w') # this just creates a blank file, so that the write_html() function can write to this file later
fh.close()
pio.write_html(fig, file='data_viz_div_2.html', auto_open=True, full_html=False)

**Insight 3**
- Comparing outgoing and incoming tourism to air transit passenger capacity.
- This can provide insight into how much plane travel affects tourism rates of countries. 
- Data source: 
- - http://data.un.org/Data.aspx?d=WDI&f=Indicator_Code%3aIS.AIR.PSGR
- - http://data.un.org/DocumentData.aspx?id=481
- - http://data.un.org/DocumentData.aspx?id=458 

In [None]:
# read from csv file Outbound Tourism.csv
outb_tour = open('Outbound Tourism.csv')
outb_tour_reader = csv.reader(outb_tour)
outb_tour_data = list(outb_tour_reader)
outb_tour.close()

countries = []
departures = []
years = []
#count_code = []

for i in range(1, len(outb_tour_data) - 9):
  if outb_tour_data[i][0] != '':
    cur1 = outb_tour_data[i][0]
    #cur2 = get_country_code(cur1)
    for j in range(2003, 2021):
      countries.append(cur1)      # get country list
      #count_code.append(cur2)
      years.append(j)
  if outb_tour_data[i][2] == 'Total departures':
    for num in outb_tour_data[i][16: 34]:
      departures.append(None) if num == '..' else departures.append(float(num.replace(',', '')))

df3 = pd.DataFrame(countries, columns = ['Countries']) # create dataframe
df3['Years'] = years
df3['Total departures'] = departures
#df3['Country_codes'] = count_code
#print(df3)
#print(departures)

In [None]:
# read from csv file Inbound Tourism.csv
inb_tour = open('Inbound Tourism.csv')
inb_tour_reader = csv.reader(inb_tour)
inb_tour_data = list(inb_tour_reader)
inb_tour.close()

countries = []
arrivals = []
years = []
#count_code = []

for i in range(1, len(inb_tour_data) - 9):
  if inb_tour_data[i][2] == 'Total arrivals':
    for num in inb_tour_data[i][16: 34]:
      arrivals.append(None) if num == '..' else arrivals.append(float(num.replace(',', '')))

df3['Total arrivals'] = arrivals
print(df3)

        Countries  Years  Total departures  Total arrivals
0     AFGHANISTAN   2003               NaN             NaN
1     AFGHANISTAN   2004               NaN             NaN
2     AFGHANISTAN   2005               NaN             NaN
3     AFGHANISTAN   2006               NaN             NaN
4     AFGHANISTAN   2007               NaN             NaN
...           ...    ...               ...             ...
4009     ZIMBABWE   2016               NaN          2168.0
4010     ZIMBABWE   2017               NaN          2423.0
4011     ZIMBABWE   2018               NaN          2580.0
4012     ZIMBABWE   2019               NaN          2294.0
4013     ZIMBABWE   2020               NaN           639.0

[4014 rows x 4 columns]


In [None]:
# read from csv file UNdata_Export_20230405_015906295.csv
air_passenger = open('UNdata_Export_20230405_015906295.csv')
air_passenger_reader = csv.reader(air_passenger)
air_passenger_data = list(air_passenger_reader)
air_passenger.close()

countries = []
years = []
passenger = []

for i in range(1, len(air_passenger_data) - 18):
  if (len(air_passenger_data[i]) < 3): break
  countries.append(air_passenger_data[i][0].upper())
  years.append(int(air_passenger_data[i][1]))
  if air_passenger_data[i][2] != '':
    passenger.append(float(air_passenger_data[i][2]))
  else:
    passenger.append(None)

for i in range(len(countries)):   # add column
    df3.loc[(df3['Countries'] == countries[i]) & (df3['Years'] == years[i]), 'Air Transit Passenger'] = passenger[i]
#print(df3)

In [None]:
fig_arrivals = px.scatter(df3, x="Air Transit Passenger", y="Total arrivals", 
                            animation_frame="Years", hover_name="Countries", log_x = True, log_y = True,
                            title="Total arrivals vs air transit passengers, animated by year")
fig_departures = px.scatter(df3, x="Air Transit Passenger", y="Total departures", 
                            animation_frame="Years", hover_name="Countries", log_x = True, log_y = True,
                            title="Total departures vs air transit passengers, animated by year")
fig_arrivals.show()
fig_departures.show()

In [None]:
fh = open('data_viz_div_3_1.html', 'w') # this just creates a blank file, so that the write_html() function can write to this file later
fh.close()
pio.write_html(fig_arrivals, file='data_viz_div_3_1.html', auto_open=True, full_html=False)
fh1 = open('data_viz_div_3_2.html', 'w') # this just creates a blank file, so that the write_html() function can write to this file later
fh1.close()
pio.write_html(fig_departures, file='data_viz_div_3_2.html', auto_open=True, full_html=False)

**Insight 4**
- Comparing ingoing tourism to Gender Development Index. 
- This will provide insight into how willing people are to visit a country depending on the country's gender equality. 
- Data sources: 
-- http://data.un.org/Explorer.aspx?d=19&f=docID:477
-- http://data.un.org/DocumentData.aspx?id=481
-- http://data.un.org/DocumentData.aspx?id=483 

In [None]:
# Insight 4

import csv
import pandas as pd
# read from csv file Inbound Tourism.csv
inb_tour = open('Inbound Tourism.csv')
inb_tour_reader = csv.reader(inb_tour)
inb_tour_data = list(inb_tour_reader)
inb_tour.close()

countries = []
arrivals = []
years = []
#count_code = []

for i in range(1, len(inb_tour_data) - 9):
  if inb_tour_data[i][2] == 'Total arrivals':
    for num in inb_tour_data[i][16: 35]:
      arrivals.append(None) if num == '..' else arrivals.append(float(num.replace(',', '')))

df4 = pd.DataFrame(countries, columns = ['Countries']) # create dataframe
df4['Years'] = years
df4['Total arrivals'] = arrivals
print(df4)

     Countries  Years  Total arrivals
0          NaN    NaN             NaN
1          NaN    NaN             NaN
2          NaN    NaN             NaN
3          NaN    NaN             NaN
4          NaN    NaN             NaN
...        ...    ...             ...
4232       NaN    NaN          2423.0
4233       NaN    NaN          2580.0
4234       NaN    NaN          2294.0
4235       NaN    NaN           639.0
4236       NaN    NaN           381.0

[4237 rows x 3 columns]


In [None]:
# Insight 4 
# Inbound Tourism data collection/cleaning
import csv
import pandas as pd

inbound_csv = open('Inbound Tourism.csv')
inbound_reader = csv.reader(inbound_csv)
inbound_data = list(inbound_reader)
# 2021 is column 34 (index number)
print(inbound_data[2][0])
last_country_name = ''
inbound_dict = {}
#for i in range(8, 35):
  #print(inbound_data[0][i])
for j in range(1, len(inbound_data)):
  row = inbound_data[j]
  # Get last country name
  if row[0] != '':
    last_country_name = row[0]
  # Find total arrivals row and get value if there is one. 
  if row[2] == 'Total arrivals' and row[34][0].isdigit():
    num = row[34]
    new_num = ''
    for i in list(num):
      if i.isdigit():
        new_num += i
    inbound_dict[last_country_name] = int(new_num)

#print(inbound_dict)

countries_list = list(inbound_dict.keys())
inbound_country_codes = []
for i in countries_list:
  s = i
  s = s.lower()
  s = s.capitalize()
  code = get_country_code(s)
  inbound_country_codes.append(code)

df4 = pd.DataFrame(countries_list, columns = ['Countries'])
arrivals_list = []
for i in countries_list:
  arrivals_list.append(inbound_dict[i])
df4['Total Arrivals'] = arrivals_list
df4['Country Codes'] = inbound_country_codes
print(df4)


                        Countries  Total Arrivals Country Codes
0                         ALBANIA            5689           ALB
1                         ALGERIA             125           DZA
2                         ANDORRA            5422           AND
3                        ANGUILLA              29           AIA
4             ANTIGUA AND BARBUDA             250           ATG
..                            ...             ...           ...
105  UNITED STATES VIRGIN ISLANDS            1070          None
106                       URUGUAY             234           URY
107                    UZBEKISTAN            1881           UZB
108                      VIET NAM             157           VNM
109                      ZIMBABWE             381           ZWE

[110 rows x 3 columns]


In [None]:
# Get Inbound Tourism with years

import csv
import pandas as pd

inbound_csv = open('Inbound Tourism.csv')
inbound_reader = csv.reader(inbound_csv)
inbound_data = list(inbound_reader)
# 2021 is column 34 (index number)
print(inbound_data[2][0])
last_country_name = ''
inbound_dict = {}
#for i in range(8, 35):
  #print(inbound_data[0][i])
for j in range(1, len(inbound_data)):
  row = inbound_data[j]
  # Get last country name
  if row[0] != '':
    last_country_name = row[0]
  # Find total arrivals row and get value if there is one. 
  if row[2] == 'Total arrivals':
    # Loop through all of the years
    year = 1995
    for k in range(8, 35):
      if row[k][0].isdigit():
        num = row[k]
        new_num = ''
        for i in list(num):
          if i.isdigit():
            new_num += i
        if last_country_name not in list(inbound_dict.keys()):
          inbound_dict[last_country_name] = []
          inbound_dict[last_country_name].append([year, int(new_num)])
        else:
          inbound_dict[last_country_name].append([year, int(new_num)])
      year += 1

#print(inbound_dict)

countries_list = list(inbound_dict.keys())
#df4 = pd.DataFrame(countries_list, columns = ['Countries'])
data_countries_list = []
inbound_country_codes = []
arrivals_list = []
years_list = []
for i in countries_list:
  for j in inbound_dict[i]:
    data_countries_list.append(i)
    arrivals_list.append(j[1])
    years_list.append(j[0])
    # Get country code for choropleth
    s = i
    s = s.lower()
    s = s.capitalize()
    code = get_country_code(s)
    inbound_country_codes.append(s)

df4 = pd.DataFrame(data_countries_list, columns = ['Countries'])
df4['Year'] = years_list
df4['Total Arrivals (thousands)'] = arrivals_list
df4['Country Code'] = inbound_country_codes
print(df4)


     Countries  Year  Total Arrivals (thousands) Country Code
0      ALBANIA  1995                         304      Albania
1      ALBANIA  1996                         287      Albania
2      ALBANIA  1997                         119      Albania
3      ALBANIA  1998                         184      Albania
4      ALBANIA  1999                         371      Albania
...        ...   ...                         ...          ...
3507  ZIMBABWE  2017                        2423     Zimbabwe
3508  ZIMBABWE  2018                        2580     Zimbabwe
3509  ZIMBABWE  2019                        2294     Zimbabwe
3510  ZIMBABWE  2020                         639     Zimbabwe
3511  ZIMBABWE  2021                         381     Zimbabwe

[3512 rows x 4 columns]


In [None]:
# Get gender development data
import csv

gender_csv = open('Gender Development Index.csv')
gender_reader = csv.reader(gender_csv)
gender_data = list(gender_reader)
gender_csv.close()

gender_dev_dict = {}

for i in range(5, len(gender_data)):
  row = gender_data[i]
  if row[2] != '' and row[2] != '..':
    gender_dev_dict[row[1].upper()] = float(row[2])

#print(gender_dev_dict)

gender_list = []
#print(countries_list)
gender_countries_list = []
for i in list(gender_dev_dict.keys()):
  gender_countries_list.append(i.upper())
#print(gender_countries_list)
for i in countries_list:
  if i in gender_countries_list:
    gender_list.append(gender_dev_dict[i])
  else:
    gender_list.append(0)

df4['Gender Dev'] = gender_list
print(df4)

                        Countries  Total Arrivals Country Codes  Gender Dev
0                         ALBANIA            5689           ALB       1.007
1                         ALGERIA             125           DZA       0.880
2                         ANDORRA            5422           AND       0.000
3                        ANGUILLA              29           AIA       0.000
4             ANTIGUA AND BARBUDA             250           ATG       0.000
..                            ...             ...           ...         ...
105  UNITED STATES VIRGIN ISLANDS            1070          None       0.000
106                       URUGUAY             234           URY       1.022
107                    UZBEKISTAN            1881           UZB       0.944
108                      VIET NAM             157           VNM       1.002
109                      ZIMBABWE             381           ZWE       0.961

[110 rows x 4 columns]


In [None]:
# Visualization for Insight 4
gender_fig = px.scatter(df4, x='Gender Dev', y='Total Arrivals', log_x = True, log_y = True,
                  hover_name='Countries')
gender_fig.show()

In [None]:
fh = open('data_viz_div_4.html', 'w') # this just creates a blank file, so that the write_html() function can write to this file later
fh.close()
pio.write_html(gender_fig, file='data_viz_div_4.html', auto_open=True, full_html=False)

# New Section

**Insight 5**
- Comparing incoming tourism to AIDS-related deaths.
- This will provide insight into how much the AIDS epidemic deters tourists from visiting countries. 
- Data source: 
-- http://data.un.org/Explorer.aspx?d=19&f=docID:477
-- http://data.un.org/DocumentData.aspx?id=481
-- http://data.un.org/Data.aspx?d=UNAIDS&f=inID%3a33 

In [None]:
# Get inbound tourism data between years 1995 and 2014 (constrained by available AIDS data)

import csv
import pandas as pd

inbound_csv = open('Inbound Tourism.csv')
inbound_reader = csv.reader(inbound_csv)
inbound_data = list(inbound_reader)
# 2021 is column 34 (index number)
print(inbound_data[2][0])
last_country_name = ''
inbound_dict = {}
#for i in range(8, 35):
  #print(inbound_data[0][i])
for j in range(1, len(inbound_data)):
  row = inbound_data[j]
  # Get last country name
  if row[0] != '':
    last_country_name = row[0]
  # Find total arrivals row and get value if there is one. 
  if row[2] == 'Total arrivals':
    # Loop through all of the years
    year = 1995
    for k in range(8, 28):
      if row[k][0].isdigit():
        num = row[k]
        new_num = ''
        for i in list(num):
          if i.isdigit():
            new_num += i
        if last_country_name not in list(inbound_dict.keys()):
          inbound_dict[last_country_name] = []
          inbound_dict[last_country_name].append([year, int(new_num)])
        else:
          inbound_dict[last_country_name].append([year, int(new_num)])
      year += 1

#print(inbound_dict)

countries_list = list(inbound_dict.keys())
#df4 = pd.DataFrame(countries_list, columns = ['Countries'])
data_countries_list = []
arrivals_list = []
years_list = []
for i in countries_list:
  for j in inbound_dict[i]:
    data_countries_list.append(i)
    arrivals_list.append(j[1])
    years_list.append(j[0])

df4 = pd.DataFrame(data_countries_list, columns = ['Countries'])
df4['Year'] = years_list
df4['Total Arrivals (thousands)'] = arrivals_list
print(df4)


     Countries  Year  Total Arrivals (thousands)
0      ALBANIA  1995                         304
1      ALBANIA  1996                         287
2      ALBANIA  1997                         119
3      ALBANIA  1998                         184
4      ALBANIA  1999                         371
...        ...   ...                         ...
2595  ZIMBABWE  2010                        2239
2596  ZIMBABWE  2011                        2423
2597  ZIMBABWE  2012                        1794
2598  ZIMBABWE  2013                        1833
2599  ZIMBABWE  2014                        1880

[2600 rows x 3 columns]


In [None]:
from ast import Index
# Insight 5
# Collecting AIDS-related deaths into Data Frame
import csv

aids_csv = open('AIDS_UN_DATA.csv')
aids_reader = csv.reader(aids_csv)
aids_data = list(aids_reader)
aids_csv.close()

count = 0
aids_dict = {}
for i in aids_data:
  if count < len(aids_data)-1:
    if i[1] == 'All ages estimate':
      if int(i[2]) >= 1995 and int(i[2]) <= 2014:
        if i[0].upper() not in list(aids_dict.keys()):
          aids_dict[i[0].upper()] = []
          aids_dict[i[0].upper()].append([int(i[2]), i[5]])
        else:
          aids_dict[i[0].upper()].append([int(i[2]), i[5]])
  count += 1

aids_countries_list = []
aids_years_list = []
aids_nums_list = []
aids_arrivals_list = []
aids_country_codes = []

for i in list(aids_dict.keys()):
  for j in aids_dict[i]:
    if i in list(inbound_dict.keys()):
      for k in inbound_dict[i]:
        if k[0] == j[0]:
          aids_countries_list.append(i)
          aids_years_list.append(int(j[0]))
          aids_nums_list.append(int(j[1]))
          aids_arrivals_list.append(int(k[1]))
          # Get country codes
          s = i
          s = i.lower()
          s = i.capitalize()
          code = get_country_code(s)
          aids_country_codes.append(code)


df5 = pd.DataFrame(aids_countries_list, columns = ['Countries'])
df5['Years'] = aids_years_list
df5['AIDS-related Deaths'] = aids_nums_list
df5['Total Arrivals (thousands)'] = aids_arrivals_list
df5['Country Codes'] = aids_country_codes

print(df5)


     Countries  Years  AIDS-related Deaths  Total Arrivals (thousands)  \
0      ALGERIA   2014                  186                        2301   
1      ALGERIA   2013                  172                        2733   
2      ALGERIA   2012                  166                        2634   
3      ALGERIA   2011                  141                        2395   
4      ALGERIA   2010                  127                        2070   
...        ...    ...                  ...                         ...   
1010  ZIMBABWE   1999               112572                        2250   
1011  ZIMBABWE   1998               100470                        2090   
1012  ZIMBABWE   1997                87061                        1336   
1013  ZIMBABWE   1996                73183                        1597   
1014  ZIMBABWE   1995                59601                        1416   

     Country Codes  
0              DZA  
1              DZA  
2              DZA  
3              DZA  
4     

In [None]:
# Create visualization for Insight 5

# Scatter plot
aids_scatter_fig = px.scatter(df5, x='AIDS-related Deaths', y='Total Arrivals (thousands)',
                              animation_frame='Years', hover_name='Countries', log_x=True, log_y=True,
                              title='Total AIDS-related Deaths vs Inbound Tourists, animated by year')
aids_scatter_fig.show()

# Choropleth graph
aids_choro_fig = px.choropleth(df5, locations='Country Codes', color='AIDS-related Deaths', hover_name='Countries',
                              animation_frame='Years', title='AIDS-related Deaths globally', hover_data=['Total Arrivals (thousands)'])
aids_choro_fig.show()

aids_choro_fig2 = px.choropleth(df5, locations='Country Codes', color='Total Arrivals (thousands)', hover_name='Countries',
                              animation_frame='Years', title='Inbound Tourism by Total Arrivals globally', hover_data=['AIDS-related Deaths'])
aids_choro_fig2.show()

In [None]:
fh = open('data_viz_div_5_1.html', 'w') # this just creates a blank file, so that the write_html() function can write to this file later
fh.close()
pio.write_html(aids_scatter_fig, file='data_viz_div_5_1.html', auto_open=True, full_html=False)

fh = open('data_viz_div_5_2.html', 'w') # this just creates a blank file, so that the write_html() function can write to this file later
fh.close()
pio.write_html(aids_choro_fig, file='data_viz_div_5_2.html', auto_open=True, full_html=False)

fh = open('data_viz_div_5_3.html', 'w') # this just creates a blank file, so that the write_html() function can write to this file later
fh.close()
pio.write_html(aids_choro_fig2, file='data_viz_div_5_3.html', auto_open=True, full_html=False)

**Insight 6**
- Comparing outgoing tourism and Human Development Index.
- This will provide insight as to how countries with different levels of development are able to support their citizens traveling internationally.
- Data sources:
-- http://data.un.org/Explorer.aspx?d=19&f=docID:477
-- http://data.un.org/DocumentData.aspx?id=458, 

In [None]:
# Insight 6
# Get outbound tourism data

import csv
import pandas as pd

outbound_csv = open('Outbound Tourism.csv')
outbound_reader = csv.reader(outbound_csv)
outbound_data = list(outbound_reader)

outbound_dict = {}
outbound_countries_list = []
last_country_name = ''
for i in range(1, len(outbound_data)):
  row = outbound_data[i]
  if row[0] != '':
    last_country_name = row[0]
    outbound_countries_list.append(row[0])
  # Find total departures and get value if there is one
  if row[2] == 'Total departures':
    if row[34][0].isdigit():
      num = row[34]
      new_num = ''
      for j in num:
        if j.isdigit():
          new_num += j
      outbound_dict[last_country_name] = int(new_num)

print(outbound_dict)
print(len(list(outbound_dict.keys())))
    

{'ALBANIA': 4191, 'ARGENTINA': 1074, 'AUSTRALIA': 301, 'AZERBAIJAN': 975, 'BELARUS': 2616, 'BELGIUM': 10890, 'BULGARIA': 5182, 'CANADA': 6954, 'CHAD': 14, 'CHILE': 519, 'CROATIA': 907, 'CYPRUS': 514, 'EL SALVADOR': 1093, 'FINLAND': 2270, 'FRANCE': 22007, 'GUATEMALA': 437, 'HONG KONG, CHINA': 905, 'HUNGARY': 13258, 'INDONESIA': 1711, 'IRAN, ISLAMIC REPUBLIC OF': 2626, 'IRELAND': 2276, 'ITALY': 17588, 'JAPAN': 512, 'KAZAKHSTAN': 3501, 'KOREA, REPUBLIC OF': 1223, 'KYRGYZSTAN': 1808, 'LATVIA': 629, 'LITHUANIA': 1743, 'MACAO, CHINA': 148, 'MALTA': 344, 'MEXICO': 32836, 'MOLDOVA, REPUBLIC OF': 218, 'OMAN': 1949, 'PARAGUAY': 476, 'ROMANIA': 11643, 'SAUDI ARABIA': 10678, 'SLOVAKIA': 1562, 'SLOVENIA': 3032, 'SPAIN': 8538, 'SWEDEN': 7615, 'SWITZERLAND': 10164, 'TAIWAN PROVINCE OF CHINA': 360, 'TANZANIA, UNITED REPUBLIC OF': 371, 'UKRAINE': 14726, 'UNITED ARAB EMIRATES': 7454, 'UNITED KINGDOM': 19142}
46


In [None]:
# Get HDI data

hdi_csv = open('HDI_DATA.csv')
hdi_reader = csv.reader(hdi_csv)
hdi_data = list(hdi_reader)

hdi_dict = {}

for i in range(6, 205):
  row = hdi_data[i]
  if row[2] != '':
    if row[2][0].isdigit():
      hdi_dict[row[1].upper()] = float(row[2])

print(hdi_dict)

hdi_nums_list = []
hdi_countries_list = []
hdi_country_codes = []
hdi_outbound_list = []

for i in list(hdi_dict.keys()):
  if i in list(outbound_dict.keys()):
    hdi_nums_list.append(hdi_dict[i])
    hdi_countries_list.append(i)
    hdi_outbound_list.append(outbound_dict[i])
    # Get country codes
    s = i
    s = s.lower()
    s = s.capitalize()
    code = get_country_code(s)
    hdi_country_codes.append(code)

df6 = pd.DataFrame(hdi_countries_list, columns=['Countries'])
df6['HDI'] = hdi_nums_list
df6['Total Departures (thousands)'] = hdi_outbound_list
df6['Country Codes'] = hdi_country_codes

print(df6)

{'SWITZERLAND': 0.962, 'NORWAY': 0.961, 'ICELAND': 0.959, 'HONG KONG, CHINA': 0.952, 'AUSTRALIA': 0.951, 'DENMARK': 0.948, 'SWEDEN': 0.947, 'IRELAND': 0.945, 'GERMANY': 0.942, 'NETHERLANDS': 0.941, 'FINLAND': 0.94, 'SINGAPORE': 0.939, 'BELGIUM': 0.937, 'NEW ZEALAND': 0.937, 'CANADA': 0.936, 'LIECHTENSTEIN': 0.935, 'LUXEMBOURG': 0.93, 'UNITED KINGDOM': 0.929, 'JAPAN': 0.925, 'SOUTH KOREA': 0.925, 'UNITED STATES OF AMERICA': 0.921, 'ISRAEL': 0.919, 'MALTA': 0.918, 'SLOVENIA': 0.918, 'AUSTRIA': 0.916, 'UNITED ARAB EMIRATES': 0.911, 'SPAIN': 0.905, 'FRANCE': 0.903, 'CYPRUS': 0.896, 'ITALY': 0.895, 'ESTONIA': 0.89, 'CZECHIA': 0.889, 'GREECE': 0.887, 'POLAND': 0.876, 'BAHRAIN': 0.875, 'LITHUANIA': 0.875, 'SAUDI ARABIA': 0.875, 'PORTUGAL': 0.866, 'LATVIA': 0.863, 'ANDORRA': 0.858, 'CROATIA': 0.858, 'CHILE': 0.855, 'QATAR': 0.855, 'SAN MARINO': 0.853, 'SLOVAKIA': 0.848, 'HUNGARY': 0.846, 'ARGENTINA': 0.842, 'TÜRKIYE': 0.838, 'MONTENEGRO': 0.832, 'KUWAIT': 0.831, 'BRUNEI DARUSSALAM': 0.829, 'RU

In [None]:
# Create visualization for insight 6
hdi_scatter = px.scatter(df6, x='HDI', y='Total Departures (thousands)',
                         hover_name='Countries', log_x=True, log_y=True,
                         title='Human Development Index vs Outbound Tourists')
hdi_scatter.show()

hdi_choro = px.choropleth(df6, locations='Country Codes', color='HDI', hover_name='Countries',
                          title='Human Development Index Globally', hover_data=['Total Departures (thousands)'])
hdi_choro.show()

outbound_choro = px.choropleth(df6, locations='Country Codes', color='Total Departures (thousands)', hover_name='Countries',
                               title='Outbound Tourism Globally (total departures in thousands)', hover_data=['HDI'])
outbound_choro.show()

In [None]:
fh = open('data_viz_div_6_1.html', 'w') # this just creates a blank file, so that the write_html() function can write to this file later
fh.close()
pio.write_html(hdi_scatter, file='data_viz_div_6_1.html', auto_open=True, full_html=False)

fh = open('data_viz_div_6_2.html', 'w') # this just creates a blank file, so that the write_html() function can write to this file later
fh.close()
pio.write_html(hdi_choro, file='data_viz_div_6_2.html', auto_open=True, full_html=False)

fh = open('data_viz_div_6_3.html', 'w') # this just creates a blank file, so that the write_html() function can write to this file later
fh.close()
pio.write_html(outbound_choro, file='data_viz_div_6_3.html', auto_open=True, full_html=False)