## Making the Final Dataset

In [1]:
import pandas as pd

After carefully considering multiple data sources, we consolidated the final selected features into two datasets.

In [None]:
# Numbeo indices: data scraped from the Numbeo database
numbeo_dataset = pd.read_csv(
    filepath_or_buffer="../Data/combined_numbeo_dataset_adjusted.csv"
)

# Tourism indicators: selection taken from the Tourism dataset on Kaggle
tourism_dataset = pd.read_csv(
    filepath_or_buffer="../Data/tourism_combined_data.csv"
)

Merging the datasets. Because we didn't want to introduce more null data, we chose to perform an inner join, even though this leaves the dataset with fewer countries.

In [3]:
# Inner join on the country name: only countries present in BOTH sources are kept,
# with the tourism columns on the left and the Numbeo columns on the right.
combined_df_inner = tourism_dataset.merge(
    numbeo_dataset,
    how="inner",
    on="Country",
)
combined_df_inner


Unnamed: 0,Code,Country,tourism_employment_per_1000_over_time,food_employment_per_1000_over_time,tourism_gdp_percentage_over_time,business_to_personal_ratio_over_time,avg_stay_days_over_time,inbound_arrivals_over_time,domestic_tourists_over_time,inbound_to_outbound_ratio_over_time,...,Crime Index_crime_index,Safety Index_crime_index,Health Care Index_health_care_index,Pollution Index_pollution_index,Quality of Life Index_quality_of_life_index,Climate Index_quality_of_life_index,Traffic Index_traffic_index,Time Index(in minutes)_traffic_index,Inefficiency Index_traffic_index,CO2Emission Index_traffic_index
0,ALB,Albania,15.608840,,,0.021680,2.562741,1793.802738,,,...,44.7,55.3,48.2,77.0,104.3,86.4,114.7,36.7,105.7,1492.0
1,DZA,Algeria,7.411779,,1.129921,1.764563,1.621484,,,,...,52.6,47.4,54.5,63.9,,,,,,
2,AGO,Angola,2.439200,0.710283,,0.111823,4.000000,2.001969,,6.428571,...,66.3,33.7,,,,,,,,
3,ARG,Argentina,28.270541,11.444930,1.890788,0.140625,2.552286,16.460292,645.361734,0.420933,...,63.4,36.6,68.0,50.8,118.0,98.3,175.4,43.8,190.9,4039.3
4,AUS,Australia,21.464763,7.825559,1.572674,0.280377,2.216665,247.324185,3157.386095,,...,47.3,52.7,73.4,26.8,192.2,93.8,163.0,37.4,237.8,6245.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,GBR,United Kingdom,42.262480,21.678139,2.069407,0.269532,2.170238,116.426500,1830.795881,,...,48.3,51.7,72.7,40.7,174.5,87.2,134.2,34.7,157.4,4112.9
85,USA,United States,13.763542,3.456045,2.875489,0.160518,,69.200533,,0.612504,...,49.2,50.8,67.8,36.7,188.8,78.5,151.1,32.9,234.2,7302.8
86,URY,Uruguay,35.711428,14.620641,7.188250,0.084674,4.496567,913.040069,1685.925742,1.462021,...,52.0,48.0,68.6,43.5,139.8,98.0,150.7,39.5,163.3,3590.9
87,UZB,Uzbekistan,1.545283,,,0.021544,2.551356,,,,...,27.9,72.1,,54.0,,,,,,


The dataset now has data on 89 countries out of the 195 countries in the world.

After careful consideration of each feature, we decided to drop features that were irrelevant or too similar to others.

In [4]:
# Remove, in place, the features judged irrelevant or too similar to ones we keep.
combined_df_inner.drop(
    labels=[
        "Rent Index_cost_of_living_index",
        "Crime Index_crime_index",
        "CO2Emission Index_traffic_index",
    ],
    axis="columns",
    inplace=True,
)

In [11]:
# Display the column labels of the merged frame.
# NOTE(review): the saved output of this cell has a later execution count than the
# neighbouring cells and already lists the renamed and Inverse* columns, so it was
# presumably produced by re-running this cell after the steps further down.
# Confirm with Restart Kernel -> Run All.
combined_df_inner.columns

Index(['CountryCode', 'CountryName', 'TourismEmploymentPer1000',
       'FoodEmploymentPer1000', 'TourismGDPPercentage',
       'BusinessToPersonalRatio', 'AverageStayDays', 'InboundArrivalsPer1000',
       'DomesticTouristsPer1000', 'InboundToOutboundRatio',
       'CostOfLivingIndex', 'GroceriesCostIndex', 'RestaurantPriceIndex',
       'SafetyIndex', 'HealthCareIndex', 'PollutionIndex',
       'QualityOfLifeIndex', 'ClimateIndex', 'TrafficIndex',
       'TrafficTimeIndexMinutes', 'TrafficInefficiencyIndex',
       'InverseTrafficInefficiencyIndex', 'InverseTrafficTimeIndexMinutes',
       'InverseTrafficIndex', 'InversePollutionIndex'],
      dtype='object')

Renaming the columns so that they all follow the same naming convention.

In [6]:
# Mapping from the original column labels to unified PascalCase names.
# Built from (old_name, new_name) pairs; insertion order follows the column order.
rename_dict = dict([
    # Country identifiers
    ('Code', 'CountryCode'),
    ('Country', 'CountryName'),
    # Tourism indicators (columns suffixed with "_over_time")
    ('tourism_employment_per_1000_over_time', 'TourismEmploymentPer1000'),
    ('food_employment_per_1000_over_time', 'FoodEmploymentPer1000'),
    ('tourism_gdp_percentage_over_time', 'TourismGDPPercentage'),
    ('business_to_personal_ratio_over_time', 'BusinessToPersonalRatio'),
    ('avg_stay_days_over_time', 'AverageStayDays'),
    ('inbound_arrivals_over_time', 'InboundArrivalsPer1000'),
    ('domestic_tourists_over_time', 'DomesticTouristsPer1000'),
    ('inbound_to_outbound_ratio_over_time', 'InboundToOutboundRatio'),
    # Cost-of-living indices
    ('Cost of Living Index_cost_of_living_index', 'CostOfLivingIndex'),
    ('Groceries Index_cost_of_living_index', 'GroceriesCostIndex'),
    ('Restaurant Price Index_cost_of_living_index', 'RestaurantPriceIndex'),
    # Safety, health care and pollution indices
    ('Safety Index_crime_index', 'SafetyIndex'),
    ('Health Care Index_health_care_index', 'HealthCareIndex'),
    ('Pollution Index_pollution_index', 'PollutionIndex'),
    # Quality-of-life indices
    ('Quality of Life Index_quality_of_life_index', 'QualityOfLifeIndex'),
    ('Climate Index_quality_of_life_index', 'ClimateIndex'),
    # Traffic indices
    ('Traffic Index_traffic_index', 'TrafficIndex'),
    ('Time Index(in minutes)_traffic_index', 'TrafficTimeIndexMinutes'),
    ('Inefficiency Index_traffic_index', 'TrafficInefficiencyIndex'),
])

In [7]:
# Apply the label mapping to the column axis, modifying the frame in place.
combined_df_inner.rename(mapper=rename_dict, axis="columns", inplace=True)

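Because the majority of the columns are on a scale that runs from 0 up to a positive number, with 0 being the lowest score and the largest value being the highest, we decided to make all the other features monotonically increasing as well. We do this by taking the reciprocal of each of those features and multiplying it by 100.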

In [12]:
# New column -> source column. Each new column is the reciprocal of its source,
# scaled by 100, so that a larger value corresponds to a smaller original index.
inverse_feature_sources = {
    "InverseTrafficInefficiencyIndex": "TrafficInefficiencyIndex",
    "InverseTrafficTimeIndexMinutes": "TrafficTimeIndexMinutes",
    "InverseTrafficIndex": "TrafficIndex",
    "InversePollutionIndex": "PollutionIndex",
}

for inverse_column, source_column in inverse_feature_sources.items():
    combined_df_inner[inverse_column] = 1 / combined_df_inner[source_column] * 100

Exporting the dataset.

In [13]:
# Write the final frame to disk as CSV, leaving out the row index.
combined_df_inner.to_csv(
    path_or_buf='../Data/final_dataset.csv',
    index=False,
)