In [None]:
# Imports modules
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import folium
import math
from pyproj import CRS

%matplotlib inline

In [None]:
# File locations of the inputs: national park polygons, park accessibility data,
# Flickr post points and the continent lookup table (adjust DATA_DIR to your own setup)
DATA_DIR = r"C:\Users\jooel\GISproject\data"
national_parks_fp = DATA_DIR + r"\WDPA_national_parks.shp"
accessibility_fp = DATA_DIR + r"\dissolved_national_parks.shp"
flickr_fp = DATA_DIR + r"\flickr-in-national-parks.gpkg"
continents_fp = DATA_DIR + r"\continent.csv"

In [None]:
# Loads the national park layer from national_parks_fp (read with UTF-8 encoding)
national_parks = gpd.read_file(
    national_parks_fp,
    encoding="utf-8",
)

In [None]:
# Prints the number of rows in the national park data and previews the first rows
print(national_parks.shape[0])
national_parks.head()

In [None]:
# Merges the rows that share the same park identity (name, WDPA id and country codes)
# into one row per park; the remaining attribute columns are aggregated with 'sum'
park_keys = ['NAME', 'WDPAID', 'PARENT_ISO', 'ISO3']
national_parks = (
    national_parks
    .dissolve(by=park_keys, aggfunc='sum')
    .reset_index()
)

In [None]:
# Prints the row count after dissolving and previews the first rows
print(national_parks.shape[0])
national_parks.head()

In [None]:
# Loads the park accessibility layer from accessibility_fp (read with UTF-8 encoding)
accessibility = gpd.read_file(
    accessibility_fp,
    encoding="utf-8",
)

In [None]:
# Prints the number of rows in the accessibility data and previews the first rows
print(accessibility.shape[0])
accessibility.head()

In [None]:
# Keeps only the park id and the accessibility statistic columns
accessibility_columns = [
    'WDPAID',
    'acc_count', 'acc_mean', 'acc_median', 'acc_stdev',
    'acc_min', 'acc_max', 'acc_range',
]
accessibility_filtered = accessibility[accessibility_columns]

In [None]:
# Loads the Flickr post points from flickr_fp (read with UTF-8 encoding)
flickr_data = gpd.read_file(
    flickr_fp,
    encoding="utf-8",
)

In [None]:
# Prints the number of Flickr posts and previews the first rows
# (the rows contain post details: remember not to share this output)
print(flickr_data.shape[0])
flickr_data.head()

In [None]:
# Drops rows that repeat an earlier row in all of the listed post columns
duplicate_key_columns = [
    "id", "title", "description", "date_posted",
    "filename", "photo_url", "geometry",
]
flickr_data_cleaned = flickr_data.drop_duplicates(subset=duplicate_key_columns)

In [None]:
# Prints the number of posts left after de-duplication and previews the first rows
print(flickr_data_cleaned.shape[0])
flickr_data_cleaned.head()

In [None]:
# Loads the country-to-continent lookup table from the CSV file
continents = pd.read_csv(
    continents_fp,
)

In [None]:
# Prints the number of rows in the continent table and previews the first rows
print(continents.shape[0])
continents.head()

In [None]:
# Keeps only the country name, country codes and the two region columns
continent_columns = [
    'name',
    'alpha-2',
    'alpha-3',
    'region_remastered',
    'region_bigger',
]
continents = continents[continent_columns]

In [None]:
# Keeps only the park id, name, geometry, area and country code columns of the national park data
park_columns = [
    'WDPAID',
    'NAME',
    'geometry',
    'GIS_AREA',
    'Land_area',
    'PARENT_ISO',
    'ISO3',
]
national_parks = national_parks[park_columns]



In [None]:
# Gives the continent table's three-letter country code column the same name ('ISO3')
# as the country code column of the national park data
continents = continents.rename({"alpha-3": "ISO3"}, axis="columns")

In [None]:
# Brings the national park data and the Flickr data into the same coordinate system (EPSG:4326).
# to_crs() returns a new GeoDataFrame instead of modifying in place, so the result is assigned back.
national_parks = national_parks.to_crs(epsg=4326)
if flickr_data_cleaned.crs is None:
    # No CRS is stored with the points: declare it without transforming the coordinates
    flickr_data_cleaned = flickr_data_cleaned.set_crs(national_parks.crs)
else:
    # A CRS is stored: reproject the coordinates rather than just overwriting the CRS label
    flickr_data_cleaned = flickr_data_cleaned.to_crs(national_parks.crs)

In [None]:
# Shows True when both layers report the same coordinate reference system
national_parks.crs == flickr_data_cleaned.crs

In [None]:
# Attaches the continent columns to each park by matching on the 'ISO3' country code
# (inner join: parks whose code is missing from the continent table are not kept)
national_parks = pd.merge(
    left=national_parks,
    right=continents,
    how='inner',
    on='ISO3',
)

In [None]:
# Prints the number of parks after the merge and previews the first rows
print(national_parks.shape[0])
national_parks.head()

In [None]:
# Spatially joins each Flickr post to the national park(s) it lies within
# (inner join: posts that are not within any park are not kept).
# `predicate` replaces the `op` keyword, which is deprecated and no longer accepted by current geopandas.
flickr_joined = gpd.sjoin(flickr_data_cleaned, national_parks, how="inner", predicate="within")

In [None]:
# Prints the number of joined post/park rows and previews the first rows
print(flickr_joined.shape[0])
flickr_joined.head()

In [None]:
# Finds post ids that occur on more than one joined row:
# counts the photo_url values per post id and keeps the ids with a count above one
flickr_joined_d = flickr_joined[['id', 'photo_url']]
rows_per_post = flickr_joined_d.groupby(['id']).count().reset_index()
occurs_more_than_once = rows_per_post['photo_url'] > 1
flickr_duplicates = rows_per_post[occurs_more_than_once]
print(flickr_duplicates.shape[0])
flickr_duplicates.head()

There are 14 067 posts that fall inside the borders of at least two overlapping national parks.

In [None]:
# Counts the Flickr posts per national park and adds a constant 'Number_of_parks' column (1 per park).
# The frame is built with .assign() so the new column is added to a fresh DataFrame rather than
# to a column-subset of flickr_joined (which triggers pandas' SettingWithCopyWarning).
# NOTE(review): groupby drops groups whose key contains NaN by default, so a park with a missing
# value in any of the key columns would not appear in the result - confirm this is acceptable.
park_key_columns = ['WDPAID', 'name', 'NAME', 'GIS_AREA', 'Land_area', 'ISO3', 'region_remastered', 'region_bigger']
flickr_joined_b = flickr_joined[park_key_columns + ['id']].assign(Number_of_parks=1)
# After count(), the 'id' column holds the number of posts in each park
flickr_grouped = flickr_joined_b.groupby(park_key_columns + ['Number_of_parks']).count().reset_index()

In [None]:
# Adds the accessibility statistics to every park row by matching on 'WDPAID'
# (left join: parks without accessibility data are kept)
flickr_grouped = pd.merge(
    left=flickr_grouped,
    right=accessibility_filtered,
    on='WDPAID',
    how='left',
)

In [None]:
# Prints the number of park rows and previews the first rows
print(flickr_grouped.shape[0])
flickr_grouped.head()

In [None]:
# National park Gulf Of Mannar in India is listed twice (under two spellings),
# so one of the rows has to be dropped; print the rows for each spelling
for park_name in ('Gulf Of Mannar', 'Gulf of Mannar'):
    is_this_park = flickr_grouped['NAME'] == park_name
    print(flickr_grouped.loc[is_this_park])

In [None]:
# Drops the redundant Gulf of Mannar row by its index label
# NOTE(review): 968 is a hard-coded index label; it only points at the intended row while the
# input data and row order stay unchanged, and re-running this cell raises a KeyError - verify.
flickr_grouped = flickr_grouped.drop(index=[968])

In [None]:
# Prints the number of park rows left after the drop
print(flickr_grouped.shape[0])

In [None]:
# Post density per park: number of posts ('id' count) divided by the park's whole area (GIS_AREA)
flickr_grouped['post_density_total'] = flickr_grouped['id'].div(flickr_grouped['GIS_AREA'])

In [None]:
# Post density per park: number of posts ('id' count) divided by the park's land area (Land_area)
flickr_grouped['post_density_land'] = flickr_grouped['id'].div(flickr_grouped['Land_area'])

In [None]:
# Renames the 'name' column to 'Country' and the 'id' column to 'Number_of_posts'
# (the number of posts inside the park polygon)
column_renames = {"id": "Number_of_posts", "name": "Country"}
flickr_grouped = flickr_grouped.rename(column_renames, axis="columns")

In [None]:
# Picks the country, region, area and count columns needed for the national summary
national_columns = [
    'Country', 'ISO3', 'Number_of_parks', 'GIS_AREA',
    'region_remastered', 'region_bigger', 'Land_area', 'Number_of_posts',
]
flickr_summary_national = flickr_grouped[national_columns]

In [None]:
# Displays the selected columns (still one row per park at this point)
flickr_summary_national

In [None]:
# Sums the park, area and post columns per country (grouped by country name, code and regions)
country_keys = ['Country', 'ISO3', 'region_remastered', 'region_bigger']
flickr_summary_national = (
    flickr_summary_national
    .groupby(country_keys)
    .sum()
    .reset_index()
)

In [None]:
# National post densities: total posts divided by the summed whole area and by the summed land area
national_posts = flickr_summary_national['Number_of_posts']
flickr_summary_national['post_density_national_total'] = national_posts.div(flickr_summary_national['GIS_AREA'])
flickr_summary_national['post_density_national_land'] = national_posts.div(flickr_summary_national['Land_area'])

In [None]:
# Prints the number of rows in the national summary and previews the first rows
print(flickr_summary_national.shape[0])
flickr_summary_national.head()

In [None]:
# Computes per-country descriptive statistics (max, min, mean, median, variance)
# of the park-level post density and accessibility columns
stats_columns = [
    'Country', 'post_density_total',
    'acc_count', 'acc_mean', 'acc_median', 'acc_stdev', 'acc_min', 'acc_max', 'acc_range',
]
flickr_summary_stats = flickr_grouped[stats_columns]
parks_by_country = flickr_summary_stats.groupby(['Country'])
flickr_summary_national_max = parks_by_country.max().reset_index()
flickr_summary_national_min = parks_by_country.min().reset_index()
flickr_summary_national_mean = parks_by_country.mean().reset_index()
flickr_summary_national_median = parks_by_country.median().reset_index()
flickr_summary_national_var = parks_by_country.var().reset_index()



In [None]:
# Adds the per-country descriptive statistics to the national summary.
# The statistics frames are grouped by 'Country' only, while flickr_summary_national is grouped by
# country name, code and both regions. Values are therefore looked up by 'Country' instead of being
# copied by row position, so a difference in the two sets of groups cannot shift values onto the
# wrong country.
# NOTE(review): every acc_*_national column is derived from the parks' 'acc_min' values -
# confirm that 'acc_min' is the intended per-park accessibility measure.
def _country_stat(stats, column):
    """Return `column` of a per-country statistics frame, aligned to the rows of flickr_summary_national."""
    return flickr_summary_national['Country'].map(stats.set_index('Country')[column])


flickr_summary_national['post_density_total_max'] = _country_stat(flickr_summary_national_max, 'post_density_total')
flickr_summary_national['post_density_total_min'] = _country_stat(flickr_summary_national_min, 'post_density_total')
flickr_summary_national['post_density_total_mean'] = _country_stat(flickr_summary_national_mean, 'post_density_total')
flickr_summary_national['post_density_total_median'] = _country_stat(flickr_summary_national_median, 'post_density_total')
flickr_summary_national['post_density_total_var'] = _country_stat(flickr_summary_national_var, 'post_density_total')
flickr_summary_national['acc_mean_national'] = _country_stat(flickr_summary_national_mean, 'acc_min')
flickr_summary_national['acc_median_national'] = _country_stat(flickr_summary_national_median, 'acc_min')
flickr_summary_national['acc_var_national'] = _country_stat(flickr_summary_national_var, 'acc_min')
flickr_summary_national['acc_min_national'] = _country_stat(flickr_summary_national_min, 'acc_min')
flickr_summary_national['acc_max_national'] = _country_stat(flickr_summary_national_max, 'acc_min')
# Range = largest minus smallest per-park 'acc_min' within the country
flickr_summary_national['acc_range_national'] = (
    _country_stat(flickr_summary_national_max, 'acc_min') - _country_stat(flickr_summary_national_min, 'acc_min')
)

In [None]:
# Checks national summary with descriptive statistics (displays the whole frame, one row per country group)
flickr_summary_national

In [None]:
# Builds the continental summary: sums parks, areas and posts per 'region_remastered'
# and derives the post densities for the whole area and for the land area
continent_summary_columns = ['region_remastered', 'Number_of_parks', 'GIS_AREA', 'Land_area', 'Number_of_posts']
flickr_summary_continent = (
    flickr_grouped[continent_summary_columns]
    .groupby(['region_remastered'])
    .sum()
    .reset_index()
)
continent_posts = flickr_summary_continent['Number_of_posts']
flickr_summary_continent['post_density_continent_total'] = continent_posts.div(flickr_summary_continent['GIS_AREA'])
flickr_summary_continent['post_density_continent_land'] = continent_posts.div(flickr_summary_continent['Land_area'])

In [None]:
# Prints the number of rows in the continental summary and previews the first rows
print(flickr_summary_continent.shape[0])
flickr_summary_continent.head()

In [None]:
# Computes per-region ('region_remastered') descriptive statistics (max, min, mean, median, variance)
# of the park-level post density and accessibility columns
region_stats_columns = [
    'region_remastered', 'post_density_total',
    'acc_count', 'acc_mean', 'acc_median', 'acc_stdev', 'acc_min', 'acc_max', 'acc_range',
]
flickr_summary_stats_2 = flickr_grouped[region_stats_columns]

parks_by_region = flickr_summary_stats_2.groupby(['region_remastered'])
flickr_summary_continent_max = parks_by_region.max().reset_index()
flickr_summary_continent_min = parks_by_region.min().reset_index()
flickr_summary_continent_mean = parks_by_region.mean().reset_index()
flickr_summary_continent_median = parks_by_region.median().reset_index()
flickr_summary_continent_var = parks_by_region.var().reset_index()

In [None]:
# Adds the per-region descriptive statistics to the continental summary (copied row by row).
# NOTE(review): 'post_density_continent__total_min' contains a double underscore, unlike its
# sibling columns; it is kept as is because the name ends up in the exported CSV.
# NOTE(review): every acc_*_continental column is derived from the parks' 'acc_min' values -
# confirm that 'acc_min' is the intended per-park accessibility measure.
density_sources = {
    'post_density_continent_total_max': flickr_summary_continent_max,
    'post_density_continent__total_min': flickr_summary_continent_min,
    'post_density_continent_total_mean': flickr_summary_continent_mean,
    'post_density_continent_total_median': flickr_summary_continent_median,
    'post_density_continent_total_var': flickr_summary_continent_var,
}
for target_column, stats_frame in density_sources.items():
    flickr_summary_continent[target_column] = stats_frame['post_density_total']

accessibility_sources = {
    'acc_mean_continental': flickr_summary_continent_mean,
    'acc_median_continental': flickr_summary_continent_median,
    'acc_var_continental': flickr_summary_continent_var,
    'acc_min_continental': flickr_summary_continent_min,
    'acc_max_continental': flickr_summary_continent_max,
}
for target_column, stats_frame in accessibility_sources.items():
    flickr_summary_continent[target_column] = stats_frame['acc_min']

# Range = largest minus smallest per-park 'acc_min' within the region
flickr_summary_continent['acc_range_continental'] = flickr_summary_continent_max['acc_min'].sub(
    flickr_summary_continent_min['acc_min']
)

In [None]:
# Checks continental summary with descriptive statistics (displays the whole frame, one row per region)
flickr_summary_continent

In [None]:
# Builds the alternative continental summary: sums parks, areas and posts per 'region_bigger'
# and derives the post densities for the whole area and for the land area
largecontinent_summary_columns = ['region_bigger', 'Number_of_parks', 'GIS_AREA', 'Land_area', 'Number_of_posts']
flickr_summary_largecontinent = (
    flickr_grouped[largecontinent_summary_columns]
    .groupby(['region_bigger'])
    .sum()
    .reset_index()
)
largecontinent_posts = flickr_summary_largecontinent['Number_of_posts']
flickr_summary_largecontinent['post_density_continent_total'] = largecontinent_posts.div(flickr_summary_largecontinent['GIS_AREA'])
flickr_summary_largecontinent['post_density_continent_land'] = largecontinent_posts.div(flickr_summary_largecontinent['Land_area'])

In [None]:
# Prints the number of rows in the alternative continental summary and previews the first rows
print(flickr_summary_largecontinent.shape[0])
flickr_summary_largecontinent.head()

In [None]:
# Computes per-'region_bigger' descriptive statistics (max, min, mean, median, variance)
# of the national post densities
flickr_summary_stats_national_2 = flickr_summary_national[['region_bigger', 'post_density_national_total']]
countries_by_region = flickr_summary_stats_national_2.groupby(['region_bigger'])
flickr_summary_largecontinent_max = countries_by_region.max().reset_index()
flickr_summary_largecontinent_min = countries_by_region.min().reset_index()
flickr_summary_largecontinent_mean = countries_by_region.mean().reset_index()
flickr_summary_largecontinent_median = countries_by_region.median().reset_index()
flickr_summary_largecontinent_var = countries_by_region.var().reset_index()

In [None]:
# Adds the per-region statistics of the national post densities to the alternative continental summary.
# The summary is grouped from flickr_grouped while the statistics are grouped from
# flickr_summary_national, so values are looked up by 'region_bigger' instead of being copied by
# row position; a difference in the two sets of groups cannot shift values onto the wrong region.
# NOTE(review): 'post_density_national__total_min' contains a double underscore, unlike its
# sibling columns; it is kept as is because the name ends up in the exported CSV.
def _region_stat(stats):
    """Return 'post_density_national_total' of a per-region statistics frame, aligned to the summary rows."""
    lookup = stats.set_index('region_bigger')['post_density_national_total']
    return flickr_summary_largecontinent['region_bigger'].map(lookup)


flickr_summary_largecontinent['post_density_national_total_max'] = _region_stat(flickr_summary_largecontinent_max)
flickr_summary_largecontinent['post_density_national__total_min'] = _region_stat(flickr_summary_largecontinent_min)
flickr_summary_largecontinent['post_density_national_total_mean'] = _region_stat(flickr_summary_largecontinent_mean)
flickr_summary_largecontinent['post_density_national_total_median'] = _region_stat(flickr_summary_largecontinent_median)
flickr_summary_largecontinent['post_density_national_total_var'] = _region_stat(flickr_summary_largecontinent_var)

In [None]:
# Previews up to eight rows of the alternative continental summary with descriptive statistics
flickr_summary_largecontinent.head(n=8)

In [None]:
# Writes the park, national, continental and larger-continental tables to CSV files
# (adjust RESULTS_DIR to your own setup)
RESULTS_DIR = r"C:\Users\jooel\GISproject\results"
outfp1 = RESULTS_DIR + r"\flickr_parks_density.csv"
outfp2 = RESULTS_DIR + r"\flickr_national_density.csv"
outfp3 = RESULTS_DIR + r"\flickr_continental_density.csv"
outfp4 = RESULTS_DIR + r"\flickr_larger_continental_density.csv"
result_tables = (
    (flickr_grouped, outfp1),
    (flickr_summary_national, outfp2),
    (flickr_summary_continent, outfp3),
    (flickr_summary_largecontinent, outfp4),
)
for result_table, out_path in result_tables:
    # UTF-8 is also the encoding to_csv uses when none is given
    result_table.to_csv(out_path, encoding='utf-8')