In [None]:
!pip install -q geopandas
!apt install -q proj-bin libproj-dev libgeos-dev -y
!pip install -q https://github.com/matplotlib/basemap/archive/master.zip

# Pandas is a package containing additional functions to use data frames in Python
import pandas as pd
# GeoPandas adds a "geometry" column to data frames so they can hold map data.
import geopandas as gpd
import matplotlib
import matplotlib.pyplot as plt
# Show figures inside the notebook.
%matplotlib inline
# Basemap is used later to check whether a point is on land.
from mpl_toolkits.basemap import Basemap
import warnings
# Hide all Python warnings for the rest of the notebook.
# NOTE(review): this also hides warnings which could point to real problems -
# consider filtering only the specific warnings that are known to be harmless.
warnings.simplefilter('ignore')
# These two lines allow the notebook to access the Google Drive.
from google.colab import drive
# force_remount=True requests a fresh mount even if the Drive is already mounted.
drive.mount('/content/drive', force_remount=True)

# This is the path to the project folder within the Google Drive.
# Other cells build file names by appending to this string
# (for example file_path + "species_names.tsv"), so keep the trailing slash.
file_path = "/content/drive/My Drive/"

Reading package lists...
Building dependency tree...
Reading state information...
libgeos-dev is already the newest version (3.6.2-1build2).
libproj-dev is already the newest version (4.9.3-2).
proj-bin is already the newest version (4.9.3-2).
The following package was automatically installed and is no longer required:
  libnvidia-common-440
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
  Building wheel for basemap (setup.py) ... [?25l[?25hdone
Mounted at /content/drive


---
## Notebook 4

## Georeferencing - Automated

Now we can run the georeferencing steps on all the tables.  We can automate this by reading all the species names from a file (in our Drive as species_names.tsv) into a Python list.

In [None]:
# Read one species name per line from species_names.tsv.
# The "with" block closes the file as soon as we have finished reading it.
# Blank lines are skipped so they do not become empty species names.
with open(file_path + "species_names.tsv") as species_file:
    species_list = [line.strip() for line in species_file if line.strip()]

We can use a variable to control the species name - this means we only have to change it in one place every time we want to run a different table.

In [None]:
# Display the list so we can check the species names were read correctly.
species_list

['Acromyrmex_echinatior',
 'Amblyomma_americanum',
 'Amblyomma_aureolatum',
 'Amblyomma_sculptum',
 'Apis_cerana',
 'Apis_mellifera',
 'Asobara_tabida',
 'Athalia_rosae',
 'Biorhiza_pallida',
 'Bombus_terrestris',
 'Camponotus_castaneus',
 'Camponotus_floridanus',
 'Camponotus_japonicus',
 'Camponotus_ligniperdus',
 'Cardiocondyla_obscurior',
 'Cephus_cinctus',
 'Ceratina_australensis',
 'Cotesia_vestalis',
 'Crematogaster_osakensis',
 'Dermacentor_andersoni',
 'Dermacentor_variabilis',
 'Dinoponera_quadriceps',
 'Exoneurella_tridentata',
 'Fopius_arisanus',
 'Halictus_scabiosae',
 'Harpegnathos_saltator',
 'Ixodes_persulcatus',
 'Ixodes_ricinus',
 'Ixodes_scapularis',
 'Lasius_niger',
 'Linepithema_humile',
 'Lysiphlebus_fabarum',
 'Megachile_rotundata',
 'Megalopta_genalis',
 'Megastigmus_spermotrophus',
 'Messor_barbarus',
 'Messor_capitatus',
 'Messor_hellenius',
 'Messor_structor',
 'Microplitis_demolitor',
 'Monomorium_pharaonis',
 'Nasonia_giraulti',
 'Nasonia_vitripennis',
 'Ne

For each species, we read the data table which we created in the data cleaning step into Python.

Using a ```for``` loop we can go through each of these names one by one and run all the georeferencing steps.

This is exactly the same code as we used for a single table - I have just moved it all into one cell to make it easier to run the loop.

I have added two extra lines so that we can combine all the single row summary tables into one long table for all species.

I also changed the order of the steps slightly to make the code run faster, and made the points on the map a bit smaller.

In [None]:
# ---------------------------------------------------------------------------
# Georeferencing for every species in species_list.
#
# For each species this cell:
#   1. reads the cleaned table from filtered_main_tables/
#   2. converts it to a GeoDataFrame and draws an "unfiltered" map
#   3. removes points which are not inside the country they were reported in
#      (these are saved to wrong_country_tables/)
#   4. removes points which are not on land
#   5. draws a "filtered" map and saves the clean table and a one-row summary
# At the end, all the one-row summaries are combined into one table.
#
# Everything which is the same for every species (country polygons, country
# codes, the land/sea map and the colours) is prepared once, before the loop.
# ---------------------------------------------------------------------------

# We will use the country polygons every time - we only need to import them once.
# Import polygons representing the shape and location of each country and
# match these to the country names or codes.
# The geometry column of this table contains the information needed to reproduce the shapes of the countries on a map.
worldmap = gpd.read_file(file_path + "/country_boundaries_shapefiles/World_Countries__Generalized_.shp")
worldmap = worldmap.to_crs('epsg:4088')

# Record all the country polygons into a Python dictionary.
# This just links each country code (the "ISO" column) to a polygon in the worldmap table.
country_dict = dict(zip(worldmap['ISO'], worldmap['geometry']))

# The country code table is the same for every species, so we read it once here.
# NOTE(review): this table is loaded and tidied but never merged into the species
# tables below - confirm whether a merge to add the country name was intended.
another_tab = pd.read_csv(file_path + "country_codes.tsv", sep="\t")
another_tab['countryCode'] = another_tab['countryCode'].str.strip()

# Read in a map in the world cylindrical projection.
# We use its ```is_land``` function later to look for points in the ocean.
# Building this map is slow and it never changes, so we only do it once.
m = Basemap(projection='cyl',
            llcrnrlat=-90,
            urcrnrlat=90,
            llcrnrlon=-180,
            urcrnrlon=180,
            resolution='l')

# feel free to change these colours using the codes here: https://htmlcolorcodes.com/
sea_colour = '#B5F0FC'
land_colour = '#CDFCB5'
point_colour = '#8237B9'

# a list to collect the one-row summary table for each species
# (joining them all at the end is much faster than adding them one by one)
geo_summary_rows = []

# run everything for every species in the list
for species_name in species_list:
    # Everything in this indented section will run once for each species name

    mytab = pd.read_csv(file_path + "/filtered_main_tables/" + species_name + ".csv", sep="\t")

    # Make another summary table to record the results of processing the data.
    # Create an empty dataframe with these columns and with one row for this species
    geosummary = pd.DataFrame(columns=['nrecords_after_data_cleaning', 'n_in_wrong_country', 'n_in_ocean',
                                       'nrecords_final'],
                              index=[species_name])

    # record the original number of records
    geosummary.loc[species_name, 'nrecords_after_data_cleaning'] = len(mytab)

    # The "ISO" column in the worldmap table corresponds to the information in the
    # "countryCode" column in our filtered species data table.
    # We can use the map data to check that each point in the species table is
    # actually in the country it is reported as being in.

    # In order to allow Python to interpret geographical data, we need to convert our
    # species dataframe into a "geodataframe".  This converts the values in the
    # 'decimalLongitude' and 'decimalLatitude' columns into points on a map.
    # Points can be converted in different ways depending on the map projection used
    # - https://en.wikipedia.org/wiki/Map_projection
    # We tell Python that our longitudes and latitudes are in EPSG 4326 (WGS 84),
    # which is plain longitude/latitude in degrees.
    # (The "Web Mercator" projection is a different system, EPSG 3857.)
    gdf = gpd.GeoDataFrame(mytab,
                           geometry=gpd.points_from_xy(mytab['decimalLongitude'], mytab['decimalLatitude']),
                           crs="epsg:4326")
    # The table now has a "geometry" column with the new information.

    # Converting the points to the same projection as the country polygons - the
    # "World Equidistant Cylindrical" or EPSG 4088 - just makes the images of the map look more familiar.
    gdf = gdf.to_crs('epsg:4088')

    # We also add a couple of extra columns to the table here with the x and y
    # positions in this projection - just for convenience later.
    x_positions_cyl = gdf.geometry.x
    y_positions_cyl = gdf.geometry.y
    gdf['x_positions'] = x_positions_cyl
    gdf['y_positions'] = y_positions_cyl

    # We can now plot the points on a map.

    # create an empty set of axis
    f = plt.figure(figsize=(10, 10))

    # add a plot to these axis
    a = f.add_subplot(111, facecolor=sea_colour)

    # plot the countries onto the axis
    worldmap.plot(ax=a, color=land_colour, edgecolor='black', lw=0.3)

    # add the species observation points
    a.scatter(gdf['x_positions'], gdf['y_positions'], s=10, marker="^", color=point_colour)

    # add a title
    a.set_title(species_name.replace("_", " ") + " Unfiltered Points")

    # save a copy, then close this figure so that figures do not pile up in memory
    f.savefig(file_path + "/unfiltered_maps/" + species_name + ".png", dpi=300, bbox_inches='tight')
    plt.close(f)

    # We want to check that each point has been recorded in the right country.
    # Using the ```within``` function we can check if a point is within the right polygon.
    # Now we can run through each point in the table and check it is in the right country.
    correct_country = []
    # run through every point in the table
    for iso, point in zip(gdf['countryCode'], gdf['geometry']):
        if iso in country_dict:
            # get the polygon the point should be inside
            poly = country_dict[iso]
            # check the point is in the polygon
            correct_country.append(point.within(poly))
        else:
            # we have no polygon for this country code, so the point cannot be confirmed
            correct_country.append(False)

    # insert this information into the table
    gdf['correct_country'] = correct_country

    # Now the "correct_country" column in the table tells us if the point was recorded in the right country or not.
    # We want to save the incorrect countries in a separate table and count how many
    # there are.
    wrong_country = gdf[gdf['correct_country'] == False]
    wrong_country.to_csv(file_path + "/wrong_country_tables/" + species_name + ".csv", sep="\t", index=False)

    count_wrong_country = len(wrong_country)
    geosummary.loc[species_name, "n_in_wrong_country"] = count_wrong_country

    # Keep only the points in the right country.
    # .copy() gives us an independent table, so that adding a new column below
    # changes this table and not a temporary view of the old one.
    gdf = gdf[gdf['correct_country'] == True].copy()

    # Next we want to look for points which are in the ocean - these are unlikely
    # to be correct, especially as they are far from the coast.
    # The ```is_land``` function of the map we made before the loop tells us if a point is on land.
    # We need to run this for every point in the table.
    # make a list to store the results
    results = []
    for long, lat in zip(gdf['decimalLongitude'], gdf['decimalLatitude']):
        # check if the point is on land
        point_is_land = m.is_land(long, lat)
        results.append(point_is_land)

    # put this data into the table
    gdf['is_land'] = results

    # Now we can move the "sea" points into a separate table and filter them out of the main table.
    gdf_sea = gdf[gdf['is_land'] == False]
    geosummary.loc[species_name, 'n_in_ocean'] = len(gdf_sea)
    gdf = gdf[gdf['is_land'] == True]

    # Now we've finished filtering the table, we can make another map with just the good quality points.

    # create an empty set of axis
    f = plt.figure(figsize=(10, 10))

    # add a plot to these axis
    a = f.add_subplot(111, facecolor=sea_colour)

    # plot the countries onto the axis
    worldmap.plot(ax=a, color=land_colour, edgecolor='black', lw=0.3)

    # add the species observation points
    a.scatter(gdf['x_positions'], gdf['y_positions'], s=10, marker="^", color=point_colour)

    # add a title
    a.set_title(species_name.replace("_", " ") + " Filtered Points")

    # save a copy, then close this figure
    f.savefig(file_path + "/filtered_maps/" + species_name + ".png", dpi=300, bbox_inches='tight')
    plt.close(f)

    # We also want to record the remaining number of points in the summary table.
    geosummary.loc[species_name, 'nrecords_final'] = len(gdf)

    # Finally, we save the clean table and the summary table.
    gdf.to_csv(file_path + "/geo_filtered_main_tables/" + species_name + ".csv", sep="\t", index=False)
    geosummary.to_csv(file_path + "/geo_summary_tables/" + species_name + ".csv", sep="\t")

    # keep the summary so it can be added to the big summary table
    geo_summary_rows.append(geosummary)

# join all the single row summary tables into one big summary table
# (if there were no species at all we just make an empty table)
if geo_summary_rows:
    all_geo_summary_tables = pd.concat(geo_summary_rows)
else:
    all_geo_summary_tables = pd.DataFrame()

# save the big summary table
all_geo_summary_tables['species_name'] = all_geo_summary_tables.index.values
all_geo_summary_tables.to_csv(file_path + "final_summary_georeferencing.tsv", sep="\t", index=False)