In [None]:

# Setup cell: imports the analysis/plotting libraries, mounts Google Drive and
# defines the base path that every later cell prepends to file names.

# Pandas is a package containing additional functions to use data frames in Python
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
import numpy as np
import seaborn as sns
# Silence every warning for the rest of the session (this also hides
# potentially useful numpy / pandas warnings).
warnings.simplefilter('ignore')
# These two lines allow the notebook to access the Google Drive.
# force_remount=True remounts the drive even if it is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# This is the path to the project folder within the Google Drive.
# It ends with a slash; later cells build paths by string concatenation.
file_path = "/content/drive/My Drive/"

  import pandas.util.testing as tm


Mounted at /content/drive


In [None]:
!pip install geopandas
import geopandas as gpd



In [None]:
import itertools
import os

---
## Notebook 17
# Calculating overlaps between species

The final piece of analysis we need to do is to look at how the overlap in distribution between species is predicted to change under the different combinations of climate change models and ssps.

We would expect that some species will overlap more and others will overlap less - as the range of different species increases and decreases.

For the purposes of studying virus transmission, we are particularly interested in species which are predicted to overlap more.

First we just create some variables with the lists of models etc - as usual.

In [None]:
# Read the species names, one per line. The `with` block guarantees the file
# handle is closed again, and blank lines are skipped so that e.g. a trailing
# newline does not produce an empty species name.
with open(file_path + "species_names.tsv") as species_file:
    species_list = [line.strip() for line in species_file if line.strip()]

# Climate models, scenarios (ssps) and future time periods; every combination
# of these three lists is analysed in the cells below.
models = ['BCC-CSM2-MR',
          'CanESM5']
scenarios = ['ssp126', 'ssp585']

time_periods = ['2021-2040', '2041-2060', '2061-2080', '2081-2100']

# Suffix identifying the species distribution model in the result file names.
sdm = 'rf'


Now we want to look at the overlap between species.

This is fairly straightforward - on the grid representing the map of the world we just count the number of squares which are habitable by both species under each model.

Running every possible combination of species (approximately 70 * 70 = 4900) and every model / scenario / time period combination (2 * 2 * 4 = 16) means there is a large number of comparisons to perform (about 78,000), so the code needs to be quite efficient. To achieve this I had to make it a bit more difficult to read.

First we run the present data.

In [None]:
# Dictionaries to store the results:
#   resultsD: combination name -> {(i, j) species index pair -> shared squares}
#   totD:     species name -> {combination name -> total habitable squares}
resultsD = dict()
totD = dict()

n_species = len(species_list)

# One grid layer per species, to store the intermediate results.
# The layers are initialised with NaN instead of np.empty: np.empty leaves
# arbitrary memory contents behind, so the layer of a species without a result
# file would otherwise be compared using garbage values.
present = np.full([n_species, 930, 2160], np.nan, dtype=float)

# for every possible species
for i, species in enumerate(species_list):
    # path to the distribution data for the species
    path = file_path + "SDM_results/" + species + "/present_" + sdm + ".npy"

    # a few files are missing for various reasons - their layer stays NaN
    if not os.path.exists(path):
        continue

    # load the distribution data and store it in its layer
    present_matrix = np.load(path)
    present[i, :, :] = present_matrix

    # store the total number of grid squares habitable by this species
    totD.setdefault(species, dict())
    totD[species]['present'] = np.nansum(present_matrix)

# Compare against 1 once for the whole stack rather than twice per pair.
# NaN == 1 is False, so missing layers and missing cells never count.
present_occupied = present == 1

overlapD = dict()
# make every possible combination of pairs of species grids
for pair in itertools.combinations(range(n_species), 2):
    # count the squares which are occupied in both grids
    overlapD[pair] = np.count_nonzero(present_occupied[pair[0]] & present_occupied[pair[1]])
resultsD['present'] = overlapD

Then we run all the combinations of other factors

In [None]:
n_species = len(species_list)

# One grid layer per species. The array is allocated once and reset to NaN for
# every combination, instead of allocating a fresh array on each iteration.
future = np.empty([n_species, 930, 2160], dtype=float)

# for every combination of factors
for model in models:
  for ssp in scenarios:
    for timepoint in time_periods:
      combo = model + "_" + ssp + "_" + timepoint

      # Reset all layers to NaN so that species without a file for this
      # combination are not compared using stale or uninitialised values.
      future.fill(np.nan)

      # for every possible species
      for i, species in enumerate(species_list):
          # build the path to the file
          # NOTE(review): the file name has a double underscore before the sdm
          # suffix, unlike the present-day file - confirm this matches the
          # files on disk.
          path = file_path + "SDM_results/" + species + "/" + combo + "__" + sdm + ".npy"
          # skip species without a result for this combination
          if not os.path.exists(path):
            continue
          # load the data
          future_matrix = np.load(path)
          # count the total number of squares which are habitable by this
          # species; setdefault avoids a KeyError for a species which has a
          # future result but no present-day entry in totD
          totD.setdefault(species, dict())[combo] = np.nansum(future_matrix)
          # store the grid in its layer
          future[i, :, :] = future_matrix

      # compare against 1 once for the whole stack (NaN == 1 is False)
      future_occupied = future == 1

      overlapD = dict()
      # for every pair of species
      for pair in itertools.combinations(range(n_species), 2):
        # count the squares which are habitable by both species
        overlapD[pair] = np.count_nonzero(future_occupied[pair[0]] & future_occupied[pair[1]])
      resultsD[combo] = overlapD

This generates all the results but not in a very usable form, so we transform them into a pandas dataframe.

In [None]:
# store all the combination names in a list ('present' first, then every
# model / ssp / time period combination in loop order)
combos = ['present'] + [model + "_" + ssp + "_" + timepoint
                        for model in models
                        for ssp in scenarios
                        for timepoint in time_periods]

# generate column names for the dataframe: three columns per combination
# NOTE(review): 'speces2_...' is misspelt, but the name is written to
# overlaps.tsv and may be read elsewhere, so it is deliberately left unchanged.
cols = ['species1', 'species2']
for combo in combos:
  cols.append('species1_total_habitable_squares_' + combo)
  cols.append('speces2_total_habitable_squares_' + combo)
  cols.append('total_shared_habitable_squares_' + combo)

nan = float('nan')
results = []

# for every ordered pair of different species (for which a result exists)
for i, species1 in enumerate(species_list):
  if species1 not in totD:
    continue
  for j, species2 in enumerate(species_list):
    if species2 not in totD or species1 == species2:
      continue

    # start a list to store the results of the analysis
    thismodel = [species1, species2]

    # for every combination of factors (model, ssp, time period)
    for combo in combos:
      overlaps = resultsD[combo]

      # each pair is stored only once, so the indices may be the other way
      # around
      if (i, j) in overlaps:
        shared = overlaps[(i, j)]
      elif (j, i) in overlaps:
        shared = overlaps[(j, i)]
      else:
        shared = None

      if shared is None:
        # no result for this pair and combination
        thismodel.extend([nan, nan, nan])
        continue

      # Total number of occupied squares for each species. A species can have
      # a present-day result but no file for this combination; record NaN
      # instead of raising a KeyError.
      total1 = totD[species1].get(combo, nan)
      total2 = totD[species2].get(combo, nan)
      if combo not in totD[species1] or combo not in totD[species2]:
        # the overlap is unknown if either distribution is missing
        shared = nan

      thismodel.append(total1)
      thismodel.append(total2)
      thismodel.append(shared)
    results.append(thismodel)

# convert everything into a pandas dataframe (passing the column names here
# also works when there are no results at all)
r = pd.DataFrame(results, columns=cols)

Now we can save the results.

In [None]:
# Write the overlap table as a tab-separated file without the row index.
# index=False is the documented boolean form of the previous index=None.
r.to_csv(file_path + "overlaps.tsv", sep="\t", index=False)

As well as the total number of squares which are habitable by both species, it is useful to know, for each species, the proportion of its habitable range which is also habitable by the other species.

For example, it could be that one bee species currently shares 1% of its total range with an ant species, but is predicted to share 20% under a climate change scenario.  This is important in terms of virus transmission - if a virus is present in the ant it would have a much larger impact if the percentage overlap increases.

To do this, we just need to (for every model / ssp / timepoint combination) divide the number of squares the species shares with the other species by the total number of squares it occupies.

In [None]:
# Names of the proportion columns added to r below.
propcols = []
# for every combination of factors
for combo in combos:
  propcol = 'proportion_species1_habitat_shared_' + combo
  # divide the number of shared squares by the total number of squares
  # habitable by species1
  r[propcol] = r['total_shared_habitable_squares_' + combo] / r['species1_total_habitable_squares_' + combo]
  # record the name of the column that was actually created (previously a
  # name without "shared_" was stored, which is not a column of r)
  propcols.append(propcol)

For each species, we can draw a bar chart showing the percentage of squares that the species can inhabit which it shares with each other species.


In [None]:
# One colour for the present, then one colour per block of four time periods.
colours = ['#0C4C8A'] +  ['#FF847C'] * 4 + ['#FECEA8'] * 4 + ['#99B898'] * 4 + ['#2A363B'] * 4

# number of panels per row of the figure
n_cols = 6

# make sure the output folder exists before saving into it
os.makedirs(file_path + "/overlap_barcharts/", exist_ok=True)

# extract the time periods from the combination names (x axis labels)
times = [c.split("_")[-1] for c in combos]
xpos = np.arange(len(combos))

# for each possible species
for species1 in species_list:
  out_path = file_path + "/overlap_barcharts/" + species1 + ".png"

  # take the rows from the table where this species is species1
  thistab = r[r['species1'] == species1]

  # collect one (species2, percentages) entry per panel to draw
  panels = []
  for species2 in species_list:
    # take the row where the other species is species2
    subtab = thistab[thistab['species2'] == species2]
    # skip if there is no data for this species combination
    if len(subtab) == 0:
      continue
    row = subtab.iloc[0]
    # extract the percentage for every model / ssp / timepoint combination
    props = [row['proportion_species1_habitat_shared_' + combo] * 100 for combo in combos]
    # Keep the panel only if there is any overlap. nansum ignores NaN, so a
    # row that is all NaN or zero is skipped instead of producing NaN axis
    # limits further down.
    if np.nansum(props) != 0:
      panels.append((species2, props))

  if not panels:
    # nothing to draw - make sure no chart is left over for this species
    if os.path.exists(out_path):
      os.unlink(out_path)
    continue

  # Use at least the original 10 rows, and more when there are over 60 panels
  # (a fixed 10 x 6 grid raises an error for panel 61 onwards).
  n_rows = max(10, int(np.ceil(len(panels) / n_cols)))
  f = plt.figure(figsize=(30, 3 * n_rows))

  for i, (species2, props) in enumerate(panels, start=1):
    ax = f.add_subplot(n_rows, n_cols, i)

    # add the bars
    ax.bar(xpos, props, width=0.6, color=colours)

    # add the title
    ax.set_title(species2.replace("_", " "))

    # set axis limits, leaving headroom above the bars for the labels
    ax.set_xlim(-0.5, len(props) - 0.5)
    maxy = np.nanmax(props)
    ax.set_ylim(0, maxy * 1.3)

    # add a line at the level of the present overlap
    ax.hlines(props[0], -0.5, len(props) + 0.5, lw=0.5, ls='dotted', color='red')

    # add short vertical lines between the ssps
    ax.vlines(np.arange(0.5, len(props) + 0.5, 4), 0, maxy, lw=0.5, ls='dotted')

    # add taller vertical lines between the models
    ax.vlines(np.arange(0.5, len(props) + 0.5, 8), 0, maxy * 1.15, lw=0.5, ls='dotted')

    # add the time periods to the x axis
    ax.set_xticks(xpos)
    ax.set_xticklabels(times, rotation='vertical')

    # label the axes
    ax.set_xlabel("Time Period")
    ax.set_ylabel('% ' + species1.replace("_", " ") + " \nrange habitable")

    # label the models and ssps (positions assume 2 models x 2 ssps x 4 time
    # periods after the single present-day bar)
    ax.text(4.5, (maxy * 1.19), 'BCC-CSM2-MR', ha='center', fontsize=8)
    ax.text(12.5, (maxy * 1.19), 'CanESM5', ha='center', fontsize=8)
    ax.text(2.5, (maxy * 1.07), "ssp126", ha='center', fontsize=8)
    ax.text(6.5, (maxy * 1.07), "ssp585", ha='center', fontsize=8)
    ax.text(10.5, (maxy * 1.07), "ssp126", ha='center', fontsize=8)
    ax.text(14.5, (maxy * 1.07), "ssp585", ha='center', fontsize=8)

  f.tight_layout(pad=3)
  f.suptitle(species1.replace("_", " "), y=1.01, fontsize=10)
  f.savefig(out_path, dpi=300, bbox_inches='tight')
  # close the figure so that memory does not grow with every species
  plt.close(f)

We can do the same thing for the total number of squares shared between each pair of species (rather than as a percentage).

This code is the same as above except using the total columns instead of the proportion columns.

In [None]:
# Bar charts of the absolute number of squares shared by each pair of species.
# Uses `colours` defined in the percentage bar chart cell.

# number of panels per row of the figure
n_cols = 6

# make sure the output folder exists before saving into it
os.makedirs(file_path + "/overlap_barcharts/", exist_ok=True)

# extract the time periods from the combination names (x axis labels)
times = [c.split("_")[-1] for c in combos]
xpos = np.arange(len(combos))

# for each possible species
for species1 in species_list:
  out_path = file_path + "/overlap_barcharts/" + species1 + "_total.png"

  # take the rows from the table where this species is species1
  thistab = r[r['species1'] == species1]

  # collect one (species2, shared squares) entry per panel to draw
  panels = []
  for species2 in species_list:
    subtab = thistab[thistab['species2'] == species2]
    # skip if there is no data for this species combination
    if len(subtab) == 0:
      continue
    row = subtab.iloc[0]
    # extract the shared total for every model / ssp / timepoint combination
    props = [row['total_shared_habitable_squares_' + combo] for combo in combos]
    # Keep the panel only if there is any overlap. nansum ignores NaN, so a
    # row that is all NaN or zero is skipped instead of producing NaN axis
    # limits further down.
    if np.nansum(props) != 0:
      panels.append((species2, props))

  if not panels:
    # nothing to draw - make sure no chart is left over for this species
    if os.path.exists(out_path):
      os.unlink(out_path)
    continue

  # Use at least the original 10 rows, and more when there are over 60 panels
  # (a fixed 10 x 6 grid raises an error for panel 61 onwards).
  n_rows = max(10, int(np.ceil(len(panels) / n_cols)))
  f = plt.figure(figsize=(30, 3 * n_rows))

  for i, (species2, props) in enumerate(panels, start=1):
    ax = f.add_subplot(n_rows, n_cols, i)

    # add the bars
    ax.bar(xpos, props, width=0.6, color=colours)

    # add the title
    ax.set_title(species2.replace("_", " "))

    # set axis limits, leaving headroom above the bars for the labels
    ax.set_xlim(-0.5, len(props) - 0.5)
    maxy = np.nanmax(props)
    ax.set_ylim(0, maxy * 1.3)

    # add a line at the level of the present overlap
    ax.hlines(props[0], -0.5, len(props) + 0.5, lw=0.5, ls='dotted', color='red')

    # add short vertical lines between the ssps
    ax.vlines(np.arange(0.5, len(props) + 0.5, 4), 0, maxy, lw=0.5, ls='dotted')

    # add taller vertical lines between the models
    ax.vlines(np.arange(0.5, len(props) + 0.5, 8), 0, maxy * 1.15, lw=0.5, ls='dotted')

    # add the time periods to the x axis
    ax.set_xticks(xpos)
    ax.set_xticklabels(times, rotation='vertical')

    # label the axes; these charts show absolute counts, so the y label no
    # longer claims to be a percentage
    ax.set_xlabel("Time Period")
    ax.set_ylabel("Squares shared with\n" + species1.replace("_", " "))

    # label the models and ssps (positions assume 2 models x 2 ssps x 4 time
    # periods after the single present-day bar)
    ax.text(4.5, (maxy * 1.17), 'BCC-CSM2-MR', ha='center', fontsize=8)
    ax.text(12.5, (maxy * 1.17), 'CanESM5', ha='center', fontsize=8)
    ax.text(2.5, (maxy * 1.07), "ssp126", ha='center', fontsize=8)
    ax.text(6.5, (maxy * 1.07), "ssp585", ha='center', fontsize=8)
    ax.text(10.5, (maxy * 1.07), "ssp126", ha='center', fontsize=8)
    ax.text(14.5, (maxy * 1.07), "ssp585", ha='center', fontsize=8)

  f.tight_layout(pad=3)
  f.suptitle(species1.replace("_", " "), y=1.01, fontsize=10)
  f.savefig(out_path, dpi=300, bbox_inches='tight')
  # close the figure so that memory does not grow with every species
  plt.close(f)