In [3]:
# Pandas is a package containing additional functions to use data frames in Python
import pandas as pd

# These two lines allow the notebook to access the Google Drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# This is the path to the project folder within the Google Drive.
file_path = "/content/drive/My Drive/"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


We can use a variable to control the species name - this means we only have to change it in one place every time we want to run a different table.

---

## Notebook 1

# Data Cleaning

This notebook contains instructions on how to clean a GBIF dataset using Python and Pandas for a single species

The species is defined in the following variable

In [2]:
species_name = 'Athalia_rosae'

Read the input table from GBIF into Python.

In [5]:
mytab = pd.read_csv(file_path + "/species_tables/" + species_name + ".csv", sep="\t")

Make a "summary" table to record information about the data as we go along.

In [6]:
# Create an empty dataframe with these columns and with one row for this species
summary = pd.DataFrame(columns=['nrecords_unfiltered', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom', 'wrong_taxon_rank_count',
                                'no_country_code_count', 'no_latlong_count', 'total_removed_data_cleaning', 'nrecords_clean'],
                       index=[species_name])


In [7]:
summary

Unnamed: 0,nrecords_unfiltered,species,genus,family,order,class,phylum,kingdom,wrong_taxon_rank_count,no_country_code_count,no_latlong_count,total_removed_data_cleaning,nrecords_clean
Athalia_rosae,,,,,,,,,,,,,


Count the number of records for this species in the unfiltered input table and record this information in the summary table and write it to the log file.

In [8]:
nrecords = len(mytab)
summary.loc[species_name, 'nrecords_unfiltered'] = nrecords

In [9]:
summary

Unnamed: 0,nrecords_unfiltered,species,genus,family,order,class,phylum,kingdom,wrong_taxon_rank_count,no_country_code_count,no_latlong_count,total_removed_data_cleaning,nrecords_clean
Athalia_rosae,3463,,,,,,,,,,,,


Count the number of unique values in the “kingdom”, “phylum”, “class”, “order”, “family”, “genus” and “species” columns - there should only be one in each.  Print a warning if there is more than one.


In [10]:
# if there is only one kingdom for all the rows
if mytab['kingdom'].nunique() == 1:
  # print this message
  print("All rows are from the same kingdom")
  # record the first value in the kingdom column (as they are all the same)
  kingdom = mytab['kingdom'].values[0]
# if there is > 1
else:
  # write this message to the log file
  print("Warning - the kingdom column has multiple values" )
  # record the kingdom for this species as "NA"
  kingdom = "NA"
summary.loc[species_name, "kingdom"] = kingdom

# repeat for phylum, class, order, family, genus, species
if mytab['phylum'].nunique() == 1:
  print("All rows are from the same phylum")
  phylum = mytab['kingdom'].values[0]
else:
  print("Warning - the phylum column has multiple values")
  phylum = "NA"
summary.loc[species_name, "phylum"] = phylum

if mytab['class'].nunique() == 1:
  print("All rows are from the same class")
  clas = mytab['class'].values[0]
else:
  print("Warning - the class column has multiple values")
  clas = "NA"
summary.loc[species_name, "class"] = clas

if mytab['order'].nunique() == 1:
   print("All rows are from the same order")
   order = mytab['order'].values[0]
else:
   print("Warning - the order column has multiple values")
   order = "NA"
summary.loc[species_name, "order"] = order

if mytab['family'].nunique() == 1:
  print("All rows are from the same family")
  family = mytab['family'].values[0]
else:
  print("Warning - the family column has multiple values")
  family = "NA"
summary.loc[species_name, "family"] = family

if mytab['genus'].nunique() == 1:
   print("All rows are from the same genus")
   genus = mytab['genus'].values[0]
else:
   print ("Warning - the genus column has multiple values")
   genus = "NA"
summary.loc[species_name, "genus"] = genus

if mytab['species'].nunique() == 1:
  print("All rows are from the same species")
  species = mytab['species'].values[0]
else:
  print("Warning - the species column has multiple values")
  species = "NA"
summary.loc[species_name, "species"] = species


All rows are from the same kingdom
All rows are from the same phylum
All rows are from the same class
All rows are from the same order
All rows are from the same family
All rows are from the same genus
All rows are from the same species


In [22]:
summary

Unnamed: 0,nrecords_unfiltered,species,genus,family,order,class,phylum,kingdom,wrong_taxon_rank_count,no_country_code_count,no_latlong_count,total_removed_data_cleaning,nrecords_clean
Athalia_rosae,3463,Athalia rosae,Athalia,Tenthredinidae,Hymenoptera,Insecta,Animalia,Animalia,294,38,158,490,2973


In the taxonRank column, some records are classified as “SPECIES” and some as SUBSPECIES”.  Subspecies is a more specific classification - some researchers will be able to recognise and record different subspecies and others will not.  For now, we will focus on the “SPECIES” records because there are more of them.

Create and save a smaller table of individuals with “SUBSPECIES” in this column.

Remove these individuals from the main table - filter it to keep only records where taxonRank == “SPECIES”.


In [23]:
filtered_tab_subspecies_only = mytab[mytab["taxonRank"] == "SUBSPECIES"]
filtered_tab_subspecies_only.to_csv(file_path + "/subspecies_tables/Athalia_rosae.csv", sep="\t", index=None)

# count how many rows in the unfiltered table have something other than species in this column
count_wrong_taxon_rank = len(mytab[mytab['taxonRank'] != "SPECIES"])
# record this count in the summary table
summary.loc[species_name, "wrong_taxon_rank_count"] = count_wrong_taxon_rank

# filter out all rows which don't have "SPECIES" in this column
mytab = mytab[mytab["taxonRank"] == "SPECIES"]

In [24]:
summary

Unnamed: 0,nrecords_unfiltered,species,genus,family,order,class,phylum,kingdom,wrong_taxon_rank_count,no_country_code_count,no_latlong_count,total_removed_data_cleaning,nrecords_clean
Athalia_rosae,3463,Athalia rosae,Athalia,Tenthredinidae,Hymenoptera,Insecta,Animalia,Animalia,0,38,158,490,2973



Create and save a smaller table of samples with no value in the “countryCode” column then remove these individuals from the main table.


In [25]:
filtered_table_null = mytab[mytab['countryCode'].isnull()]
filtered_table_null.to_csv(file_path + "/species_tables_null/Athalia_rosae.csv", sep="\t", index=None)

# count how many rows have no country code
count_no_country_code = len(mytab[mytab['countryCode'].isnull()])
summary.loc[species_name, "no_country_code_count"] = count_no_country_code

mytab = mytab[mytab['countryCode'].notnull()]

In [26]:
summary

Unnamed: 0,nrecords_unfiltered,species,genus,family,order,class,phylum,kingdom,wrong_taxon_rank_count,no_country_code_count,no_latlong_count,total_removed_data_cleaning,nrecords_clean
Athalia_rosae,3463,Athalia rosae,Athalia,Tenthredinidae,Hymenoptera,Insecta,Animalia,Animalia,0,0,158,490,2973



Remove rows where latitude or longitude is NA


In [16]:
count_no_latlong = len(mytab[(mytab['decimalLatitude'].isnull()) | (mytab['decimalLongitude'].isnull())])
summary.loc[species_name, 'no_latlong_count'] = count_no_latlong

mytab = mytab[mytab['decimalLatitude'].notnull()]
mytab = mytab[mytab['decimalLongitude'].notnull()]

In [17]:
summary

Unnamed: 0,nrecords_unfiltered,species,genus,family,order,class,phylum,kingdom,wrong_taxon_rank_count,no_country_code_count,no_latlong_count,total_removed_data_cleaning,nrecords_clean
Athalia_rosae,3463,Athalia rosae,Athalia,Tenthredinidae,Hymenoptera,Insecta,Animalia,Animalia,294,38,158,,


We can now count the total number of rows which have been filtered out and add it to the summary table.

In [27]:
total_removed = count_wrong_taxon_rank + count_no_country_code + count_no_latlong
summary.loc[species_name, 'total_removed_data_cleaning'] = total_removed

The last data cleaning step is to add the total number of rows removed in this stage to the summary table.

In [29]:
summary.loc[species_name, "nrecords_clean"] = len(mytab)

In [30]:
summary

Unnamed: 0,nrecords_unfiltered,species,genus,family,order,class,phylum,kingdom,wrong_taxon_rank_count,no_country_code_count,no_latlong_count,total_removed_data_cleaning,nrecords_clean
Athalia_rosae,3463,Athalia rosae,Athalia,Tenthredinidae,Hymenoptera,Insecta,Animalia,Animalia,0,0,158,158,2973


Finally, we save the filtered table and the summary table.

In [31]:
mytab.to_csv(file_path + "/filtered_main_tables/" + species_name + ".csv", sep="\t", index=None)
summary.to_csv(file_path + "/summary_tables/" + species_name + ".csv", sep="\t")