Extract and Transform Protected Species Data

In [2]:
import pandas as pd

In [3]:
# read in data table 
file_protected = "data/animal_impact.csv"
protected = pd.read_csv(file_protected)
protected

Unnamed: 0,Scientific Name\n,Common Name,Percentage of the species modelled likely and known distribution within fire affected areas,Type,EPBC Act listed Threatened Status,EPBC Act listed Migratory Status,Range states and territories
0,Acacia awestoniana,Stirling Range Wattle,?80%,Plant,Vulnerable,,WA
1,Acacia constablei,Narrabarba Wattle,?80%,Plant,Vulnerable,,NSW
2,Andersonia axilliflora,Giant Andersonia,?80%,Plant,Endangered,,WA
3,Baeckea kandos,a shrub,?80%,Plant,Endangered,,NSW
4,Bertmainius colonus,Eastern Stirling Range Pygmy Trapdoor Spider,?80%,Spider,Vulnerable,,WA
...,...,...,...,...,...,...,...
326,Thesium australe,"Austral Toadflax, Toadflax",10 to <30%,Plant,Vulnerable,,"ACT, NSW, Qld, Tas, Vic"
327,Westringia rupicola,,10 to <30%,Plant,Vulnerable,,"NSW, Qld"
328,Wollumbinia belli,"Bell's Turtle, Western Sawshelled Turtle, Namo...",10 to <30%,Reptile,Vulnerable,,"NSW, Qld"
329,Xerochrysum palustre,"Swamp Everlasting, Swamp Paper Daisy",10 to <30%,Plant,Vulnerable,,"NSW, SA, Tas, Vic"


In [4]:
# rename columns for simplicity
protected.columns # to print out column names for use in creating column dictionary
column_names = {'Scientific Name\n':'scientific',
            'Common Name' : 'common',
            'Percentage of the species modelled likely and known distribution within fire affected areas' : 'distribution',
            'Type':'type',
            'EPBC Act listed Threatened Status':'status',
            'EPBC Act listed Migratory Status':'migratory',
            'Range states and territories':'states'}
protected.rename(columns = column_names, inplace = True)
protected


Unnamed: 0,scientific,common,distribution,type,status,migratory,states
0,Acacia awestoniana,Stirling Range Wattle,?80%,Plant,Vulnerable,,WA
1,Acacia constablei,Narrabarba Wattle,?80%,Plant,Vulnerable,,NSW
2,Andersonia axilliflora,Giant Andersonia,?80%,Plant,Endangered,,WA
3,Baeckea kandos,a shrub,?80%,Plant,Endangered,,NSW
4,Bertmainius colonus,Eastern Stirling Range Pygmy Trapdoor Spider,?80%,Spider,Vulnerable,,WA
...,...,...,...,...,...,...,...
326,Thesium australe,"Austral Toadflax, Toadflax",10 to <30%,Plant,Vulnerable,,"ACT, NSW, Qld, Tas, Vic"
327,Westringia rupicola,,10 to <30%,Plant,Vulnerable,,"NSW, Qld"
328,Wollumbinia belli,"Bell's Turtle, Western Sawshelled Turtle, Namo...",10 to <30%,Reptile,Vulnerable,,"NSW, Qld"
329,Xerochrysum palustre,"Swamp Everlasting, Swamp Paper Daisy",10 to <30%,Plant,Vulnerable,,"NSW, SA, Tas, Vic"


In [5]:
# Explore the data
protected.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331 entries, 0 to 330
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   scientific    331 non-null    object
 1   common        271 non-null    object
 2   distribution  331 non-null    object
 3   type          331 non-null    object
 4   status        327 non-null    object
 5   migratory     5 non-null      object
 6   states        331 non-null    object
dtypes: object(7)
memory usage: 18.2+ KB


In [6]:
protected.type.unique()

array(['Plant', 'Spider', 'Mammal', 'Bird', 'Reptile', 'Frog', 'Fish',
       'Insect'], dtype=object)

In [7]:
protected.groupby(["type"]).count()

Unnamed: 0_level_0,scientific,common,distribution,status,migratory,states
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bird,13,13,13,9,5,13
Fish,4,4,4,4,0,4
Frog,14,14,14,14,0,14
Insect,4,4,4,4,0,4
Mammal,16,16,16,16,0,16
Plant,272,212,272,272,0,272
Reptile,7,7,7,7,0,7
Spider,1,1,1,1,0,1


In [8]:
# drop the plant data
protected_animals = protected[protected["type"]!="Plant"]
protected_animals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59 entries, 4 to 328
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   scientific    59 non-null     object
 1   common        59 non-null     object
 2   distribution  59 non-null     object
 3   type          59 non-null     object
 4   status        55 non-null     object
 5   migratory     5 non-null      object
 6   states        59 non-null     object
dtypes: object(7)
memory usage: 3.7+ KB


In [23]:
protected_animals.groupby(["type", "status"])["common"].count()

type     status               
Bird     Critically Endangered    3
         Endangered               4
         Vulnerable               2
Fish     Endangered               3
         Vulnerable               1
Frog     Critically Endangered    3
         Endangered               4
         Vulnerable               7
Insect   Critically Endangered    1
         Endangered               2
         Vulnerable               1
Mammal   Endangered               8
         Vulnerable               8
Reptile  Critically Endangered    2
         Endangered               2
         Vulnerable               3
Spider   Vulnerable               1
Name: common, dtype: int64

The data is not so useful in this format, because there is not each status for each type.
Instead, count by status and combine tables together again.

In [29]:
# count vulnerable animals by type
vulnerable = protected_animals[protected_animals["status"]=="Vulnerable"].groupby("type")["common"].count()
vulnerable

type
Bird       2
Fish       1
Frog       7
Insect     1
Mammal     8
Reptile    3
Spider     1
Name: common, dtype: int64

In [30]:
# repeat for Endangered & Critically Endangered
endangered = protected_animals[protected_animals["status"]=="Endangered"].groupby("type")["common"].count()
critical = protected_animals[protected_animals["status"]=="Critically Endangered"].groupby("type")["common"].count()


In [42]:
# join tables together again
status = pd.concat([vulnerable, endangered, critical], axis = 1)
status.columns = ["vulnerable", "endangered", "critical"]
status["total"] = status.vulnerable + status.endangered + status.critical
status

Unnamed: 0,vulnerable,endangered,critical,total
Bird,2,4.0,3.0,9.0
Fish,1,3.0,,
Frog,7,4.0,3.0,14.0
Insect,1,2.0,1.0,4.0
Mammal,8,8.0,,
Reptile,3,2.0,2.0,7.0
Spider,1,,,


In [43]:
status.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Bird to Spider
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   vulnerable  7 non-null      int64  
 1   endangered  6 non-null      float64
 2   critical    4 non-null      float64
 3   total       4 non-null      float64
dtypes: float64(3), int64(1)
memory usage: 280.0+ bytes


In [47]:
# replace NaN with 0
status.fillna(0).astype(int)


Unnamed: 0,vulnerable,endangered,critical,total
Bird,2,4,3,9
Fish,1,3,0,0
Frog,7,4,3,14
Insect,1,2,1,4
Mammal,8,8,0,0
Reptile,3,2,2,7
Spider,1,0,0,0
