## Data Cleaning for Biodiversity in National Parks

In [175]:
#libraries
import pandas as pd

#lift the row limit so printed frames and series are shown in full instead of being truncated
pd.options.display.max_rows = None

In [176]:
#read each csv, then preview its first rows and its (rows, columns) shape to get a feel for the data
observe = pd.read_csv('observations.csv')
print(observe.head(), observe.shape, sep='\n')
species = pd.read_csv('species_info.csv')
print(species.head(), species.shape, sep='\n')

            scientific_name                            park_name  observations
0        Vicia benghalensis  Great Smoky Mountains National Park            68
1            Neovison vison  Great Smoky Mountains National Park            77
2         Prunus subcordata               Yosemite National Park           138
3      Abutilon theophrasti                  Bryce National Park            84
4  Githopsis specularioides  Great Smoky Mountains National Park            85
(23296, 3)
  category                scientific_name  \
0   Mammal  Clethrionomys gapperi gapperi   
1   Mammal                      Bos bison   
2   Mammal                     Bos taurus   
3   Mammal                     Ovis aries   
4   Mammal                 Cervus elaphus   

                                        common_names conservation_status  
0                           Gapper's Red-Backed Vole                 NaN  
1                              American Bison, Bison                 NaN  
2  Aurochs, Aurochs

Check for duplicates in each dataframe prior to merging. Also check for duplicates in the scientific name of the species dataframe, separately from the conservation status and common names.

In [177]:
#list every observation row that occurs more than once, together with how many times it occurs
print(observe.loc[observe.duplicated(keep=False)].value_counts())
#keep the first copy of each repeated row and discard the rest
observe.drop_duplicates(keep='first', inplace=True)
#count the missing-value pattern across the columns
print(observe.isna().value_counts())
#strip the ' National Park' text from park_name so only the short park name remains
observe['park_name'] = observe['park_name'].replace(to_replace=' National Park', value='', regex=True)

scientific_name         park_name                            observations
Arctium minus           Yosemite National Park               162             2
Botrychium virginianum  Yellowstone National Park            232             2
Cichorium intybus       Yellowstone National Park            266             2
Echinochloa crus-galli  Great Smoky Mountains National Park  62              2
Eleocharis palustris    Great Smoky Mountains National Park  62              2
Hesperis matronalis     Bryce National Park                  124             2
Monotropa hypopithys    Great Smoky Mountains National Park  73              2
Plantago major          Great Smoky Mountains National Park  90              2
Poa compressa           Great Smoky Mountains National Park  80              2
Potentilla norvegica    Yosemite National Park               148             2
Prunella vulgaris       Great Smoky Mountains National Park  75              2
Salix exigua            Yosemite National Park           

Investigate data issues in species csv. Look for duplicates, missing values and conflicting data. 

In [178]:
#check for fully duplicated rows in species
print('Fully duplicated values in species')
print(species.duplicated().value_counts())
#check for species duplicates that vary only by conservation status
print("Species items that differ only by conservation status")
print(species.duplicated(subset=['scientific_name','category', 'common_names']).value_counts())
print(species[species.duplicated(subset=['scientific_name','category', 'common_names'], keep=False)])
#Of each conflicting pair keep the later row: this removes row 560, as Oncorhynchus mykiss are a threatened species,
#and row 3020, as Canis lupus legal status currently lists them as Endangered.
#NOTE: keep='last' relies on the preferred status being the later row of each pair, as in the listing printed above.
species.drop_duplicates(subset=['scientific_name','category', 'common_names'], keep='last', inplace=True)
print(species.duplicated(subset=['scientific_name','category', 'common_names']).value_counts())
#fill species['conservation_status'] values that are missing with 'No Status'
#(this has to happen before the groupby below, which would otherwise drop rows whose status key is NaN)
species.fillna(value={'conservation_status': 'No Status'}, inplace=True)
print(species.conservation_status.value_counts())
#check for species duplicates that vary only by common name
print("Species items that differ only by common names")
print(species.duplicated(subset=['scientific_name','category', 'conservation_status']).value_counts())
#show the common names of every repeated scientific name to allow checking of the aggregation
print(species.loc[species.duplicated(subset=['scientific_name'], keep=False), 'common_names'])
#group then aggregate: one row per (scientific_name, category, conservation_status), common names joined with ', '
species = (
    species
    .groupby(['scientific_name','category', 'conservation_status'])
    .agg({'common_names': ', '.join})
    .reset_index()
)
#check aggregated values with a spot check of common names for Canis lupus (at least one row only had Gray Wolf)
print(species[species['scientific_name']=='Canis lupus'])
#The joined common_names can now contain the same name more than once, so remove the repeats.
#dict.fromkeys keeps the first occurrence of each name in its original order; a set would give an
#arbitrary order that changes between kernel sessions, making the saved data non-reproducible.
species['common_names'] = species['common_names'].apply(
    lambda names: ', '.join(dict.fromkeys(names.split(', ')))
)
print(species[species['scientific_name']=='Canis lupus'])

Fully duplicated values in species
False    5824
dtype: int64
Species items that differ only by conservation status
False    5822
True        2
dtype: int64
     category      scientific_name     common_names conservation_status
560      Fish  Oncorhynchus mykiss    Rainbow Trout                 NaN
3020   Mammal          Canis lupus  Gray Wolf, Wolf         In Recovery
3283     Fish  Oncorhynchus mykiss    Rainbow Trout          Threatened
4448   Mammal          Canis lupus  Gray Wolf, Wolf          Endangered
False    5822
dtype: int64
No Status             5632
Species of Concern     161
Endangered              16
Threatened              10
In Recovery              3
Name: conservation_status, dtype: int64
Species items that differ only by common names
False    5541
True      281
dtype: int64
4                                           Wapiti Or Elk
5                                       White-Tailed Deer
6                                     Feral Hog, Wild Pig
8                  

Since the conservation status values do not seem to include a 'least concern' category, it's likely that some of the NaN values in this list are species that do not have a conservation status at this time. Since there's no way to confirm that, for now we've replaced those with the status 'No Status'. Many of the items without a listing are vascular plants, which are also less likely to be tracked on the conservation status list.

There are also 281 items that are completely duplicated apart from the common name. These are likely not unique species records, so we collapse rows that match on everything except the common names into a single row per species, combining their common names into one comma-separated list with repeated names removed.

The two cleaned dataframes can now be joined on scientific name using an outer merge.

In [179]:
#merge observations with the species info on scientific_name; how='outer' keeps rows found in only one frame
#validate='many_to_one' makes the merge fail loudly if a scientific_name appears more than once in species,
#which would otherwise silently duplicate observation rows
biodiverse = pd.merge(observe, species, how='outer', on='scientific_name', validate='many_to_one')
print(biodiverse.head())

      scientific_name              park_name  observations        category  \
0  Vicia benghalensis  Great Smoky Mountains            68  Vascular Plant   
1  Vicia benghalensis               Yosemite           148  Vascular Plant   
2  Vicia benghalensis            Yellowstone           247  Vascular Plant   
3  Vicia benghalensis                  Bryce           104  Vascular Plant   
4      Neovison vison  Great Smoky Mountains            77          Mammal   

  conservation_status                        common_names  
0           No Status  Purple Vetch, Reddish Tufted Vetch  
1           No Status  Purple Vetch, Reddish Tufted Vetch  
2           No Status  Purple Vetch, Reddish Tufted Vetch  
3           No Status  Purple Vetch, Reddish Tufted Vetch  
4           No Status                       American Mink  


In [180]:
#sanity-check the merged frame: shape, missing values per column, column dtypes, distinct values per column
print(
    biodiverse.shape,
    biodiverse.isna().sum(),
    biodiverse.dtypes,
    biodiverse.nunique(),
    sep='\n',
)

(23281, 6)
scientific_name        0
park_name              0
observations           0
category               0
conservation_status    0
common_names           0
dtype: int64
scientific_name        object
park_name              object
observations            int64
category               object
conservation_status    object
common_names           object
dtype: object
scientific_name        5541
park_name                 4
observations            304
category                  7
conservation_status       5
common_names           5236
dtype: int64


We can now see that we're looking at data from four National Parks where they've observed 5541 unique species. The data has been cleaned and tidied, so now we can save it to a new csv, 'biodiversity_data.csv'

In [181]:
#save the cleaned, merged data; the row index is written as well, as the first (unnamed) column
biodiverse.to_csv(path_or_buf='biodiversity_data.csv')