In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import re
from itertools import chain

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the species information file into a DataFrame and preview its first rows
species = pd.read_csv('species_info.csv')
species.head()

In [None]:
# Read the observations file into a DataFrame and preview its first rows
observations = pd.read_csv('observations.csv')
observations.head()

### Data Characteristics and Cleaning

---
#### Species File


In [None]:
# Report the size of the species dataset as a (rows, columns) tuple
species_shape = species.shape
print(f'Species (rows, columns) = {species_shape}')

In [None]:
# Count the distinct scientific names present in the species dataset
n_species = species['scientific_name'].nunique()
print(f'Number of Species: {n_species}')

In [None]:
# Per-column count of missing values in the species dataset
species_missing = species.isna().sum()
print(f'Species file missing Values: \n{species_missing} \n')
# Number of fully duplicated rows in the species dataset
species_duplicates = species.duplicated().sum()
print(f'Species file duplicated Values: \n{species_duplicates}')

In [None]:
# List the distinct conservation_status values and how many there are
# (nunique ignores missing values, unique includes them)
status_column = species['conservation_status']
print(f'Conservation Status ({status_column.nunique()}) : {status_column.unique()} \n')

In [None]:
# Fill missing values in the conservation_status column with "No Intervention".
# Only this column is targeted: a frame-wide fillna would also write the status
# label into any other column that happens to contain a missing value.
species['conservation_status'] = species['conservation_status'].fillna('No Intervention')

In [None]:
# List the distinct category values and how many there are
category_column = species['category']
print(f'Categories ({category_column.nunique()}) : {category_column.unique()} \n')

##### Cleaning rows that share the same scientific_name

In [None]:
# Split the comma-separated common_names so that each individual name gets its
# own row (all other columns are repeated). str.split + explode replaces the
# row-by-row iterrows loop.
species = (
    species
    .assign(common_names = species['common_names'].str.split(','))
    .explode('common_names')
)

# Strip surrounding whitespace: "A, B" must yield "A" and "B", not "A" and " B".
# Otherwise the same name with and without a leading space is not recognised
# as a duplicate, and re-joining the names with ', ' produces double spaces.
species['common_names'] = species['common_names'].str.strip()

# Number of fully duplicated rows after the split
species.duplicated().sum()

In [None]:
# Drop the duplicated rows, then collapse the rows back to one per
# (category, scientific_name, conservation_status), joining the remaining
# common names with ', '. sort=False keeps the groups in first-seen order.
species = (
    species
    .drop_duplicates()
    .groupby(['category', 'scientific_name', 'conservation_status'], as_index = False, sort = False)['common_names']
    .agg(', '.join)
    .loc[:, ['category', 'scientific_name', 'common_names', 'conservation_status']]
)
species.head()

---
#### Observations file


In [None]:
# Report the size of the observations dataset as a (rows, columns) tuple
observations_shape = observations.shape
print(f'Observations (rows, columns) = {observations_shape}')

In [None]:
# Sum the observations column to get the total number of observations
total_observations = observations['observations'].sum()
print(f'Number of Observations: {total_observations}')

In [None]:
# Per-column count of missing values in the observations dataset
observations_missing = observations.isna().sum()
print(f'Observations file missing Values: \n{observations_missing} \n')
# Number of fully duplicated rows in the observations dataset
observations_duplicates = observations.duplicated().sum()
print(f'Observations file duplicated Values: \n{observations_duplicates}')

In [None]:
# Keep only the first occurrence of each fully duplicated observations row
observations = observations.drop_duplicates()

In [None]:
# List the distinct park_name values and how many there are
park_column = observations['park_name']
print(f'Parks ({park_column.nunique()}) : {park_column.unique()} \n')

### Analysis

In [None]:
# Number of rows of the species table that fall in each category
species_by_category = species.groupby('category')
categories_distribution = species_by_category.size()
categories_distribution

In [None]:
# Number of distinct scientific names under each conservation status
conservation_distribution = species.groupby('conservation_status')['scientific_name'].nunique()
# Display the result; without this the cell computes the table but shows nothing
conservation_distribution

In [None]:
# Total of the observations column for each park
observations_per_park = observations.groupby('park_name')['observations']
total_observations_by_park = observations_per_park.sum()
total_observations_by_park

In [None]:
# Number of distinct scientific names recorded in each park
names_per_park = observations.groupby('park_name')['scientific_name']
species_by_park = names_per_park.nunique()
species_by_park

In [None]:
# Keep only the species whose conservation status is something other than
# 'No Intervention'
has_status = species['conservation_status'].ne('No Intervention')
threatened_data = species[has_status]

# Count distinct scientific names per (category, conservation_status) pair and
# pivot the statuses into columns; combinations that never occur become NaN
status_counts = threatened_data.groupby(['category', 'conservation_status'])['scientific_name'].nunique()
conservation_by_category = status_counts.unstack()

conservation_by_category

In [None]:
# Chi-square test of independence between category and conservation status.
# Missing combinations are treated as a count of zero.
status_contingency = conservation_by_category.fillna(0)
chi2_cat, p_val, dof_cat, expected_cat = chi2_contingency(status_contingency)
print(f'Conservation status by Category p-value: {p_val:.2e}')

In [None]:
# One colour per conservation-status column, so the palette follows the data
# instead of a hard-coded count
palette = sns.color_palette('deep', n_colors = len(conservation_by_category.columns))

# DataFrame.plot opens its own figure unless it is handed an Axes. Creating the
# axes first and passing it in makes figsize apply to the chart itself rather
# than to a separate, empty figure.
fig, ax = plt.subplots(figsize = (16, 10))
conservation_by_category.plot(kind = 'bar', stacked = True, color = palette, ax = ax)
ax.set_title('Conservation Status by Category - Number of Species')
ax.set_xlabel('Category')
ax.set_ylabel('Number of Species')
ax.tick_params(axis = 'x', rotation = 45)
ax.legend(title = 'Conservation Status', bbox_to_anchor = (1.5, 0.5))

plt.show()

In [None]:
# Attach category and conservation status to every observation of a species
# that has a conservation status (inner join on scientific_name)
threatened_merged_data = pd.merge(observations, threatened_data, on = 'scientific_name')

# Sum observations per (category, conservation_status) and pivot the statuses
# into columns
threatened_totals = threatened_merged_data.groupby(['category', 'conservation_status'])['observations'].sum()
obs_by_cat_conservation = threatened_totals.unstack()

obs_by_cat_conservation

In [None]:
# Heatmap of total observations per category (rows) and conservation status
# (columns). The explicit Axes keeps the drawing independent of pyplot's
# "current figure" state.
fig, ax = plt.subplots(figsize = (8, 6))
sns.heatmap(obs_by_cat_conservation, annot = True, fmt = '.0f', linewidth = .8, cmap = 'Reds', ax = ax)

ax.set_title('Observations by Conservation Status and Category', pad = 20)
ax.set_xlabel('Conservation Status', labelpad = 15)
ax.set_ylabel('Category', labelpad = 15)

# Render the figure explicitly, as the other plotting cells do, so the cell
# does not end on a stray Text(...) repr
plt.show()

In [None]:
# Join species information onto the observations (inner join on
# scientific_name), ordered from the most to the least observed row
merged_data = (
    pd.merge(species, observations, on = 'scientific_name')
    .sort_values(by = 'observations', ascending = False)
    .reset_index(drop = True)
)
merged_data.head()

In [None]:
# Sum observations per (category, park_name) and pivot the parks into columns
category_park_totals = merged_data.groupby(['category', 'park_name'])['observations'].sum()
obs_cat_park = category_park_totals.unstack()
obs_cat_park

In [None]:
# Heatmap of total observations per category (rows) and park (columns). The
# explicit Axes keeps the drawing independent of pyplot's "current figure"
# state.
fig, ax = plt.subplots(figsize = (8, 6))
sns.heatmap(obs_cat_park, annot = True, fmt = '.0f', linewidth = .8, cmap = 'Greens', ax = ax)

ax.set_title('Observations by Category and Park', pad = 20)
ax.set_xlabel('Park', labelpad = 10)
ax.tick_params(axis = 'x', rotation = 45)
ax.set_ylabel('Category', labelpad = 15)

# Render the figure explicitly, as the other plotting cells do, so the cell
# does not end on a stray Text(...) repr
plt.show()

#### Conservation Status

In [None]:
# Derive a two-level flag from conservation_status: species marked
# 'No Intervention' are 'Unprotected', every other status is 'Protected'
no_intervention = species['conservation_status'].eq('No Intervention')
species['protection_status'] = np.where(no_intervention, 'Unprotected', 'Protected')

species.head()

In [None]:
# Distinct scientific names per category, split into Protected / Unprotected.
# - fill_value=0 keeps the table numeric when a category has no species in one
#   of the two groups (a NaN here would break the chi-square test downstream).
# - reindex pins both columns and their order by name rather than relying on
#   the positional order unstack happens to produce.
conservation = (
    species.groupby(['category', 'protection_status'])['scientific_name']
    .nunique()
    .unstack(fill_value = 0)
    .reindex(columns = ['Protected', 'Unprotected'], fill_value = 0)
    .rename_axis(index = 'Category', columns = None)
    .reset_index()
)

conservation

In [None]:
# Chi-square test of independence between category and protection status,
# using the Protected / Unprotected counts as the contingency table
data = conservation[['Protected', 'Unprotected']].to_numpy()

chi2_cat, pval, dof_cat, expected_cat = chi2_contingency(data)

print(f'p value: {pval}')


In [None]:
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

# One panel per protection status: (column to plot, bar colour).
# Left panel is 'Protected', right panel is 'Unprotected'.
panels = [
    ('Protected', '#6a994e'),
    ('Unprotected', '#c1121f'),
]

for panel_ax, (status, bar_color) in zip(ax, panels):
    panel_ax.bar(conservation['Category'], conservation[status], color = bar_color)
    panel_ax.set_title(f'{status} Species by Category')
    panel_ax.set_xlabel('Category')
    panel_ax.set_ylabel('Number of Species')
    panel_ax.tick_params(axis = 'x', rotation = 45)

plt.tight_layout()
plt.show()


### Study of species in the bird category


In [None]:
def remove_punctuation(text):
    """Return ``text`` without any character that is neither a word character nor whitespace.

    Letters, digits, underscores and whitespace are kept; everything else
    (commas, apostrophes, hyphens, parentheses, ...) is deleted.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)


# Restrict the species table to the 'Bird' category, with a fresh 0..n-1 index
is_bird = species['category'] == 'Bird'
bird_species = species[is_bird].reset_index(drop = True)

# For every bird: strip punctuation from its common names and split the result
# on whitespace, giving one list of words per species
bird_name_words = bird_species['common_names'].map(remove_punctuation).str.split()
bird_common_names = bird_name_words.tolist()

In [None]:
# Remove repeated words within each species' word list while keeping first-seen
# order (dict keys are unique and preserve insertion order)
clean_bird_names = [list(dict.fromkeys(words)) for words in bird_common_names]

# Flatten the per-species lists into one list of words. Every element of
# clean_bird_names is a list by construction, so no per-item type check is
# needed before chaining.
bird_names = list(chain.from_iterable(clean_bird_names))

In [None]:
bird_names_series = pd.Series(bird_names)

# Frequency of each word across all bird common names, as a two-column table
names_counted = (
    bird_names_series.value_counts()
    .rename_axis('Name')
    .reset_index(name = 'Count')
)

# Most frequent words first
birds_df = names_counted.sort_values(by = 'Count', ascending = False)
birds_df.head()

In [None]:
# Flag the bird species whose common names contain 'Warbler' (case-sensitive)
mentions_warbler = bird_species['common_names'].str.contains('Warbler')
bird_species['is_warbler'] = mentions_warbler

In [None]:
# Keep only the rows flagged as warblers
warbler_birds = bird_species.loc[bird_species['is_warbler']]


In [None]:
# Attach the park observations to each warbler species (inner join on
# scientific_name), ordered from the most to the least observed row
warbler_with_obs = pd.merge(warbler_birds, observations, on = 'scientific_name')
warbler_observations = (
    warbler_with_obs
    .sort_values(by = 'observations', ascending = False)
    .reset_index(drop = True)
)

warbler_observations.head()

In [None]:
# Total warbler observations for each (park_name, protection_status) pair,
# returned as a flat table with one row per pair
warbler_groups = warbler_observations.groupby(['park_name', 'protection_status'])
warbler_obs_by_park = warbler_groups['observations'].sum().reset_index()
warbler_obs_by_park

In [None]:
# Fixed colour for each protection status
colors = {'Protected': '#38b000', 'Unprotected': '#d00000'}

# Grouped bars: one group per park, one bar per protection status
fig, ax = plt.subplots(figsize = (8, 6))
sns.barplot(
    data = warbler_obs_by_park,
    x = 'park_name',
    y = 'observations',
    hue = 'protection_status',
    palette = colors,
    ax = ax,
)

ax.set_title('Observation of Warblers by park')
ax.legend(title = 'Protection Status')
ax.set_xlabel('Park', labelpad = 10)
ax.tick_params(axis = 'x', rotation = 45)
ax.set_ylabel('Observations', labelpad = 15)

plt.show()

### Conclusions
- What is the distribution of conservation status for species?
    The vast majority of species are not part of any conservation program: 5,350 species have no intervention, versus 191 species with a protection status.
- Are certain types of species more likely to be endangered?
    Birds and Mammals had the highest numbers of protected species, with Birds showing the highest percentage of species under protection relative to their category size.
- Are the differences between species and their conservation status significant?
    Yes, statistical testing revealed highly significant relationships (p-value: 3.88e-05 for conservation status by category and p-value: 5.51e-89 for protection status by taxonomic category).
- Which species group is most prevalent and what is their distribution amongst parks?
    Vascular Plants are the most prevalent group, comprising 4,262 species (77% of total diversity). Yellowstone National Park showed the highest observation numbers across all categories, with 1.44 million observations.
