In [None]:
# Library imports
import numpy as np
import pandas as pd

# Load the three source tables (CO2 emissions, crocodile observations and
# global energy data) from CSV files in the current working directory.
# The files are read in the order listed and bound to their names in one step.
co2_data, crocodiles_data, energy_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        './co2_by_country.csv',
        './crocodile_dataset.csv',
        './global_energy.csv',
    )
)

# Lookup table translating dataset-specific country labels into one shared
# spelling, so that rows from the different datasets can be matched on
# country name.
country_map = {
    # Energy dataset
    'UK': 'United Kingdom',
    'USA': 'United States',
    'Russia': 'Russia',
    'China': 'China',
    'Japan': 'Japan',
    'Brazil': 'Brazil',
    'Germany': 'Germany',
    'India': 'India',
    'Australia': 'Australia',
    'Canada': 'Canada',

    # Crocodile dataset
    'USA (Florida)': 'United States',
    'Congo (DRC)': 'Democratic Republic Of Congo',
    'Congo Basin Countries': 'Republic Of Congo',
    # Mis-decoded (mojibake) spelling; kept so data that contains it verbatim
    # is still normalised.
    "CÃ´te d'Ivoire": 'Ivory Coast',
    # Correctly encoded spelling of the same country, so the mapping also
    # works when the CSV is decoded properly.
    "Côte d'Ivoire": 'Ivory Coast',
    'Iran (historic)': 'Iran',
    'Indonesia (Borneo)': 'Indonesia',
    'Indonesia (Papua)': 'Indonesia',
    'Malaysia (Borneo)': 'Malaysia',
}

# Rewrite the country labels of the crocodile and energy tables using
# country_map. Each table stores its country under a different column name,
# hence the (table, column) pairs.
for _frame, _column in (
    (crocodiles_data, 'Country/Region'),
    (energy_data, 'Country'),
):
    _frame[_column] = _frame[_column].replace(country_map)

# Start the combined table from the crocodile observations, keeping only the
# columns of interest under their output names.
combined = pd.DataFrame({
    # The year is taken as the third '-'-separated field of the observation
    # date string and converted to an integer.
    'Year': (
        crocodiles_data['Date of Observation']
        .astype(str)
        .str.split('-')
        .str[2]
        .astype(int)
    ),
    'Scientific Name': crocodiles_data['Scientific Name'],
    'Country': crocodiles_data['Country/Region'],
    'Conservation Status': crocodiles_data['Conservation Status'],
})

# Add relevant data from the CO2 dataset.
# Extract the calendar year from each CO2 record's 'Date' and store it in a
# new 'Year' column so it can be used as a join key.
co2_data['Year'] = pd.to_datetime(co2_data['Date']).dt.year

# Left join on country and year: every row already in `combined` is kept, and
# the emission figures are attached wherever a matching CO2 record exists.
combined = pd.merge(
    combined,
    co2_data.loc[:, ['Country', 'Year', 'Kilotons of Co2', 'Metric Tons Per Capita']],
    how='left',
    on=['Country', 'Year'],
)


# Add relevant data from the global energy dataset (currently disabled: the merge below is commented out)
#combined = combined.merge(
#    energy_data[['Country', 'Year', 'Total Energy Consumption (TWh)', 'Renewable Energy Share (%)', 'Fossil Fuel Dependency (%)']],
#    on=['Country', 'Year'],
#    how='left'
#)

# Clean the merged table in three steps: discard observations whose
# conservation status is 'Data Deficient', discard rows with any missing
# value, and keep only the first row for each (year, species, country)
# combination.
combined = (
    combined.loc[combined['Conservation Status'].ne('Data Deficient')]
    .dropna()
    .drop_duplicates(subset=['Year', 'Scientific Name', 'Country'])
)

# Write the cleaned table to a CSV file, omitting the index column.
combined.to_csv('combined_2_datasets.csv', index=False)

#print(combined)
