In [4]:
import pandas as pd
import pycountry

In [5]:
# Greenhouse gas emissions subset, read straight from the project's GitHub repository
df_greenhouse = pd.read_csv(
    "https://raw.githubusercontent.com/Nico22724/Proyecto-_Medioambiental/"
    "refs/heads/main/Datasets/Subsets%20created/"
    "climate_change_greenhouse_data.csv"
)

In [6]:
# Historical CO2 subset, read straight from the project's GitHub repository
df_data_history = pd.read_csv(
    "https://raw.githubusercontent.com/Nico22724/Proyecto-_Medioambiental/"
    "refs/heads/main/Datasets/Subsets%20created/"
    "data_historica_co2.csv"
)

In [7]:
# Global CO2 emissions share subset, read straight from the project's GitHub repository
df_global_emissions = pd.read_csv(
    "https://raw.githubusercontent.com/Nico22724/Proyecto-_Medioambiental/"
    "refs/heads/main/Datasets/Subsets%20created/"
    "global_CO2_emissions_share_data.csv"
)

In [8]:
# Main (already cleaned) OWID CO2 file, read straight from the project's GitHub repository
df_co2_data = pd.read_csv(
    "https://raw.githubusercontent.com/Nico22724/Proyecto-_Medioambiental/"
    "refs/heads/main/Datasets/"
    "owid-co2-data_cleaned.csv"
)

In [9]:
# Replace every NaN in the main CO2 frame with 0 (applies to all columns of the frame)
df_co2_data = df_co2_data.fillna(value=0)

In [10]:
# Function to clean data by filling NaN values and converting types
def clean_data(df_clean):
    """Fill missing `gdp` and `population` values with 0 and cast both to int64.

    The frame is modified in place and also returned, so pass a copy when the
    original must stay untouched.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Frame that contains `gdp` and `population` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with both columns converted.

    Raises
    ------
    KeyError
        If `gdp` or `population` is not a column of the frame.
    """
    for column in ("gdp", "population"):
        # A single fillna is enough; NaN has to go before the integer cast.
        # "int64" is spelled out because plain `int` can resolve to a 32-bit
        # type on some platforms (e.g. Windows with NumPy < 2), which is too
        # small for GDP-sized values. The cast truncates any fractional part.
        df_clean[column] = df_clean[column].fillna(0).astype("int64")
    return df_clean

# Run clean_data on a copy of each raw frame so the loaded originals stay untouched
df_global_emissions_clean = df_global_emissions.copy().pipe(clean_data)
df_data_history_clean = df_data_history.copy().pipe(clean_data)
df_greehouse_clean = df_greenhouse.copy().pipe(clean_data)
df_co2_data_clean = df_co2_data.copy().pipe(clean_data)

In [11]:
def clean_and_convert_range(df, start_idx, end_idx):
    """Fill NaN with 0 and cast to float for a positional run of columns.

    Columns at positions `start_idx` through `end_idx` (both inclusive) are
    rewritten in place; the same frame is returned.
    """
    positions = range(start_idx, end_idx + 1)
    # Generator keeps the lookup lazy: each column name is resolved right before it is rewritten
    for column_name in (df.columns[pos] for pos in positions):
        filled = df[column_name].fillna(0)
        df[column_name] = filled.astype(float)
    return df


In [12]:
# First and last column positions of the range to clean in the global emissions dataframe
start_col, end_col = (
    df_global_emissions_clean.columns.get_loc("share_global_cement_co2"),
    df_global_emissions_clean.columns.get_loc("share_of_temperature_change_from_ghg"),
)

# First and last column positions of the range to clean in the greenhouse gas emissions dataframe
start_col_greenhouse, end_col_greenhouse = (
    df_greehouse_clean.columns.get_loc("ghg_excluding_lucf_per_capita"),
    df_greehouse_clean.columns.get_loc("total_ghg_excluding_lucf"),
)

# First and last column positions of the range to clean in the historical CO2 dataframe
star_col_history, end_col_history = (
    df_data_history_clean.columns.get_loc("cumulative_co2"),
    df_data_history_clean.columns.get_loc("cumulative_other_co2"),
)

In [13]:
# Fill NaN with 0 and cast to float across the selected column range of the global emissions dataframe
df_global_emissions_clean = clean_and_convert_range(
    df_global_emissions_clean,
    start_idx=start_col,
    end_idx=end_col,
)

In [14]:
# Fill NaN with 0 and cast to float across the selected column range of the greenhouse gas dataframe
df_greehouse_clean = clean_and_convert_range(
    df_greehouse_clean,
    start_idx=start_col_greenhouse,
    end_idx=end_col_greenhouse,
)

In [15]:
# Fill NaN with 0 and cast to float across the selected column range of the historical CO2 dataframe
df_data_history_clean = clean_and_convert_range(
    df_data_history_clean,
    start_idx=star_col_history,
    end_idx=end_col_history,
)

In [18]:
# Sum of `consumption_co2` for each `country` value, largest total first
(
    df_co2_data_clean
    .groupby('country')['consumption_co2']
    .sum()
    .sort_values(ascending=False)
)

country
World                    1026192.957
High-income countries     537273.128
Non-OECD (GCP)            486510.404
OECD (GCP)                471568.761
Asia                      421283.808
                            ...     
Uzbekistan                     0.000
Vanuatu                        0.000
Vatican                        0.000
Wallis and Futuna              0.000
Yemen                          0.000
Name: consumption_co2, Length: 255, dtype: float64

In [21]:
# Country names accepted by the filter below. pycountry's `name` is the ISO 3166
# spelling, which for some countries is a formal form (e.g. "Bolivia, Plurinational
# State of"). Where pycountry also provides a shorter `common_name`, that spelling is
# accepted too so rows using the everyday name are not discarded.
# NOTE(review): names matching neither form (aggregates, but possibly also real
# countries spelled differently in the dataset) are still dropped - verify the result.
paises_validos = sorted(
    {
        label
        for country in pycountry.countries
        # getattr default covers entries that carry no common name
        for label in (country.name, getattr(country, "common_name", None))
        if label
    }
)


def is_valid_country(df_country_clean):
    """Return only the rows whose `country` value is listed in `paises_validos`.

    Parameters
    ----------
    df_country_clean : pandas.DataFrame
        Frame with a `country` column.

    Returns
    -------
    pandas.DataFrame
        A new frame (independent copy) containing the matching rows; the input
        is left unchanged.
    """
    keep = df_country_clean['country'].isin(paises_validos)
    # Copy so that later column assignments on the result act on an independent
    # frame rather than on a slice of the input (avoids SettingWithCopyWarning).
    return df_country_clean.loc[keep].copy()

In [22]:
df_global_emissions_clean = is_valid_country(df_global_emissions_clean)
df_data_history_clean = is_valid_country(df_data_history_clean)
df_greehouse_clean = is_valid_country(df_greehouse_clean)
df_co2_data_clean = is_valid_country(df_co2_data_clean)  # Apply the function to the CO2 data

In [16]:
#df_global_emissions_clean
#df_co2_data_clean
#df_data_history_clean
#df_greehouse_clean

In [None]:
# Save the cleaned dataframes to CSV files.
# Currently disabled: the to_csv calls below sit inside a triple-quoted string, so nothing is written.
# NOTE(review): the targets are absolute Windows paths; adjust them before re-enabling on another machine.
'''
df_global_emissions_clean.to_csv('C:\\Nicolas\\ProyectoMedioAmbiental\\ProyectoMedioAmbiental\\Proyecto-_Medioambiental\\Datasets\\Datasets Cleaned\\global_emissions_data.csv', index=False)
df_co2_data_clean.to_csv('C:\\Nicolas\\ProyectoMedioAmbiental\\ProyectoMedioAmbiental\\Proyecto-_Medioambiental\\Datasets\\Datasets Cleaned\\co2_data.csv', index=False)
df_data_history_clean.to_csv('C:\\Nicolas\\ProyectoMedioAmbiental\\ProyectoMedioAmbiental\\Proyecto-_Medioambiental\\Datasets\\Datasets Cleaned\\data_history.csv', index=False)
df_greehouse_clean.to_csv('C:\\Nicolas\\ProyectoMedioAmbiental\\ProyectoMedioAmbiental\\Proyecto-_Medioambiental\\Datasets\\Datasets Cleaned\\climate_change_greehouse_data.csv', index=False)
'''