In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import os
from IPython.display import display
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt
#import plotly.io as pio
#pio.renderers.default = 'notebook'


In [None]:
# Load the per-country average latitude/longitude table.
# NOTE(review): the path is relative to the notebook's working directory.
file_path = "average-latitude-longitude-countries.csv"
dfA = pd.read_csv(file_path)

def categorize_latitude(latitude):
    """Map a latitude in degrees to one of five named latitude bands.

    Bands (boundaries at +/-23.5 degrees and at 0):
      * ``> 23.5``        -> "North of Tropic of Cancer"
      * ``(0, 23.5]``     -> "Tropic of Cancer"
      * ``== 0``          -> "Equator"
      * ``[-23.5, 0)``    -> "Tropic of Capricorn"
      * ``< -23.5``       -> "South of Tropic of Capricorn"

    Returns:
        The band label, or ``None`` when ``latitude`` is missing (NaN/None).
        Without this guard a NaN fails every comparison and would be
        labelled "South of Tropic of Capricorn".
    """
    if pd.isna(latitude):
        return None
    if latitude > 23.5:
        return "North of Tropic of Cancer"
    if latitude > 0:
        return "Tropic of Cancer"
    if latitude == 0:
        return "Equator"
    if latitude >= -23.5:
        return "Tropic of Capricorn"
    return "South of Tropic of Capricorn"

# Label every country with its latitude band, then gather the country names per band.
dfA['Category'] = dfA['Latitude'].map(categorize_latitude)

grouped = dfA.groupby('Category')['Country'].agg(list)

# Optional inspection of the groups (kept disabled):
#for category, countries in grouped.items():
    #print(f"Category: {category}")
    #print(f"Number of countries: {len(countries)}")
    #print("Countries:", ", ".join(countries))
    #print("\n")

In [None]:
# Load the emissions table; later cells read its 'Country', 'Type of Emission',
# 'Latitude', 'Longitude' and per-year columns.
emissions = pd.read_csv('merged_processed_emissions_modified.csv')


In [None]:
def add_space_between_case_changes(text):
    """Insert a single space at every lowercase-to-uppercase boundary of ``text``.

    For example ``"UnitedStates"`` becomes ``"United States"``; text without
    such a boundary is returned unchanged.
    """
    case_boundary = re.compile(r'(?<=[a-z])(?=[A-Z])')
    return case_boundary.sub(' ', text)
# Split run-together country names (e.g. "UnitedStates") before matching by name.
emissions['Country'] = emissions['Country'].apply(add_space_between_case_changes)

# Manual coordinate overrides, keyed by country name.
updates = {
    'United States': {'Latitude': 38.7946, 'Longitude': -106.5348},
    'Russia': {'Latitude': 61.5240, 'Longitude': 105.3188}
}
for country, coords in updates.items():
    # Overwrite both coordinate columns on every row of this country.
    emissions.loc[
        emissions['Country'] == country,
        ['Latitude', 'Longitude'],
    ] = [coords['Latitude'], coords['Longitude']]

In [None]:
# Load the two GDP workbooks (per-capita and total, going by the file names).
# NOTE(review): gdpC is not referenced again in the visible part of the notebook.
gdpC = pd.read_excel('GDP by capita2.xls')
gdp = pd.read_excel('GDP2.xls')

In [None]:

# Suffix every all-digit (year) column of the GDP table with "_gdp" so it stays
# distinguishable from the emissions year columns after the merge, and use the
# same name for Russia as the emissions table does.
gdp_year_columns = {col: f'{str(col)}_gdp' for col in gdp.columns if str(col).isdigit()}
gdp_renamed = gdp.rename(columns=gdp_year_columns)
gdp_renamed['Country Name'] = gdp_renamed['Country Name'].replace('Russian Federation', 'Russia')


In [None]:
# Distinct country names of the renamed GDP table.
# NOTE(review): the message says 'Country' but the values come from 'Country Name',
# and the loop that would list them is commented out, so only the header is printed.
unique_countries = gdp_renamed['Country Name'].unique()
print("Unique countries in the 'Country' column:")
#for country in unique_countries:
    #print(country)


In [None]:
# Distinct country names of the emissions table; this rebinds `unique_countries`
# from the previous cell.
# NOTE(review): the listing loop is commented out, so only the header is printed.
unique_countries = emissions['Country'].unique()
print("Unique countries in the 'Country' column:")
#for country in unique_countries:
    #print(country)

In [None]:
# Manual (latitude, longitude) overrides in degrees, keyed by country name.
lat_long_dict = {
    "European Union": (50.8503, 4.3517),  # Brussels, EU headquarters
    "Czechia": (49.8175, 15.4730),
    "British Virgin Islands": (18.4207, -64.6399),
    "Burkina Faso": (12.2383, -1.5616),
    "Cabo Verde": (16.5388, -23.0418),
    "Cayman Islands": (19.3133, -81.2546),
    "Central African Republic": (6.6111, 20.9394),
    "Dominican Republic": (18.7357, -70.1627),
    "El Salvador": (13.7942, -88.8965),
    "Equatorial Guinea": (1.6508, 10.2679),
    "Eswatini": (-26.5225, 31.4659),
    "Faroe Islands": (61.8926, -6.9118),
    "French Polynesia": (-17.6797, -149.4068),
    "Kosovo": (42.6026, 20.9020),
    "Libya": (26.3351, 17.2283),
    "Moldova": (47.4116, 28.3699),
    "New Caledonia": (-20.9043, 165.6180),
    "North Macedonia": (41.6086, 21.7453),
    "Northern Mariana Islands": (15.0979, 145.6739),
    "Papua New Guinea": (-6.314993, 143.95555),
    "Saudi Arabia": (23.8859, 45.0792),
    "Sierra Leone": (8.4606, -11.7799),
    "Solomon Islands": (-9.6457, 160.1562),
    "South Africa": (-30.5595, 22.9375),
    "South Sudan": (6.8770, 31.3070),
    "Sri Lanka": (7.8731, 80.7718),
    "Tanzania": (-6.3690, 34.8888),
    "Timor-Leste": (-8.8742, 125.7275),
    "United Arab Emirates": (23.4241, 53.8478),
    "American Samoa": (-14.2710, -170.1322),
    "Costa Rica": (9.7489, -83.7534),
    "New Zealand": (-40.9006, 174.8860),
    "Puerto Rico": (18.2208, -66.5901),
    "Turkiye": (38.9637, 35.2433),
    "United Kingdom": (55.3781, -3.4360)
}


def update_lat_long(row):
    """Return ``row`` with its coordinates replaced from ``lat_long_dict``.

    Args:
        row: A row-like mapping (e.g. one row passed by ``DataFrame.apply``
            with ``axis=1``) that has a 'Country' entry.

    Returns:
        ``row`` itself when its country has no override; otherwise a copy of
        ``row`` whose 'Latitude' and 'Longitude' hold the override values.
        The row that was passed in is never modified, because pandas does not
        support functions that mutate the object handed to them by ``apply``.
    """
    country = row['Country']
    if country not in lat_long_dict:
        return row
    updated = row.copy()
    updated['Latitude'], updated['Longitude'] = lat_long_dict[country]
    return updated

# Run update_lat_long on every row (axis=1) and keep the resulting table.
emissions = emissions.apply(update_lat_long, axis=1)

In [None]:
# Join emissions with GDP on the country name (pd.merge defaults to an inner join,
# so only countries present in both tables remain).
merged_df = pd.merge(emissions, gdp_renamed, left_on='Country', right_on='Country Name')

years = [str(year) for year in range(2000, 2022)]

# Only ratio columns that were actually created are recorded here, so the
# selections below cannot ask for a column that does not exist.
valid_ratio_columns = []
for year in years:
    emissions_col = year
    gdp_col = f'{year}_gdp'

    if emissions_col in merged_df.columns and gdp_col in merged_df.columns:
        merged_df[emissions_col] = pd.to_numeric(merged_df[emissions_col], errors='coerce')
        merged_df[gdp_col] = pd.to_numeric(merged_df[gdp_col], errors='coerce')

        ratio_col = f'ratio_{year}'
        # GDP divided by emissions; a zero denominator yields +/-inf, which is
        # treated as missing like the other cells of this notebook do.
        merged_df[ratio_col] = (
            merged_df[gdp_col] / merged_df[emissions_col]
        ).replace([np.inf, -np.inf], np.nan)
        valid_ratio_columns.append(ratio_col)

# Coordinates are optional: use them (for selection and for filtering) only when present.
coordinate_columns = [col for col in ('Latitude', 'Longitude') if col in merged_df.columns]
columns_to_include = ['Country', 'Type of Emission'] + valid_ratio_columns + coordinate_columns

filtered_df = merged_df.dropna(subset=coordinate_columns) if coordinate_columns else merged_df

# Keep rows that have at least one ratio.
new_dataset_with_ratios = filtered_df[columns_to_include].dropna(how='all', subset=valid_ratio_columns)

#new_dataset_with_ratios.to_csv('new_dataset_with_ratios.csv', index=False)

new_dataset_with_ratios

In [None]:
# TODO: exploratory - needs more work before drawing conclusions.
# NOTE(review): new_dataset_with_ratios is not filtered by 'Type of Emission' here,
# so rows of every emission type are pooled in this correlation.

correlation_year = 2019  # single source for both the column and the title
correlation = new_dataset_with_ratios[[f'ratio_{correlation_year}', 'Latitude', 'Longitude']].corr()

# Create heatmap of correlations
plt.figure(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap="coolwarm", fmt=".2f")
plt.title(f'Correlation of Ratios with Latitude and Longitude ({correlation_year})')
plt.show()

In [None]:
# Country names contained in plotly's bundled gapminder sample dataset.
# NOTE(review): the message presents these as the countries px.choropleth supports;
# this cell does not verify that - confirm against the plotly documentation.
# The listing loop is commented out, so only the header is printed.
supported_countries = px.data.gapminder()['country'].unique()

print("Supported countries for px.choropleth:")
#for country in sorted(supported_countries):
    #print(country)

In [None]:
# Country names to look up in the loaded tables.
country_list = [
    'Cyprus', 'Estonia', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta',
    'Armenia', 'Aruba', 'Azerbaijan', 'Barbados', 'Belarus', 'Belize',
    'Bermuda', 'Bhutan', 'Dominica', 'Fiji', 'Georgia', 'Greenland',
    'Grenada', 'Guyana', 'Kazakhstan', 'Kiribati', 'Maldives', 'Nauru',
    'Qatar', 'Russia', 'Samoa', 'Seychelles', 'Suriname', 'Tajikistan',
    'Tonga', 'Turkmenistan', 'Tuvalu', 'Ukraine', 'Uzbekistan', 'Vanuatu', 'Guam'
]

emission_countries = emissions['Country'].unique()
known_emission_countries = set(emission_countries)

# Split the list by whether each name occurs in the emissions table.
countries_in_emissions = [name for name in country_list if name in known_emission_countries]
countries_not_in_emissions = [name for name in country_list if name not in known_emission_countries]

# Optional inspection (kept disabled):
#print("Countries from the list found in the 'emissions' dataset:")
#for country in countries_in_emissions:
    #print(country)
#print("\nCountries from the list NOT found in the 'emissions' dataset:")
#for country in countries_not_in_emissions:
    #print(country)

In [None]:
# Same membership check as for the emissions table, now against the merged table.
merged_countries = merged_df['Country Name'].unique()
merged_name_set = set(merged_countries)

countries_in_merged = [name for name in country_list if name in merged_name_set]
# Everything from the list that was not matched above.
countries_not_in_merged = [name for name in country_list if name not in countries_in_merged]

# Optional inspection (kept disabled):
#print("Countries from the list found in the 'merged_df' dataset:")
#for country in countries_in_merged:
    #print(country)
#print("\nCountries from the list NOT found in the 'merged_df' dataset:")
#for country in countries_not_in_merged:
    #print(country)

In [None]:
# One animated choropleth per emission type, coloured by the yearly ratio
# computed in new_dataset_with_ratios (GDP divided by emissions).

# Fixed colour-axis limits per emission type. The keys reproduce the
# 'Type of Emission' values, including their leading whitespace.
color_ranges = {
    '        CO2 emissions (MMtonnes CO2)': (0, 9000000000),  # Example range
    '            Coal and coke (MMtonnes CO2)': (0, 50000000000),  # Example range
    '            Consumed natural gas (MMtonnes CO2)': (0, 30000000000),  # Example range
    '            Petroleum and other liquids (MMtonnes CO2)': (0, 9000000000)  # Example range
}

# Derived from the mapping so the two can never get out of sync.
emission_types = list(color_ranges)

for emission in emission_types:
    filtered_df = new_dataset_with_ratios[new_dataset_with_ratios['Type of Emission'] == emission]

    # Melt only the ratio columns; without value_vars the coordinate columns
    # would be melted into 'Ratio' as well.
    ratio_columns = [col for col in filtered_df.columns if col.startswith('ratio_')]
    melted_df = filtered_df.melt(id_vars=['Country', 'Type of Emission'],
                                 value_vars=ratio_columns,
                                 var_name='Year',
                                 value_name='Ratio')

    # 'ratio_2005' -> 2005
    melted_df['Year'] = melted_df['Year'].str.extract(r'(\d{4})', expand=False).astype(int)

    cmin, cmax = color_ranges[emission]

    fig = px.choropleth(melted_df,
                        locations="Country",
                        locationmode='country names',
                        color="Ratio",
                        hover_name="Country",
                        animation_frame="Year",
                        title=f"Choropleth Map of {emission.strip()} (2000-2021)",
                        color_continuous_scale=px.colors.sequential.Plasma)

    # The plotted value is GDP / emissions, so label it that way round.
    fig.update_coloraxes(cmin=cmin, cmax=cmax, colorbar_title="GDP-to-Emissions Ratio")

    fig.update_geos(showcoastlines=True, coastlinecolor="Black", showland=True, landcolor="lightgray")

    fig.update_layout(
        width=1200,
        height=800,
    )

    fig.show()

In [None]:
#dfE = pd.read_csv("merged_processed_emissions_modified.csv")
#print(emissions['Type of Emission'].unique())  # Show unique values in the 'Type of Emission' column
#print(emissions['Country'].unique())

In [None]:
# Show the distinct 'Type of Emission' labels exactly as stored in the table.
unique_emissions = emissions['Type of Emission'].unique()
print(unique_emissions)


In [None]:
# Total CO2 emissions per country over 1980-2022, ranked.
# .copy() makes co2_df an independent frame, so the assignments below neither
# touch `emissions` nor trigger SettingWithCopyWarning.
co2_df = emissions[emissions['Type of Emission'].str.strip() == 'CO2 emissions (MMtonnes CO2)'].copy()

years = [str(year) for year in range(1980, 2023)]

# Plain column assignment replaces the columns, so they take the numeric dtype.
co2_df[years] = co2_df[years].apply(pd.to_numeric, errors='coerce')

# One row per country (the first one encountered), indexed by country name.
co2_df = co2_df.drop_duplicates(subset=['Country']).set_index('Country')
total_emissions = co2_df[years].sum(axis=1)

sorted_emissions = total_emissions.sort_values(ascending=False)

top_20 = sorted_emissions.head(20)
bottom_20 = sorted_emissions.tail(20)

middle_index = len(sorted_emissions) // 2
if len(sorted_emissions) < 20:
    middle_20 = sorted_emissions
else:
    # 10 entries on each side of the midpoint -> 20 rows, matching the label.
    middle_20 = sorted_emissions.iloc[middle_index - 10: middle_index + 10]

print("Top 20 countries with highest CO2 emissions:")
print(top_20)
print("\nBottom 20 countries with lowest CO2 emissions:")
print(bottom_20)
print("\nMiddle 20 countries with CO2 emissions:")
print(middle_20)

# New

In [None]:
# All files: rows whose value increased in every year-to-year step (2012-2022).

file_names = [
    "merged_processed_Coal and coke.csv",
    "merged_processed_Natural gas.csv",
    "merged_processed_Petroleum and other liquids.csv",
    "merged_processed_Biofuels.csv",
    "merged_processed_Electricity.csv",
    "merged_modified2_primary_energy.csv",
    "merged_processed_emissions_modified.csv"
]

years_range = [str(year) for year in range(2012, 2023)]

for file_name in file_names:
    df = pd.read_csv(file_name)

    for year in years_range:
        if year in df.columns:
            df[year] = pd.to_numeric(df[year], errors='coerce')

    for previous_year, current_year in zip(years_range, years_range[1:]):
        # Same presence guard as the numeric conversion above: a step can only
        # be computed when both of its year columns exist in this file.
        if previous_year not in df.columns or current_year not in df.columns:
            continue
        percentage_change_column = f'% change {previous_year} to {current_year}'
        df[percentage_change_column] = (
            (df[current_year] - df[previous_year]) / df[previous_year]
        ) * 100

    change_columns = [col for col in df.columns if col.startswith('% change')]
    if not change_columns:
        # Without any change column every row would vacuously count as "all positive".
        print(f"No year-to-year changes could be computed for '{file_name}'; skipping.")
        continue

    # Division by zero gives +/-inf; treat it as missing, then require complete rows.
    df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=change_columns)

    all_positive = df[change_columns].gt(0).all(axis=1)
    df = df[all_positive].drop_duplicates()

    print(f"Filtered unique rows with all positive % change in the dataset '{file_name}':")
    display(df)
    print("\n" + "-"*50 + "\n")

In [None]:
# Animated choropleth per file of the rows whose value increased in every
# year-to-year step (2012-2022).

years_range = [str(year) for year in range(2012, 2023)]

for file_name in file_names:
    df = pd.read_csv(file_name)

    # Only years present in this file can be converted, compared and plotted.
    available_years = [year for year in years_range if year in df.columns]
    for year in available_years:
        df[year] = pd.to_numeric(df[year], errors='coerce')

    for previous_year, current_year in zip(years_range, years_range[1:]):
        if previous_year not in df.columns or current_year not in df.columns:
            continue  # a step needs both of its year columns
        percentage_change_column = f'% change {previous_year} to {current_year}'
        df[percentage_change_column] = (
            (df[current_year] - df[previous_year]) / df[previous_year]
        ) * 100

    change_columns = [col for col in df.columns if col.startswith('% change')]
    if not change_columns:
        # Without any change column every row would vacuously count as "all positive".
        print(f"No year-to-year changes could be computed for '{file_name}'; skipping.")
        continue

    # Division by zero gives +/-inf; treat it as missing, then require complete rows.
    df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=change_columns)

    all_positive = df[change_columns].gt(0).all(axis=1)
    df = df[all_positive].drop_duplicates()

    if df.empty:
        # Nothing to draw; min()/max() of an empty column would be NaN colour limits.
        print(f"No rows with all positive % change in the dataset '{file_name}'; skipping.")
        continue

    # NOTE(review): rows are keyed by 'Country' only; if a file holds several rows
    # per country, a frame will contain more than one value per country - confirm.
    melted_df = df.melt(id_vars=['Country'],
                        value_vars=available_years,
                        var_name='Year',
                        value_name='Value')

    melted_df['Year'] = melted_df['Year'].astype(int)

    fig = px.choropleth(melted_df,
                        locations="Country",
                        locationmode='country names',
                        color="Value",
                        hover_name="Country",
                        animation_frame="Year",
                        title=f"Choropleth Map of Values Over Time ({file_name})",
                        color_continuous_scale=px.colors.sequential.Plasma)

    # Fix the colour scale over all years so animation frames are comparable.
    fig.update_coloraxes(cmin=melted_df['Value'].min(),
                         cmax=melted_df['Value'].max(),
                         colorbar_title="Value")

    fig.update_geos(showcoastlines=True, coastlinecolor="Black", showland=True, landcolor="lightgray")

    fig.update_layout(
        width=1200,
        height=800,
    )

    # Show the plot
    fig.show()

In [None]:
# How the value changes from year to year: adds one '% change A to B' column per
# consecutive pair of years and previews each file.
years_range = [str(year) for year in range(2012, 2023)]

for file_name in file_names:
    df = pd.read_csv(file_name)

    for year in years_range:
        if year in df.columns:
            df[year] = pd.to_numeric(df[year], errors='coerce')

    for previous_year, current_year in zip(years_range, years_range[1:]):
        # Same presence guard as the numeric conversion above: a step can only
        # be computed when both of its year columns exist in this file.
        if previous_year not in df.columns or current_year not in df.columns:
            continue
        percentage_change_column = f'% change {previous_year} to {current_year}'
        df[percentage_change_column] = (
            (df[current_year] - df[previous_year]) / df[previous_year]
        ) * 100

    df = df.drop_duplicates()

    print(f"Head of the dataset '{file_name}':")
    display(df.head())
    print("\n" + "-"*50 + "\n")

    #output_file_name = f"updated_{file_name}"
    #df.to_csv(output_file_name, index=False)
    #print(f"Processed and saved: {output_file_name}\n")
    

In [None]:
# All files: rows where at least one year in 2013-2022 changed by more than 100%
# relative to the 1980 value.

base_year = '1980'
years = [str(year) for year in range(2013, 2023)]

for file_name in file_names:
    df = pd.read_csv(file_name)

    if base_year not in df.columns:
        print(f"'{base_year}' column not found in dataset '{file_name}'; skipping.")
        continue

    filtered_df = df.copy()
    filtered_df[base_year] = pd.to_numeric(filtered_df[base_year], errors='coerce')

    for year in years:
        if year not in filtered_df.columns:
            continue
        filtered_df[year] = pd.to_numeric(filtered_df[year], errors='coerce')
        change_col = f'% change of {year}'
        # Percentage change relative to the base year. The plain quotient
        # value / base * 100 is 100 for "no change", so filtering it with
        # "> 100" would keep every row that grew at all.
        filtered_df[change_col] = (
            (filtered_df[year] - filtered_df[base_year]) / filtered_df[base_year]
        ) * 100

    change_columns = [col for col in filtered_df.columns if col.startswith('% change')]
    if not change_columns:
        print(f"No % change columns could be computed for '{file_name}'; skipping.")
        continue

    # Division by zero gives +/-inf; treat it as missing, then require complete rows.
    filtered_df = filtered_df.replace([np.inf, -np.inf], np.nan).dropna(subset=change_columns)

    filtered_df = filtered_df[filtered_df[change_columns].gt(100).any(axis=1)]

    print(f"Filtered rows with % change > 100 in the dataset '{file_name}':")
    display(filtered_df)
    print("\n" + "-"*50 + "\n")

    #output_file_name = f"updated_based_on_1980_{file_name}"
    #filtered_df.to_csv(output_file_name, index=False)
    #print(f"Processed and saved: {output_file_name}\n")

In [None]:
# Animated choropleth per file of the rows where at least one year in 2013-2022
# changed by more than 100% relative to the 1980 value.

base_year = '1980'
years = [str(year) for year in range(2013, 2023)]

# Define file names
file_names = [
    "merged_processed_Coal and coke.csv",
    "merged_processed_Natural gas.csv",
    "merged_processed_Petroleum and other liquids.csv",
    "merged_processed_Biofuels.csv",
    "merged_processed_Electricity.csv",
    "merged_modified2_primary_energy.csv",
    "merged_processed_emissions_modified.csv"
]

for file_name in file_names:
    df = pd.read_csv(file_name)

    if base_year not in df.columns:
        print(f"'{base_year}' column not found in dataset '{file_name}'; skipping.")
        continue

    df[base_year] = pd.to_numeric(df[base_year], errors='coerce')

    # Only years present in this file can be converted, compared and plotted.
    available_years = [year for year in years if year in df.columns]
    for year in available_years:
        df[year] = pd.to_numeric(df[year], errors='coerce')
        change_col = f'% change of {year}'
        # Percentage change relative to the base year. The plain quotient
        # value / base * 100 is 100 for "no change", so filtering it with
        # "> 100" would keep every row that grew at all.
        df[change_col] = ((df[year] - df[base_year]) / df[base_year]) * 100

    change_columns = [col for col in df.columns if col.startswith('% change of ')]
    if not change_columns:
        print(f"No % change columns could be computed for '{file_name}'; skipping.")
        continue

    # Division by zero gives +/-inf; treat it as missing, then require complete rows.
    df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=change_columns)

    # Keep rows where any yearly change exceeds 100%.
    filtered_df = df.loc[df[change_columns].gt(100).any(axis=1)]

    if filtered_df.empty:
        # Nothing to draw; min()/max() of an empty column would be NaN colour limits.
        print(f"No rows with % change > 100 in the dataset '{file_name}'; skipping.")
        continue

    melted_df = filtered_df.melt(id_vars=['Country'],
                                 value_vars=available_years,
                                 var_name='Year',
                                 value_name='Value')

    melted_df['Year'] = melted_df['Year'].astype(int)

    fig = px.choropleth(melted_df,
                        locations="Country",
                        locationmode='country names',
                        color="Value",
                        hover_name="Country",
                        animation_frame="Year",
                        title=f"Choropleth Map of Values Over Time ({file_name})",
                        color_continuous_scale=px.colors.sequential.Plasma)

    # Fix the colour scale over all years so animation frames are comparable.
    fig.update_coloraxes(cmin=melted_df['Value'].min(),
                         cmax=melted_df['Value'].max(),
                         colorbar_title="Value")

    fig.update_geos(showcoastlines=True, coastlinecolor="Black", showland=True, landcolor="lightgray")

    fig.update_layout(
        width=1200,
        height=800,
    )

    fig.show()

# FAILURES SECTION

In [None]:
# Scatter of the 2022 coal-and-coke quantity against latitude.
coal = pd.read_csv('merged_processed_Coal and coke.csv')

# Other cells read this same file with a 'Latitude' column while this one used
# 'Latitude_x'; take whichever of the two the file actually has.
latitude_col = 'Latitude_x' if 'Latitude_x' in coal.columns else 'Latitude'
quantity_col = '2022'

coal[quantity_col] = pd.to_numeric(coal[quantity_col], errors='coerce')

# NaN > 0 is False, so this single test also drops unparseable/missing values.
df_filtered = coal[coal[quantity_col] > 0].sort_values(by=quantity_col)

fig = px.scatter(df_filtered, x=latitude_col, y=quantity_col,
                 title='Coal and Coke Quantity vs. Latitude in 2022',
                 labels={latitude_col: 'Latitude', quantity_col: 'Quantity (2022)'})


fig.show()

In [None]:
# Display the filtered, sorted coal table built in the previous cell.
df_filtered

In [None]:
# Load the three fossil-fuel tables compared in the following cells.
# NOTE(review): the coal file is the same one an earlier cell loads as `coal`.
df_coal2 = pd.read_csv('merged_processed_Coal and coke.csv')
df_natural_gas = pd.read_csv('merged_processed_Natural gas.csv')
df_petroleum = pd.read_csv('merged_processed_Petroleum and other liquids.csv')

# Column names shared by the helper and the plot below.
latitude_col = 'Latitude'
quantity_col = '2022'


def preprocess_data(df, source_name):
    """Return the rows of ``df`` with a positive quantity, tagged with their source.

    The column named by the module-level ``quantity_col`` is parsed as numbers
    (unparseable values become NaN) and only rows with a value > 0 are kept.

    Args:
        df: Table containing the ``quantity_col`` column. It is not modified.
        source_name: Label written to the 'Source' column of every kept row.

    Returns:
        A new DataFrame with the kept rows, a numeric ``quantity_col`` column
        and a 'Source' column.
    """
    quantity = pd.to_numeric(df[quantity_col], errors='coerce')
    is_positive = quantity > 0  # NaN compares False, so missing values drop out

    df_filtered = df.loc[is_positive].copy()
    # Store the parsed numbers on the copy only, leaving the caller's frame untouched.
    df_filtered[quantity_col] = quantity[is_positive].to_numpy()
    df_filtered['Source'] = source_name

    return df_filtered

# Filter each fuel table and tag it with a readable source label.
df_coal_filtered, df_natural_gas_filtered, df_petroleum_filtered = (
    preprocess_data(frame, label)
    for frame, label in (
        (df_coal2, 'Coal and Coke'),
        (df_natural_gas, 'Natural Gas'),
        (df_petroleum, 'Petroleum and Other Liquids'),
    )
)

# Stack the three tables and order the rows by quantity.
df_combined = pd.concat(
    [df_coal_filtered, df_natural_gas_filtered, df_petroleum_filtered]
).sort_values(by=quantity_col)

fig = px.scatter(
    df_combined,
    x=latitude_col,
    y=quantity_col,
    color='Source',
    title='Natural Resource Quantity vs. Latitude in 2022',
    labels={latitude_col: 'Latitude', quantity_col: 'Quantity (2022)', 'Source': 'Resource Type'},
)

fig.show()

In [None]:

latitude_col = 'Latitude'
quantity_col = '2022'

def preprocess_data(df, source_name):
    """Return the rows of ``df`` with a positive quantity, tagged with their source.

    Redefinition of the helper from the previous cell with the same contract:
    the module-level ``quantity_col`` column is parsed as numbers (unparseable
    values become NaN) and only rows with a value > 0 are kept.

    Args:
        df: Table containing the ``quantity_col`` column. It is not modified.
        source_name: Label written to the 'Source' column of every kept row.

    Returns:
        A new DataFrame with the kept rows, a numeric ``quantity_col`` column
        and a 'Source' column.
    """
    quantity = pd.to_numeric(df[quantity_col], errors='coerce')
    is_positive = quantity > 0  # NaN compares False, so missing values drop out

    df_filtered = df.loc[is_positive].copy()
    # Store the parsed numbers on the copy only, leaving the caller's frame untouched.
    df_filtered[quantity_col] = quantity[is_positive].to_numpy()
    df_filtered['Source'] = source_name

    return df_filtered


# Filter the three fuel tables again and stack the results.
df_coal_filtered = preprocess_data(df=df_coal2, source_name='Coal and Coke')
df_natural_gas_filtered = preprocess_data(df=df_natural_gas, source_name='Natural Gas')
df_petroleum_filtered = preprocess_data(df=df_petroleum, source_name='Petroleum and Other Liquids')

df_combined = pd.concat(
    (df_coal_filtered, df_natural_gas_filtered, df_petroleum_filtered)
)

def calculate_percentile_range(df, col, lower_percentile=5, upper_percentile=95):
    """Return the ``(lower, upper)`` percentile values of column ``col`` in ``df``.

    Percentiles are given on a 0-100 scale and converted to quantiles internally.
    """
    column = df[col]
    return (
        column.quantile(lower_percentile / 100),
        column.quantile(upper_percentile / 100),
    )


# Clip both axes to the 5th-95th percentile of the plotted columns.
x_min, x_max = calculate_percentile_range(df_combined, latitude_col)
y_min, y_max = calculate_percentile_range(df_combined, quantity_col)

fig = px.scatter(
    df_combined,
    x=latitude_col,
    y=quantity_col,
    color='Source',
    title='Natural Resource Quantity vs. Latitude in 2022',
    labels={latitude_col: 'Latitude', quantity_col: 'Quantity (2022)', 'Source': 'Resource Type'},
)

fig.update_layout(
    xaxis={'range': [x_min, x_max]},
    yaxis={'range': [y_min, y_max]},
)

fig.show()

In [None]:
latitude_col = 'Latitude'
longitude_col = 'Longitude'
quantity_col = '2022'

def preprocess_data(df, source_name):
    """Return the rows of ``df`` with a positive quantity, tagged with their source.

    Redefinition of the helper from the earlier cells with the same contract:
    the module-level ``quantity_col`` column is parsed as numbers (unparseable
    values become NaN) and only rows with a value > 0 are kept.

    Args:
        df: Table containing the ``quantity_col`` column. It is not modified.
        source_name: Label written to the 'Source' column of every kept row.

    Returns:
        A new DataFrame with the kept rows, a numeric ``quantity_col`` column
        and a 'Source' column.
    """
    quantity = pd.to_numeric(df[quantity_col], errors='coerce')
    is_positive = quantity > 0  # NaN compares False, so missing values drop out

    df_filtered = df.loc[is_positive].copy()
    # Store the parsed numbers on the copy only, leaving the caller's frame untouched.
    df_filtered[quantity_col] = quantity[is_positive].to_numpy()
    df_filtered['Source'] = source_name

    return df_filtered

# Source label -> raw table, in the order the tables are stacked.
fuel_tables = {
    'Coal and Coke': df_coal2,
    'Natural Gas': df_natural_gas,
    'Petroleum and Other Liquids': df_petroleum,
}
filtered_by_label = {label: preprocess_data(table, label) for label, table in fuel_tables.items()}

df_coal_filtered = filtered_by_label['Coal and Coke']
df_natural_gas_filtered = filtered_by_label['Natural Gas']
df_petroleum_filtered = filtered_by_label['Petroleum and Other Liquids']

df_combined = pd.concat(list(filtered_by_label.values()))


def calculate_percentile_range(df, col, lower_percentile=5, upper_percentile=95):
    """Return the ``(lower, upper)`` percentile values of column ``col`` in ``df``.

    Percentiles are given on a 0-100 scale and converted to quantiles internally.
    """
    lower_bound, upper_bound = (
        df[col].quantile(percentile / 100)
        for percentile in (lower_percentile, upper_percentile)
    )
    return lower_bound, upper_bound

# 5th-95th percentile ranges; only the longitude and quantity ranges are used
# for the axes below.
x_min_lat, x_max_lat = calculate_percentile_range(df_combined, latitude_col)
x_min_lon, x_max_lon = calculate_percentile_range(df_combined, longitude_col)
y_min, y_max = calculate_percentile_range(df_combined, quantity_col)

fig = px.scatter(
    df_combined,
    x=longitude_col,
    y=quantity_col,
    color='Source',
    title='Natural Resource Quantity vs. Longitude in 2022',
    labels={longitude_col: 'Longitude', quantity_col: 'Quantity (2022)', 'Source': 'Resource Type'},
)

fig.update_layout(
    xaxis={'range': [x_min_lon, x_max_lon]},
    yaxis={'range': [y_min, y_max]},
)

fig.show()

In [None]:
# Input files for the combined scatter plots below.
# NOTE(review): same entries, in the same order, as the `file_names` list used by
# earlier cells.
files = [
    'merged_processed_Coal and coke.csv',
    'merged_processed_Natural gas.csv',
    'merged_processed_Petroleum and other liquids.csv',
    'merged_processed_Biofuels.csv',
    'merged_processed_Electricity.csv',
    'merged_modified2_primary_energy.csv',"merged_processed_emissions_modified.csv"
]

latitude_col = 'Latitude'
quantity_col = '2022'

def preprocess_data(df, source_name):
    """Return the rows of ``df`` with a positive quantity, tagged with their source.

    Variant of the earlier helper that tolerates a missing quantity column.

    Args:
        df: Input table. It is not modified.
        source_name: Label written to the 'Source' column of every kept row.

    Returns:
        An empty DataFrame (after printing a warning) when the module-level
        ``quantity_col`` column is absent; otherwise a new DataFrame holding
        the rows whose parsed quantity is > 0, with a numeric ``quantity_col``
        column and a 'Source' column.
    """
    if quantity_col not in df.columns:
        print(f"Warning: '{quantity_col}' column not found in dataset '{source_name}'")
        return pd.DataFrame()

    quantity = pd.to_numeric(df[quantity_col], errors='coerce')
    is_positive = quantity > 0  # NaN compares False, so missing values drop out

    df_filtered = df.loc[is_positive].copy()
    # Store the parsed numbers on the copy only, leaving the caller's frame untouched.
    df_filtered[quantity_col] = quantity[is_positive].to_numpy()
    df_filtered['Source'] = source_name

    return df_filtered

# Load, filter and tag every file; files that yield no usable rows are left out.
dfs_filtered = []
for file in files:
    source_name = file.partition('.')[0]  # text before the first dot
    df = pd.read_csv(file)
    df_filtered = preprocess_data(df, source_name)
    if df_filtered.empty:
        continue
    dfs_filtered.append(df_filtered)

if not dfs_filtered:
    print("No data available for plotting.")
else:
    df_combined = pd.concat(dfs_filtered)

    def calculate_percentile_range(df, col, lower_percentile=5, upper_percentile=95):
        """Return the (lower, upper) percentile values (0-100 scale) of ``df[col]``."""
        column = df[col]
        return (
            column.quantile(lower_percentile / 100),
            column.quantile(upper_percentile / 100),
        )

    # Clip both axes to the 5th-95th percentile of the plotted columns.
    x_min, x_max = calculate_percentile_range(df_combined, latitude_col)
    y_min, y_max = calculate_percentile_range(df_combined, quantity_col)

    fig = px.scatter(
        df_combined,
        x=latitude_col,
        y=quantity_col,
        color='Source',
        title='Natural Resource Quantity vs. Latitude in 2022',
        labels={latitude_col: 'Latitude', quantity_col: 'Quantity (2022)', 'Source': 'Resource Type'},
    )

    fig.update_layout(
        xaxis={'range': [x_min, x_max]},
        yaxis={'range': [y_min, y_max]},
    )

    fig.show()

In [None]:
longitude_col = 'Longitude'
quantity_col = '2022'

def preprocess_data(df, source_name):
    """Return the rows of ``df`` with a positive quantity, tagged with their source.

    Redefinition of the helper from the previous cell with the same contract.

    Args:
        df: Input table. It is not modified.
        source_name: Label written to the 'Source' column of every kept row.

    Returns:
        An empty DataFrame (after printing a warning) when the module-level
        ``quantity_col`` column is absent; otherwise a new DataFrame holding
        the rows whose parsed quantity is > 0, with a numeric ``quantity_col``
        column and a 'Source' column.
    """
    if quantity_col not in df.columns:
        print(f"Warning: '{quantity_col}' column not found in dataset '{source_name}'")
        return pd.DataFrame()

    quantity = pd.to_numeric(df[quantity_col], errors='coerce')
    is_positive = quantity > 0  # NaN compares False, so missing values drop out

    df_filtered = df.loc[is_positive].copy()
    # Store the parsed numbers on the copy only, leaving the caller's frame untouched.
    df_filtered[quantity_col] = quantity[is_positive].to_numpy()
    df_filtered['Source'] = source_name

    return df_filtered

# Load, filter and tag every file; files that yield no usable rows are left out.
dfs_filtered = []
for file in files:
    source_name = file.split('.', 1)[0]  # text before the first dot
    df = pd.read_csv(file)
    df_filtered = preprocess_data(df, source_name)
    if len(df_filtered.index) > 0 and len(df_filtered.columns) > 0:
        dfs_filtered.append(df_filtered)

if len(dfs_filtered) == 0:
    print("No data available for plotting.")
else:
    df_combined = pd.concat(dfs_filtered)

    def calculate_percentile_range(df, col, lower_percentile=5, upper_percentile=95):
        """Return the (lower, upper) percentile values (0-100 scale) of ``df[col]``."""
        lower_bound, upper_bound = (
            df[col].quantile(percentile / 100)
            for percentile in (lower_percentile, upper_percentile)
        )
        return lower_bound, upper_bound

    # Clip both axes to the 5th-95th percentile of the plotted columns.
    x_min, x_max = calculate_percentile_range(df_combined, longitude_col)
    y_min, y_max = calculate_percentile_range(df_combined, quantity_col)

    fig = px.scatter(
        df_combined,
        x=longitude_col,
        y=quantity_col,
        color='Source',
        title='Natural Resource Quantity vs. Longitude in 2022',
        labels={longitude_col: 'Longitude', quantity_col: 'Quantity (2022)', 'Source': 'Resource Type'},
    )

    fig.update_layout(
        xaxis={'range': [x_min, x_max]},
        yaxis={'range': [y_min, y_max]},
    )

    fig.show()

In [None]:
# Reload the emissions CSV from disk and display it as read.
# NOTE(review): this shows the whole frame; a preview such as .head() may be enough.
dfT= pd.read_csv("merged_processed_emissions_modified.csv")
dfT