In [2]:
import os
import sqlite3
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi

# Dataset is pulled from Kaggle, the credentials need to be provided.
api = KaggleApi()
api.authenticate()

dataset1 = 'chiticariucristian/deforestation-and-forest-loss'
dataset2 = 'ulrikthygepedersen/co2-emissions-by-country'

data_dir = '../data'
response_csv = api.dataset_download_files(dataset1, path=data_dir, unzip=True)
response_csv = api.dataset_download_files(dataset2, path=data_dir, unzip=True)
print("CSV files downloaded and unzipped successfully. Converting to a database...")
csv_file1 = os.path.join(data_dir, 'annual-change-forest-area.csv')
csv_file2 = os.path.join(data_dir, 'co2_emissions_kt_by_country.csv')

try:
    deforestation_df = pd.read_csv(csv_file1)
    co2_emissions_df = pd.read_csv(csv_file2)
except Exception as e:
    print(f"Error occured: {e}")



Dataset URL: https://www.kaggle.com/datasets/chiticariucristian/deforestation-and-forest-loss
Dataset URL: https://www.kaggle.com/datasets/ulrikthygepedersen/co2-emissions-by-country
CSV files downloaded and unzipped successfully. Converting to a database...


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data cleaning and transformation
# Renaming the columns correctly based on the actual column names
co2_emissions_df.rename(columns={"country_name": "Country", "year": "Year", "value": "CO2 Emissions (kt)"}, inplace=True)

# Filter the data to include only the necessary columns
deforestation_df = deforestation_df[["Entity", "Year", "Net forest conversion"]]
co2_emissions_df = co2_emissions_df[["Country", "Year", "CO2 Emissions (kt)"]]

# Merge the datasets on Country and Year
merged_df = pd.merge(deforestation_df, co2_emissions_df, on=["Country", "Year"], how="inner")

# Visualize the relationship between deforestation and CO2 emissions
plt.figure(figsize=(12, 6))
sns.scatterplot(data=merged_df, x="Net forest conversion", y="CO2 Emissions (kt)")
plt.title("Relationship between Deforestation and CO2 Emissions")
plt.xlabel("Net Forest Conversion")
plt.ylabel("CO2 Emissions (kt)")
plt.show()

# Calculate the correlation between deforestation and CO2 emissions
correlation = merged_df["Net forest conversion"].corr(merged_df["CO2 Emissions (kt)"])
correlation

KeyError: "['Country'] not in index"

In [None]:
# Additional visualizations and insights

# Distribution of Net Forest Conversion
plt.figure(figsize=(12, 6))
sns.histplot(merged_df["Net forest conversion"], kde=True)
plt.title("Distribution of Net Forest Conversion")
plt.xlabel("Net Forest Conversion")
plt.ylabel("Frequency")
plt.show()

# Distribution of CO2 Emissions
plt.figure(figsize=(12, 6))
sns.histplot(merged_df["CO2 Emissions (kt)"], kde=True)
plt.title("Distribution of CO2 Emissions")
plt.xlabel("CO2 Emissions (kt)")
plt.ylabel("Frequency")
plt.show()

# Relationship by Country
# Taking the top 10 countries with the highest absolute net forest conversion
top_countries = merged_df.groupby('Country')['Net forest conversion'].sum().abs().nlargest(10).index
top_countries_df = merged_df[merged_df['Country'].isin(top_countries)]

plt.figure(figsize=(15, 8))
sns.lineplot(data=top_countries_df, x="Year", y="Net forest conversion", hue="Country", marker='o')
plt.title("Net Forest Conversion Over Time for Top 10 Countries")
plt.xlabel("Year")
plt.ylabel("Net Forest Conversion")
plt.legend(title="Country")
plt.show()

plt.figure(figsize=(15, 8))
sns.lineplot(data=top_countries_df, x="Year", y="CO2 Emissions (kt)", hue="Country", marker='o')
plt.title("CO2 Emissions Over Time for Top 10 Countries")
plt.xlabel("Year")
plt.ylabel("CO2 Emissions (kt)")
plt.legend(title="Country")
plt.show()

# Correlation matrix to explore other potential relationships
correlation_matrix = merged_df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title("Correlation Matrix")
plt.show()

import ace_tools as tools; tools.display_dataframe_to_user(name="Merged DataFrame", dataframe=merged_df)

# Displaying the merged dataframe for further inspection
merged_df.head()