## Project Overview

This project explores global renewable energy consumption patterns using World Bank data. The goal is to apply machine learning techniques to analyze, cluster, and eventually optimize renewable energy distribution across countries and regions.

## Data Import

In [None]:
# importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw World Bank export into a DataFrame and preview it.
# Data source: https://data.worldbank.org/indicator/EG.FEC.RNEW.ZS

file_name = 'data/API_EG.FEC.RNEW.ZS_DS2_en_csv_v2_13732.csv'

# The export begins with extra non-data rows; skip the first four
# before parsing.
df = pd.read_csv(filepath_or_buffer=file_name, skiprows=4)
df.head(n=5)

## Exploring the data

We should check:

- Missing values

- Column names

- Data types

In [None]:
# Overview of the frame: dimensions, then per-column dtype and non-null counts
print("Shape:", df.shape)
df.info()

# Null count per column; only the first five entries are previewed here
null_counts = df.isna().sum()
null_counts.head()

In [None]:
# Bar chart of the ten highest renewable-energy shares in 2020.
# NOTE(review): "Country Name" may also hold regional/income aggregates in
# World Bank exports — confirm before reading this as a country ranking.
share_2020 = df[["Country Name", "2020"]].dropna()
df_latest = share_2020.sort_values(by="2020", ascending=False).head(10)

# Label the Axes that seaborn drew on, then render the figure
ax = sns.barplot(data=df_latest, x="2020", y="Country Name")
ax.set_title("Top 10 Countries by Renewable Energy % (2020)")
ax.set_xlabel("% of Total Energy Consumption")
ax.set_ylabel("Country")
plt.show()

## Data Cleaning

In [None]:
# Re-inspect null values: count the missing entries for each year column.

# Pick the year columns by name rather than by position, so that every year
# is included and non-year columns (descriptors, any trailing empty column)
# are left out.
# NOTE(review): assumes year columns are labelled with digits only
# (e.g. "1960") — confirm against df.columns.
year_columns = [col for col in df.columns if str(col).isdigit()]
missing_per_year = df[year_columns].isnull().sum()

# Only years that actually have gaps are plotted
missing_per_year[missing_per_year > 0].plot(kind='bar', figsize=(15, 4))
plt.title('Missing values per year')
plt.ylabel('Count')
plt.xlabel('Year')
plt.tight_layout()
plt.show()

In [None]:
# Focus on the relevant years: 2000 to 2020 inclusive (good for recent trends)

years = [str(y) for y in range(2000, 2021)]
df_years = df[['Country Name', 'Country Code'] + years]

# Remove countries with more than 50% missing values across the selected
# years. `subset=years` stops the non-year identifier columns from counting
# toward `thresh`; without it, rows with fewer observed years than intended
# would pass the filter.
min_valid_years = (len(years) + 1) // 2  # at least half of the years present
df_cleaned_new = df_years.dropna(subset=years, thresh=min_valid_years)
print("Remaining countries:", df_cleaned_new.shape[0])

# Alternative approach: for every column that is missing in at most 5% of
# the rows, drop the rows where that column is missing.

threshold = int(len(df_years) * 0.05)

# Despite the name, these columns are kept; they only decide which rows
# are dropped.
cols_to_drop = df_years.columns[df_years.isna().sum() <= threshold]

df_cleaned = df_years.dropna(subset=cols_to_drop)
print('Remaining countries:', df_cleaned.shape[0])

# Remaining missing values per column after the 50% rule
print(df_cleaned_new.isnull().sum())

In [None]:
# Impute the remaining gaps with a per-row summary statistic: each missing
# year takes the mean of the values observed in the same row.


def _fill_with_row_mean(row):
    """Return `row` with its NaN entries replaced by the row's own mean."""
    return row.fillna(row.mean())


# Work on a copy so the cleaned frame stays untouched
df_imputed_new = df_cleaned_new.copy()
year_values = df_imputed_new[years]
df_imputed_new[years] = year_values.apply(_fill_with_row_mean, axis=1)

# Report how many missing values remain in each column
print(df_imputed_new.isna().sum())

# Initial Visualization of the data