# Renewable Energy Data 

This Jupyter Notebook focuses on the cleaning and transformation of renewable energy data.


In [28]:
# Import Packages

In [29]:
import mechanicalsoup
import pandas as pd
import numpy as np
import random

In [30]:
# Load the dataset into a DataFrame

# Specify the file path
file_path = '1.renewable_energy_stage1.csv'

# Load the scraped data from the CSV file
df = pd.read_csv(file_path)

In [31]:
# Inspect the DataFrame
print("Initial DataFrame:")
print(df.head())


Initial DataFrame:
                Land  2018  2019
0  Europäische Union  18.0  18.8
1           Albanien  36.8  36.7
2            Belgien   9.4   9.9
3          Bulgarien  20.5  21.6
4           Dänemark  35.4  37.2


In [32]:
# Rename the 'Land' column to 'Country'
df.rename(columns={'Land': 'Country'}, inplace=True)

In [33]:
# Print number of missing values in each column
print("\nMissing values:")
print(df.isnull().sum())


Missing values:
Country    0
2018       0
2019       0
dtype: int64


In [34]:
# Add Impurities 

In [35]:
# Create a list of random row indices to duplicate
num_duplicates = 20  # Adjust the number of duplicates as needed
duplicate_indices = random.choices(range(len(df)), k=num_duplicates)

# Append the selected duplicate rows to the DataFrame
duplicate_rows = df.iloc[duplicate_indices]
df = pd.concat([df, duplicate_rows], ignore_index=True)

In [36]:
# Changing Data types
def change_data_types(df, num_columns_to_change=3):
    """
    Randomly selects a specified number of columns and changes their data types to int, float, or str.
    The function skips columns with non-numeric values when changing to int or float types.
    """
    numeric_columns = df.select_dtypes(include=[float, int]).columns.tolist()
    non_numeric_columns = df.select_dtypes(exclude=[float, int]).columns.tolist()

    columns_to_change = random.sample(df.columns.tolist(), num_columns_to_change)
    changed_columns = []

    for column_name in columns_to_change:
        new_data_type = random.choice([int, float, str]) if column_name in numeric_columns else str

        if new_data_type == int:
            df[column_name] = df[column_name].astype(int)
        elif new_data_type == float:
            df[column_name] = df[column_name].astype(float)
        elif new_data_type == str:
            df[column_name] = df[column_name].astype(str)
        
        changed_columns.append((column_name, new_data_type.__name__))

    return df, changed_columns

# Applying the function to the DataFrame
df_with_changed_types, _ = change_data_types(df.copy(), num_columns_to_change=2)

# Print the head of the modified DataFrame
print(df_with_changed_types.head())

             Country  2018  2019
0  Europäische Union  18.0  18.8
1           Albanien  36.8  36.7
2            Belgien   9.4   9.9
3          Bulgarien  20.5  21.6
4           Dänemark  35.4  37.2


In [37]:
# Introducing an impurity by changing some random Country names to lowercase
lowercase_indices = random.sample(range(len(df)), 10)  # Randomly select 10 indices
df.loc[lowercase_indices, 'Country'] = df.loc[lowercase_indices, 'Country'].str.lower()

# Displaying the modified DataFrame to verify the changes
df.head()


Unnamed: 0,Country,2018,2019
0,Europäische Union,18.0,18.8
1,albanien,36.8,36.7
2,belgien,9.4,9.9
3,Bulgarien,20.5,21.6
4,Dänemark,35.4,37.2


In [38]:
# Adjusting the provided code to add '%' to random values in '2018' and '2019' columns
year_2018_indices = random.sample(range(len(df)), 5)  # Randomly select 10 indices for 2018
year_2019_indices = random.sample(range(len(df)), 5)  # Randomly select 10 indices for 2019

df.loc[year_2018_indices, '2018'] = df.loc[year_2018_indices, '2018'].astype(str) + "%"
df.loc[year_2019_indices, '2019'] = df.loc[year_2019_indices, '2019'].astype(str) + "%"

# Displaying the modified DataFrame to verify the changes
df.head()

  df.loc[year_2018_indices, '2018'] = df.loc[year_2018_indices, '2018'].astype(str) + "%"
  df.loc[year_2019_indices, '2019'] = df.loc[year_2019_indices, '2019'].astype(str) + "%"


Unnamed: 0,Country,2018,2019
0,Europäische Union,18.0,18.8
1,albanien,36.8%,36.7
2,belgien,9.4,9.9
3,Bulgarien,20.5,21.6%
4,Dänemark,35.4,37.2


In [39]:
# Save the updated DataFrame to a new CSV file
output_file_path = '2.renewable_energy_with_impurities_stage2.csv'
df.to_csv(output_file_path, index=False)

In [13]:
# Cleaning Data 

In [12]:
# Load the dataset into a DataFrame
# Specify the file path
file_path = '2.renewable_energy_with_impurities_stage2.csv'

# Load the scraped data from the CSV file
df = pd.read_csv(file_path)

In [14]:
# Print data types of each column
print("\nData types:")
print(df.dtypes)


Data types:
Country    object
2018       object
2019       object
dtype: object


In [15]:
# Changing the '2018' and '2019' columns to float data types
# This requires first removing any non-numeric characters (like the "%") from these columns
df['2018'] = df['2018'].replace('[^\d.]', '', regex=True).astype(float)
df['2019'] = df['2019'].replace('[^\d.]', '', regex=True).astype(float)

# Displaying the data types of the columns after the change
df.dtypes


Country     object
2018       float64
2019       float64
dtype: object

In [16]:
def ensure_countries_start_uppercase(df):
    """
    Ensures that country names in the 'Land' column start with an uppercase letter.
    Converts them to title case if they don't.
    """
    # Convert country names to title case if they don't start with an uppercase letter
    df['Country'] = df['Country'].apply(lambda x: x.title() if isinstance(x, str) and not x.istitle() else x)

    return df

# Applying the function to the DataFrame
df = ensure_countries_start_uppercase(df)

# Display the DataFrame to verify the changes
print("Country names after ensuring they start with an uppercase letter:")
print(sorted(df['Country'].unique()))

Country names after ensuring they start with an uppercase letter:
['Albanien', 'Belgien', 'Bulgarien', 'Deutschland', 'Dänemark', 'Estland', 'Europäische Union', 'Finnland', 'Frankreich', 'Griechenland', 'Irland', 'Island', 'Italien', 'Kosovo', 'Kroatien', 'Lettland', 'Litauen', 'Luxemburg', 'Malta', 'Niederlande', 'Polen', 'Portugal', 'Rumänien', 'Schweden', 'Slowakei', 'Slowenien', 'Spanien', 'Tschechien', 'Ungarn', 'Vereinigtes Königreich', 'Zypern', 'Österreich']


In [None]:
# Dictionary for translating German country names to English
german_to_english_countries = {
    "Europäische Union": "European Union",
    "Albanien": "Albania",
    "Belgien": "Belgium",
    "Bulgarien": "Bulgaria",
    "Dänemark": "Denmark",
    "Deutschland": "Germany",
    "Estland": "Estonia",
    "Finnland": "Finland",
    "Frankreich": "France",
    "Griechenland": "Greece",
    "Irland": "Ireland",
    "Island": "Iceland",
    "Italien": "Italy",
    "Kosovo": "Kosovo",
    "Kroatien": "Croatia",
    "Lettland": "Latvia",
    "Litauen": "Lithuania",
    "Luxemburg": "Luxembourg",
    "Malta": "Malta",
    "Niederlande": "Netherlands",
    "Österreich": "Austria",
    "Polen": "Poland",
    "Portugal": "Portugal",
    "Rumänien": "Romania",
    "Schweden": "Sweden",
    "Slowakei": "Slovakia",
    "Slowenien": "Slovenia",
    "Spanien": "Spain",
    "Tschechien": "Czech Republic",
    "Ungarn": "Hungary",
    "Vereinigtes Königreich": "United Kingdom",
    "Zypern": "Cyprus"
}


In [None]:
# Replace German country names with English names in the DataFrame
df['Country'] = df['Country'].replace(german_to_english_countries)
print(sorted(df['Country'].unique()))

In [None]:
# Transform df to have a 'Year' column
df = df.melt(id_vars=['Country'], var_name='Year', value_name='RenewableData')
df['Year'] = df['Year'].astype(int)


In [None]:
print(df.head())

In [None]:
# Check for Duplicates
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")


In [None]:
#Display basic information about the DataFrame
print("DataFrame Info:")
print(df.info())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Check for duplicate rows
duplicate_rows = df.duplicated().sum()
print("\nNumber of Duplicate Rows:", duplicate_rows)

# Descriptive statistics for numerical columns
print("\nDescriptive Statistics for Numerical Columns:")
print(df.describe())

# Checking unique values for potential inconsistencies in categorical data
print("\nUnique Values in Categorical Columns:")
for col in df.select_dtypes(include=['object']).columns:
    print(f"\nUnique values in {col}:")
    print(df[col].unique())

In [None]:
# Add aggregated  Average Renewable Energy Data Coulmn by Country

In [None]:
# Calculate the average renewable data across the years for each country
average_data = df.groupby('Country')['RenewableData'].mean().reset_index()
average_data['Average Renewable Data'] = average_data['RenewableData'].round(2) # round to 2 decimal places

# Merge the average data back into the original DataFrame
df = df.merge(average_data[['Country', 'Average Renewable Data']], on='Country', how='left')

# Show the first few rows of the DataFrame sorted by average renewable data
print(df.head())

In [40]:
# Save the updated DataFrame to a new CSV file
output_file_path = '3.cleaned_renewable_energy_stage3.csv'
df.to_csv(output_file_path, index=False)