In [2]:
# Import required libraries and dependencies
import pandas as pd

In [None]:
# Read the sustainable-energy CSV; no index column is requested here, so the
# rows keep pandas' default integer index.
df_energy = pd.read_csv("Resources/global-data-on-sustainable-energy (1).csv")

In [None]:
# Preview the first five rows of the freshly loaded data
df_energy.head(5)

### Clean Data and Rename some Columns

In [None]:
# Strip thousands separators from 'Land Area(Km2)' and store the column as float.
# The `.str` accessor only exists on text columns, so it is applied only when the
# column is not already numeric. This keeps the cell working when pandas parsed
# the column as numbers, and when the cell is re-run after a first conversion.
# This cell must run before the cell that renames the column to 'Land'.
land_area = df_energy['Land Area(Km2)']
if not pd.api.types.is_numeric_dtype(land_area):
    land_area = land_area.str.replace(',', '', regex=False)
df_energy['Land Area(Km2)'] = land_area.astype(float)

In [None]:
# Give the columns used later shorter names
column_renames = {
    'Entity': 'Country',
    'Density\n(P/Km2)': 'Population Density',
    'Land Area(Km2)': 'Land',
}
df_energy = df_energy.rename(columns=column_renames)

In [None]:
# Print the column names, non-null counts and dtypes of the cleaned/renamed frame
df_energy.info()

In [None]:
# Split out the rows for the two years that are compared later on
is_year_2000 = df_energy["Year"].eq(2000)
is_year_2020 = df_energy["Year"].eq(2020)
df_energy_2000 = df_energy[is_year_2000]
df_energy_2020 = df_energy[is_year_2020]

In [None]:
# Read the same CSV a second time, this time using the "Entity" column as the
# row index.
df_energy_data = pd.read_csv("Resources/global-data-on-sustainable-energy (1).csv", index_col="Entity")

# Show the first ten rows
df_energy_data.head(n=10)

In [None]:
# Generate summary statistics for the whole Entity-indexed frame (all years)
df_energy_data.describe()

Check for nulls in the columns of interest

In [None]:
# Print the number of missing values in each electricity-generation column
for energy_column in (
    "Electricity from fossil fuels (TWh)",
    "Electricity from nuclear (TWh)",
    "Electricity from renewables (TWh)",
):
    print(df_energy_data[energy_column].isnull().sum())


Drop Nulls

In [None]:
# Columns that must be present for a row to be kept
columns_to_clean = [
    "Electricity from fossil fuels (TWh)",
    "Electricity from nuclear (TWh)",
    "Electricity from renewables (TWh)",
]

# Discard every row that is missing any of those columns
df_energy_data_cleaned = df_energy_data.dropna(axis=0, subset=columns_to_clean)

Drop nulls from the new DataFrames for the years 2000 and 2020

In [None]:
# Apply the same null filter to the year-2000 and year-2020 slices
df_energy_data_2000_cleaned, df_energy_data_2020_cleaned = (
    year_frame.dropna(subset=columns_to_clean)
    for year_frame in (df_energy_2000, df_energy_2020)
)

Check nulls were dropped

In [None]:
# Each count printed here should be 0 now that the nulls have been dropped
for energy_column in (
    "Electricity from fossil fuels (TWh)",
    "Electricity from nuclear (TWh)",
    "Electricity from renewables (TWh)",
):
    print(df_energy_data_cleaned[energy_column].isnull().sum())

# Line plot of the Entity-indexed frame for a first look at the data.
# NOTE(review): the `.hvplot` accessor needs `import hvplot.pandas`, which does
# not appear in the import cell visible in this notebook - confirm it is imported.
df_energy_data.hvplot.line(rot=90, width=800, height=400)

---

### Prepare the Data for Year 2000

In [None]:
# Standardise the three electricity-generation columns of the year-2000 slice.
# NOTE(review): `StandardScaler` is not imported in any cell visible in this
# notebook - confirm `from sklearn.preprocessing import StandardScaler` exists.
energy_columns_2000 = [
    "Electricity from fossil fuels (TWh)",
    "Electricity from nuclear (TWh)",
    "Electricity from renewables (TWh)",
]
scaled_values_2000 = StandardScaler().fit_transform(
    df_energy_data_2000_cleaned[energy_columns_2000]
)

# Create a DataFrame with the scaled data
df_energy_data_2000_scaled = pd.DataFrame(scaled_values_2000, columns=energy_columns_2000)

# Label each row with its country name.
# `df_energy_data_2000_cleaned` descends from `df_energy`, which was read without
# `index_col`, so its index holds integer row labels rather than names. The names
# live in the "Country" column (renamed from "Entity"). `.to_numpy()` drops the
# source index so the values are assigned by position instead of being aligned
# against the fresh 0..n-1 index of the scaled frame.
df_energy_data_2000_scaled["Entity"] = df_energy_data_2000_cleaned["Country"].to_numpy()

# Use the country name as the index (kept under the name "Entity" for the plots)
df_energy_data_2000_scaled = df_energy_data_2000_scaled.set_index("Entity")

# Display sample data
df_energy_data_2000_scaled.head()

---

### Find the Best Value for k Using the Original Data.

In [None]:
# Candidate cluster counts: 1 through 10 (the upper bound of `range` is exclusive)
k = list(range(1, 11))

# For every candidate k, fit a KMeans model on the scaled year-2000 data and
# record its inertia.
# NOTE(review): `KMeans` is not imported in any cell visible in this notebook -
# confirm `from sklearn.cluster import KMeans` exists.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=1)
    .fit(df_energy_data_2000_scaled)
    .inertia_
    for n_clusters in k
]

In [None]:
# Pair each candidate k with its inertia
elbow_data = dict(k=k, inertia=inertia)

# Tabulate the pairs so they can be plotted
df_elbow = pd.DataFrame(elbow_data)

# Inertia versus k: the "elbow" of this curve suggests the number of clusters
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** 

According to the elbow method, the best value for `k` is 3.

### Cluster Countries with K-means Using the Year 2000 Data

In [None]:
# Initialise the K-Means model using the best value for k.
# `random_state=1` matches the elbow-curve loop and makes the cluster labels
# reproducible across kernel restarts (K-Means initialisation is random).
model = KMeans(n_clusters=3, random_state=1)

# Fit the model on the scaled year-2000 data and get one cluster label per row
energy_clusters = model.fit_predict(df_energy_data_2000_scaled)

# Print the resulting array of cluster values.
print(energy_clusters)

In [None]:
# New frame = scaled year-2000 data plus the predicted cluster of each row
# (`assign` returns a copy, so the scaled frame itself is left untouched)
df_energy_data_2000_scaled_predictions = df_energy_data_2000_scaled.assign(
    EnergyCluster=energy_clusters
)

# Display sample data
df_energy_data_2000_scaled_predictions.head()

In [None]:
# Year 2000: standardised fossil-fuel vs. renewable electricity.
# Points are coloured by K-Means cluster; hovering shows the "Entity" index value.
df_energy_data_2000_scaled_predictions.hvplot.scatter(
    by="EnergyCluster",
    hover_cols="Entity",
    x="Electricity from fossil fuels (TWh)",
    y="Electricity from renewables (TWh)",
)

In [None]:
# Year 2000: standardised nuclear vs. renewable electricity.
# Points are coloured by K-Means cluster; hovering shows the "Entity" index value.
df_energy_data_2000_scaled_predictions.hvplot.scatter(
    by="EnergyCluster",
    hover_cols="Entity",
    x="Electricity from nuclear (TWh)",
    y="Electricity from renewables (TWh)",
)

### Prepare the Data for Year 2020

In [None]:
# Standardise the three electricity-generation columns of the year-2020 slice.
# NOTE(review): `StandardScaler` is not imported in any cell visible in this
# notebook - confirm `from sklearn.preprocessing import StandardScaler` exists.
energy_columns_2020 = [
    "Electricity from fossil fuels (TWh)",
    "Electricity from nuclear (TWh)",
    "Electricity from renewables (TWh)",
]
scaled_values_2020 = StandardScaler().fit_transform(
    df_energy_data_2020_cleaned[energy_columns_2020]
)

# Create a DataFrame with the scaled data
df_energy_data_2020_scaled = pd.DataFrame(scaled_values_2020, columns=energy_columns_2020)

# Label each row with its country name.
# `df_energy_data_2020_cleaned` descends from `df_energy`, which was read without
# `index_col`, so its index holds integer row labels rather than names. The names
# live in the "Country" column (renamed from "Entity"). `.to_numpy()` drops the
# source index so the values are assigned by position instead of being aligned
# against the fresh 0..n-1 index of the scaled frame.
df_energy_data_2020_scaled["Entity"] = df_energy_data_2020_cleaned["Country"].to_numpy()

# Use the country name as the index (kept under the name "Entity" for the plots)
df_energy_data_2020_scaled = df_energy_data_2020_scaled.set_index("Entity")

# Display sample data
df_energy_data_2020_scaled.head()

In [None]:
# Candidate cluster counts: 1 through 10 (the upper bound of `range` is exclusive)
k = list(range(1, 11))

# For every candidate k, fit a KMeans model on the scaled year-2020 data and
# record its inertia.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=1)
    .fit(df_energy_data_2020_scaled)
    .inertia_
    for n_clusters in k
]

In [None]:
# Pair each candidate k with its inertia (year-2020 data)
elbow_data = dict(k=k, inertia=inertia)

# Tabulate the pairs so they can be plotted
df_elbow = pd.DataFrame(elbow_data)

# Inertia versus k: the "elbow" of this curve suggests the number of clusters
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [None]:
# Initialise the K-Means model using the best value for k.
# `random_state=1` matches the elbow-curve loop and makes the cluster labels
# reproducible across kernel restarts (K-Means initialisation is random).
model = KMeans(n_clusters=3, random_state=1)

# Fit the model on the scaled year-2020 data and get one cluster label per row
energy_clusters = model.fit_predict(df_energy_data_2020_scaled)

# Print the resulting array of cluster values.
print(energy_clusters)

In [None]:
# New frame = scaled year-2020 data plus the predicted cluster of each row
# (`assign` returns a copy, so the scaled frame itself is left untouched)
df_energy_data_2020_scaled_predictions = df_energy_data_2020_scaled.assign(
    EnergyCluster=energy_clusters
)

# Display sample data
df_energy_data_2020_scaled_predictions.head()

In [None]:
# Year 2020: standardised fossil-fuel vs. renewable electricity.
# Points are coloured by K-Means cluster; hovering shows the "Entity" index value.
df_energy_data_2020_scaled_predictions.hvplot.scatter(
    by="EnergyCluster",
    hover_cols="Entity",
    x="Electricity from fossil fuels (TWh)",
    y="Electricity from renewables (TWh)",
)

In [None]:
# Year 2020: standardised nuclear vs. renewable electricity.
# Points are coloured by K-Means cluster; hovering shows the "Entity" index value.
df_energy_data_2020_scaled_predictions.hvplot.scatter(
    by="EnergyCluster",
    hover_cols="Entity",
    x="Electricity from nuclear (TWh)",
    y="Electricity from renewables (TWh)",
)

---

In [None]:
# Side-by-side comparison of the fossil-fuel vs. renewables clusters, 2000 and 2020
fossil_renewables_2000_panel = df_energy_data_2000_scaled_predictions.hvplot.scatter(
    x="Electricity from fossil fuels (TWh)",
    y="Electricity from renewables (TWh)",
    by="EnergyCluster",
    hover_cols="Entity",
    title="Fossil Fuels - Renewables - Year 2000",
)
fossil_renewables_2020_panel = df_energy_data_2020_scaled_predictions.hvplot.scatter(
    x="Electricity from fossil fuels (TWh)",
    y="Electricity from renewables (TWh)",
    by="EnergyCluster",
    hover_cols="Entity",
    title="Fossil Fuels - Renewables - Year 2020",
)

# `+` places the two panels next to each other in a single layout
fossil_renewables_2000_panel + fossil_renewables_2020_panel

In [None]:
# Side-by-side comparison of the nuclear vs. renewables clusters, 2000 and 2020
nuclear_renewables_2000_panel = df_energy_data_2000_scaled_predictions.hvplot.scatter(
    x="Electricity from nuclear (TWh)",
    y="Electricity from renewables (TWh)",
    by="EnergyCluster",
    hover_cols="Entity",
    title="Nuclear - Renewables - Year 2000",
)
nuclear_renewables_2020_panel = df_energy_data_2020_scaled_predictions.hvplot.scatter(
    x="Electricity from nuclear (TWh)",
    y="Electricity from renewables (TWh)",
    by="EnergyCluster",
    hover_cols="Entity",
    title="Nuclear - Renewables - Year 2020",
)

# `+` places the two panels next to each other in a single layout
nuclear_renewables_2000_panel + nuclear_renewables_2020_panel