1. Data Import and Cleaning

First, let's import the necessary libraries and load the dataset:


In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the dataset
# `url` records where the database archive is published. Nothing is downloaded
# here: the CSV is read from the working directory.
url = "https://wri-dataportal-prod.s3.amazonaws.com/manual/global_power_plant_database_v_1_3.zip"
# For local file:
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up with mixed types.
df = pd.read_csv('global_power_plant_database.csv', low_memory=False)

# Display basic info
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
df.info()
print(df.head())

Handling Missing Values

# Check for missing values
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values[missing_values > 0])

# Handle missing values
# For numerical columns, we'll fill with median
# NOTE(review): the generation_gwh_* columns are imputed as well, so later
# `notna()` filters on them no longer separate plants with reported generation
# from plants without -- confirm that this is intended.
numerical_cols = ['capacity_mw', 'latitude', 'longitude', 'generation_gwh_2013',
                  'generation_gwh_2014', 'generation_gwh_2015', 'generation_gwh_2016',
                  'generation_gwh_2017', 'generation_gwh_2018', 'generation_gwh_2019']

for col in numerical_cols:
    if col in df.columns:
        # Coerce first so unparseable entries become NaN and are imputed too.
        df[col] = pd.to_numeric(df[col], errors='coerce')
        # Assign the filled Series back instead of calling
        # `df[col].fillna(..., inplace=True)`: that chained in-place form acts
        # on a temporary object, warns on pandas 2.x and leaves `df` unchanged
        # once Copy-on-Write is enabled.
        df[col] = df[col].fillna(df[col].median())

# For categorical columns, fill with mode or 'Unknown'
categorical_cols = ['primary_fuel', 'owner', 'source', 'url', 'geolocation_source']
for col in categorical_cols:
    if col in df.columns:
        # mode() is empty for an all-missing column, hence the 'Unknown' fallback.
        fill_value = df[col].mode()[0] if df[col].notna().any() else 'Unknown'
        df[col] = df[col].fillna(fill_value)

# Convert relevant columns to numerical types using NumPy dtypes
df['capacity_mw'] = df['capacity_mw'].astype(np.float64)
if 'commissioning_year' in df.columns:
    # Unparseable years become NaN; they are deliberately not imputed here.
    df['commissioning_year'] = pd.to_numeric(
        df['commissioning_year'], errors='coerce'
    ).astype(np.float64)



2. Exploratory Data Analysis
Key Statistics

In [None]:
# Overview table covering every numerical column
print(df.describe())

# Capacity figures computed explicitly with NumPy
capacity_values = df['capacity_mw']
capacity_q25, capacity_q75 = np.percentile(capacity_values, [25, 75])

print("\nCapacity statistics:")
print(f"Mean capacity: {np.mean(capacity_values):.2f} MW")
print(f"Median capacity: {np.median(capacity_values):.2f} MW")
print(f"Standard deviation: {np.std(capacity_values):.2f} MW")
print(f"25th percentile: {capacity_q25:.2f} MW")
print(f"75th percentile: {capacity_q75:.2f} MW")

Distribution by Country and Fuel Type

# Plant counts per country, keeping the ten most frequent
country_dist = df['country'].value_counts().iloc[:10]

# Plant counts per primary fuel
fuel_dist = df['primary_fuel'].value_counts()

# Installed capacity summed per primary fuel, largest first
fuel_capacity = (
    df.groupby('primary_fuel')['capacity_mw']
    .sum()
    .sort_values(ascending=False)
)

# Print each table under its heading, in the order the tables were built
for heading, table in (
    ("\nTop 10 countries by number of power plants:", country_dist),
    ("\nPower plants by primary fuel type:", fuel_dist),
    ("\nTotal capacity by primary fuel type (MW):", fuel_capacity),
):
    print(heading)
    print(table)

3. Statistical Analysis

In [None]:
# Analyze generation data (using 2019 as example)
if 'generation_gwh_2019' in df.columns:
    # Filter out plants with no generation data
    gen_data = df[df['generation_gwh_2019'].notna()]

    # Group by fuel type and calculate statistics.
    # Aggregations are named by string: handing NumPy callables such as np.mean
    # to agg() is deprecated in recent pandas and emits a FutureWarning. The
    # resulting column labels (mean, median, std, size) are the same.
    fuel_stats = gen_data.groupby('primary_fuel')['generation_gwh_2019'].agg(
        ['mean', 'median', 'std', 'size']
    )
    print("\nGeneration statistics by fuel type (2019):")
    print(fuel_stats)

    # Hypothesis testing - compare two fuel types (e.g., Coal vs Hydro)
    coal = gen_data[gen_data['primary_fuel'] == 'Coal']['generation_gwh_2019']
    hydro = gen_data[gen_data['primary_fuel'] == 'Hydro']['generation_gwh_2019']

    # With fewer than two observations in a group the test statistic is NaN,
    # and `NaN < 0.05` is False, which would be reported as "not statistically
    # significant" although no test was actually possible.
    if coal.count() < 2 or hydro.count() < 2:
        t_stat = p_value = np.nan
        print("\nT-test between Coal and Hydro plants skipped: "
              "each group needs at least two generation values.")
    else:
        # Welch's t-test (equal_var=False) does not assume equal variances.
        t_stat, p_value = stats.ttest_ind(coal, hydro, equal_var=False, nan_policy='omit')
        print(f"\nT-test between Coal and Hydro plants:")
        print(f"T-statistic: {t_stat:.2f}, P-value: {p_value:.4f}")

        if p_value < 0.05:
            print("The difference in mean generation is statistically significant.")
        else:
            print("The difference in mean generation is not statistically significant.")

4. Time Series Analysis

In [None]:
if 'commissioning_year' in df.columns:
    # Keep rows whose commissioning year lies in 1900-2020 (both ends included);
    # rows with a missing year fail the comparison and are dropped.
    valid_years = df[df['commissioning_year'].between(1900, 2020)]

    # Number of plants per commissioning year, in chronological order
    plants_by_year = valid_years['commissioning_year'].value_counts().sort_index()

    # Capacity summed per commissioning year
    capacity_by_year = valid_years.groupby('commissioning_year')['capacity_mw'].sum()

    # Two panels side by side: plant counts on the left, capacity on the right
    fig, (ax_count, ax_capacity) = plt.subplots(1, 2, figsize=(12, 6))

    plants_by_year.plot(ax=ax_count)
    ax_count.set_title('Number of Power Plants Commissioned by Year')
    ax_count.set_xlabel('Year')
    ax_count.set_ylabel('Count')

    capacity_by_year.plot(ax=ax_capacity)
    ax_capacity.set_title('Total Capacity Added by Year (MW)')
    ax_capacity.set_xlabel('Year')
    ax_capacity.set_ylabel('Capacity (MW)')

    fig.tight_layout()
    plt.show()


if 'commissioning_year' in df.columns:
    # Build the 1900-2020 subset and attach the decade to that same subset.
    # Writing the decade only onto `df` and then pivoting a separately filtered
    # frame fails with KeyError: 'decade', because the filtered frame never
    # receives the column.
    in_range = df['commissioning_year'].between(1900, 2020)
    decade_frame = df.loc[in_range].copy()

    # Create decade bins: floor each year to the start of its decade
    # (e.g. 1987 -> 1980). The subset has no missing years, so the integer
    # cast is safe and gives clean axis labels.
    decade_frame['decade'] = ((decade_frame['commissioning_year'] // 10) * 10).astype(int)

    # Keep a `decade` column on df as well; rows outside the year range get NaN
    # through index alignment.
    df['decade'] = decade_frame['decade']

    # Pivot table for fuel type by decade.
    # aggfunc is given by name: passing np.sum is deprecated in recent pandas.
    fuel_evolution = pd.pivot_table(decade_frame,
                                    values='capacity_mw',
                                    index='decade',
                                    columns='primary_fuel',
                                    aggfunc='sum',
                                    fill_value=0)

    if fuel_evolution.empty:
        print("No plants with a commissioning year between 1900 and 2020; "
              "skipping the fuel evolution chart.")
    else:
        # Plot stacked area chart.
        # DataFrame.plot opens a figure of its own unless it is given an Axes,
        # so the Axes is passed explicitly; otherwise an empty extra figure
        # is shown.
        fig, ax = plt.subplots(figsize=(12, 6))
        fuel_evolution.plot(kind='area', stacked=True, alpha=0.7, ax=ax)
        ax.set_title('Evolution of Power Generation Capacity by Fuel Type')
        ax.set_xlabel('Decade')
        ax.set_ylabel('Total Capacity (MW)')
        ax.legend(title='Fuel Type', bbox_to_anchor=(1.05, 1), loc='upper left')
        fig.tight_layout()
        plt.show()



5. Advanced Visualization
Geographical Distribution



In [None]:
# Scatter every plant at its longitude/latitude (a map without a basemap).
# Colour carries the raw capacity; marker area grows with log(capacity + 1).
marker_sizes = 5 * np.log(df['capacity_mw'] + 1)

fig, ax = plt.subplots(figsize=(15, 10))
points = ax.scatter(
    df['longitude'],
    df['latitude'],
    c=df['capacity_mw'],
    s=marker_sizes,
    alpha=0.5,
    cmap='viridis',
)
fig.colorbar(points, ax=ax, label='Capacity (MW)')
ax.set_title('Global Distribution of Power Plants (Size by Capacity)')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
plt.show()

# One box per primary fuel showing how plant capacity is distributed
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df, x='primary_fuel', y='capacity_mw', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)
# Logarithmic y-axis because a few very large plants dominate a linear one
ax.set_yscale('log')
ax.set_title('Power Plant Capacity Distribution by Fuel Type')
ax.set_xlabel('Primary Fuel Type')
ax.set_ylabel('Capacity (MW, log scale)')
plt.show()

Capacity vs Generation Scatter Plot

if 'generation_gwh_2019' in df.columns:
    # Capacity against 2019 generation on log-log axes, coloured by fuel
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(
        data=df,
        x='capacity_mw',
        y='generation_gwh_2019',
        hue='primary_fuel',
        alpha=0.6,
        ax=ax,
    )
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_title('Capacity vs Generation (2019) by Fuel Type')
    ax.set_xlabel('Capacity (MW, log scale)')
    ax.set_ylabel('Generation (GWh, log scale)')
    # Place the legend outside the plotting area, to the right
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    plt.show()



6. Matrix Operations in Real-World Context

In [None]:
# Restrict the frame to its numeric columns; the correlation is computed on these
numerical_data = df.select_dtypes(include=[np.number])
if 'generation_gwh_2019' in numerical_data.columns:
    # Pairwise correlation between all numeric columns
    corr_matrix = numerical_data.corr()

    # Boolean mask that hides the upper triangle (diagonal included), so each
    # pair of variables is drawn only once
    upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool))

    # Annotated heatmap of the remaining lower triangle
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        corr_matrix,
        mask=upper_triangle,
        annot=True,
        fmt=".2f",
        cmap='coolwarm',
        ax=ax,
    )
    ax.set_title('Correlation Matrix of Numerical Variables')
    plt.show()

# Principal Component Analysis (dimensionality reduction)

# Prepare data for PCA
if 'generation_gwh_2019' in numerical_data.columns:
    # A column without a single value would make a plain dropna() discard every
    # row, so such columns are removed before incomplete rows are dropped.
    pca_frame = numerical_data.dropna(axis=1, how='all').dropna()

    # PCA(n_components=2) needs at least two samples and two features.
    if pca_frame.shape[0] < 2 or pca_frame.shape[1] < 2:
        print("Skipping PCA: at least two complete rows and two numeric columns are required.")
    else:
        # Standardise each column so that no variable dominates through its scale.
        pca_data = StandardScaler().fit_transform(pca_frame)

        # Perform PCA
        pca = PCA(n_components=2)
        principal_components = pca.fit_transform(pca_data)

        # Plot results
        plt.figure(figsize=(10, 8))
        plt.scatter(principal_components[:, 0], principal_components[:, 1], alpha=0.5)
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')
        plt.title('PCA of Power Plant Data')
        plt.show()

        # Print explained variance
        explained_ratio = pca.explained_variance_ratio_
        print(f"Explained variance by PC1: {explained_ratio[0]:.2f}")
        print(f"Explained variance by PC2: {explained_ratio[1]:.2f}")
        print(f"Total explained variance: {explained_ratio.sum():.2f}")

        # Eigenvalues and eigenvectors
        print("\nEigenvalues:")
        print(pca.explained_variance_)
        print("\nEigenvectors:")
        print(pca.components_)

7. Integrating NumPy with Pandas and Matplotlib

In [None]:

# Example: plants whose capacity is in the top 10% and whose generation
# efficiency lies above the median
if 'generation_gwh_2019' in df.columns:
    capacity_threshold = np.percentile(df['capacity_mw'], 90)

    # Capacity factor: 2019 generation divided by what the plant would produce
    # running at full capacity for 24 * 365 hours (the / 1000 presumably
    # converts MWh to GWh, going by the column names).
    full_load_generation = df['capacity_mw'] * 24 * 365 / 1000
    df['efficiency'] = df['generation_gwh_2019'] / full_load_generation
    median_efficiency = np.nanmedian(df['efficiency'])

    # Element-wise NumPy comparisons combined into a single boolean mask
    is_large = df['capacity_mw'].to_numpy() > capacity_threshold
    is_efficient = df['efficiency'].to_numpy() > median_efficiency
    mask = is_large & is_efficient

    high_performers = df[mask]
    print(f"\nFound {len(high_performers)} high-performing power plants:")
    report_columns = ['name', 'country', 'primary_fuel', 'capacity_mw', 'efficiency']
    print(high_performers[report_columns].head())

# Sophisticated Plot with NumPy Calculations

# Kernel Density Estimation of capacity by fuel type: SciPy fits the Gaussian
# KDE, NumPy builds the evaluation grid.
top_fuels = df['primary_fuel'].value_counts().head(5).index
fuel_data = {fuel: df[df['primary_fuel'] == fuel]['capacity_mw'].values for fuel in top_fuels}

plt.figure(figsize=(12, 8))
for fuel, data in fuel_data.items():
    # gaussian_kde cannot handle NaN/inf, so keep finite values only.
    data = np.asarray(data, dtype=float)
    data = data[np.isfinite(data)]
    # A logarithmic axis can only display strictly positive capacities.
    positive = data[data > 0]

    # The estimator needs at least two points with some spread; otherwise its
    # covariance matrix is singular and the fit raises.
    if data.size < 2 or data.min() == data.max() or positive.size == 0:
        print(f"Skipping KDE for {fuel}: not enough distinct positive capacity values.")
        continue

    kde = stats.gaussian_kde(data)
    # Log-spaced grid: on a log x-axis a linearly spaced grid puts nearly all
    # of its points at the high end and leaves the low end unresolved.
    x_vals = np.geomspace(positive.min(), data.max(), 1000)
    y_vals = kde(x_vals)
    plt.plot(x_vals, y_vals, label=fuel)

plt.xscale('log')
plt.title('Kernel Density Estimation of Capacity by Fuel Type')
plt.xlabel('Capacity (MW, log scale)')
plt.ylabel('Density')
plt.legend()
plt.show()

Conclusion

This analysis demonstrates how to use NumPy, Pandas, and Matplotlib/Seaborn together, with SciPy and scikit-learn for the statistical steps, to analyze a complex real-world dataset:

- NumPy provided efficient numerical operations, statistical functions, and array manipulations that enhanced our analysis.

- Pandas offered powerful data structures and manipulation tools for cleaning, filtering, and aggregating the data.

- Matplotlib/Seaborn enabled us to create informative visualizations that reveal patterns in the data.

- SciPy supplied the t-test and the kernel density estimates, and scikit-learn supplied the scaling and PCA used for dimensionality reduction.

Key findings might include:

    The distribution of power plants by country and fuel type

    Statistical differences in output between fuel types

    Historical trends in power plant construction

    Geographical patterns in plant locations

    Correlations between different plant characteristics

The integration of these libraries allows for a comprehensive analysis workflow from data cleaning to advanced statistical analysis and visualization.