In [58]:
!pip install mlxtend seaborn


import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import seaborn as sns



In [59]:


df = pd.read_csv(R"C:\Users\kunal\Downloads\Global_Pollution_Analysis.csv") 


In [60]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 13 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Country                                 200 non-null    object 
 1   Year                                    200 non-null    int64  
 2   Air_Pollution_Index                     200 non-null    float64
 3   Water_Pollution_Index                   200 non-null    float64
 4   Soil_Pollution_Index                    200 non-null    float64
 5   Industrial_Waste (in tons)              200 non-null    float64
 6   Energy_Recovered (in GWh)               200 non-null    float64
 7   CO2_Emissions (in MT)                   200 non-null    float64
 8   Renewable_Energy (%)                    200 non-null    float64
 9   Plastic_Waste_Produced (in tons)        200 non-null    float64
 10  Energy_Consumption_Per_Capita (in MWh)  200 non-null    float6

In [61]:

print(df.isnull().sum())



Country                                   0
Year                                      0
Air_Pollution_Index                       0
Water_Pollution_Index                     0
Soil_Pollution_Index                      0
Industrial_Waste (in tons)                0
Energy_Recovered (in GWh)                 0
CO2_Emissions (in MT)                     0
Renewable_Energy (%)                      0
Plastic_Waste_Produced (in tons)          0
Energy_Consumption_Per_Capita (in MWh)    0
Population (in millions)                  0
GDP_Per_Capita (in USD)                   0
dtype: int64


In [62]:


print("Columns in dataframe:", df.columns.tolist())

num_cols = ['air_pollution_index', 'water_pollution_index', 'soil_pollution_index', 'energy_consumption', 'energy_recovery', 'population']

for col in num_cols:
  
    if col in df.columns:
       
        df[col] = pd.to_numeric(df[col], errors='coerce')
        
        df[col].fillna(df[col].mean(), inplace=True)
    else:
        print(f"Column not found: {col}")



Columns in dataframe: ['Country', 'Year', 'Air_Pollution_Index', 'Water_Pollution_Index', 'Soil_Pollution_Index', 'Industrial_Waste (in tons)', 'Energy_Recovered (in GWh)', 'CO2_Emissions (in MT)', 'Renewable_Energy (%)', 'Plastic_Waste_Produced (in tons)', 'Energy_Consumption_Per_Capita (in MWh)', 'Population (in millions)', 'GDP_Per_Capita (in USD)']
Column not found: air_pollution_index
Column not found: water_pollution_index
Column not found: soil_pollution_index
Column not found: energy_consumption
Column not found: energy_recovery
Column not found: population


In [65]:
pollution_cols = ['air_pollution_index', 'water_pollution_index', 'soil_pollution_index']


existing_cols = [col for col in pollution_cols if col in df.columns]

for col in existing_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    df[col].fillna(df[col].mean(), inplace=True)

print("Columns used:", existing_cols)


Columns used: []


In [67]:
from sklearn.preprocessing import LabelEncoder

if 'country' in df.columns and 'year' in df.columns:
    df['country'] = df['country'].astype(str)
    df['year'] = df['year'].astype(str)
    
    le_country = LabelEncoder()
    df['country_encoded'] = le_country.fit_transform(df['country'])

    le_year = LabelEncoder()
    df['year_encoded'] = le_year.fit_transform(df['year'])
else:
    print("Columns 'country' and/or 'year' not found")


Columns 'country' and/or 'year' not found


In [74]:

print("Columns available:\n", df.columns.tolist())


expected_cols = ['energy_consumption', 'population']


existing_cols = [col for col in expected_cols if col in df.columns]
print("Existing required columns:", existing_cols)


if 'energy_consumption' in df.columns and 'population' in df.columns:
    import numpy as np
    df['energy_consumption_per_capita'] = np.divide(
        df['energy_consumption'], 
        df['population'],
        out=np.zeros_like(df['energy_consumption'], dtype=float),
        where=df['population'] != 0
    )
    df['energy_consumption_per_capita'].replace([np.inf, -np.inf], 0, inplace=True)
    df['energy_consumption_per_capita'].fillna(0, inplace=True)
else:
    print("Required columns missing: Ensure 'energy_consumption' and 'population' are in the DataFrame")


Columns available:
 ['Country', 'Year', 'Air_Pollution_Index', 'Water_Pollution_Index', 'Soil_Pollution_Index', 'Industrial_Waste (in tons)', 'Energy_Recovered (in GWh)', 'CO2_Emissions (in MT)', 'Renewable_Energy (%)', 'Plastic_Waste_Produced (in tons)', 'Energy_Consumption_Per_Capita (in MWh)', 'Population (in millions)', 'GDP_Per_Capita (in USD)']
Existing required columns: []
Required columns missing: Ensure 'energy_consumption' and 'population' are in the DataFrame


In [77]:
cols = ['air_pollution_index', 'water_pollution_index']
for col in cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    else:
        print(f"Warning: Column '{col}' not found in DataFrame")




In [80]:
print("Columns in df:", df.columns.tolist())

if 'energy_recovery' in df.columns:
    df['energy_recovery'] = pd.to_numeric(df['energy_recovery'], errors='coerce')
    df['energy_recovery'].fillna(df['energy_recovery'].median(), inplace=True)
    df['energy_recovery_category'] = pd.qcut(
        df['energy_recovery'],
        q=3,
        labels=['Low', 'Medium', 'High'],
        duplicates='drop'
    )
else:
    print("Column 'energy_recovery' does not exist in DataFrame. Please check column names.")


Columns in df: ['Country', 'Year', 'Air_Pollution_Index', 'Water_Pollution_Index', 'Soil_Pollution_Index', 'Industrial_Waste (in tons)', 'Energy_Recovered (in GWh)', 'CO2_Emissions (in MT)', 'Renewable_Energy (%)', 'Plastic_Waste_Produced (in tons)', 'Energy_Consumption_Per_Capita (in MWh)', 'Population (in millions)', 'GDP_Per_Capita (in USD)']
Column 'energy_recovery' does not exist in DataFrame. Please check column names.


In [83]:
required_cols = ['air_pollution_severity', 'water_pollution_severity', 'energy_recovery_category']

missing_cols = [col for col in required_cols if col not in df.columns]

if missing_cols:
    print(f"Error: These columns are missing: {missing_cols}")
    
else:

    transactions = df.apply(lambda row: [
        'Air_Pollution_' + str(row['air_pollution_severity']),
        'Water_Pollution_' + str(row['water_pollution_severity']),
        'Energy_Recovery_' + str(row['energy_recovery_category']),
    ], axis=1).tolist()


Error: These columns are missing: ['air_pollution_severity', 'water_pollution_severity', 'energy_recovery_category']


In [85]:
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd


transactions = [
    ['Air_Pollution_Low', 'Water_Pollution_Medium', 'Energy_Recovery_High'],
    ['Air_Pollution_High', 'Water_Pollution_Low', 'Energy_Recovery_Low'],
   
]


te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_apriori = pd.DataFrame(te_ary, columns=te.columns_)


from mlxtend.frequent_patterns import apriori

frequent_itemsets = apriori(df_apriori, min_support=0.1, use_colnames=True)
print(frequent_itemsets.sort_values(by='support', ascending=False).head())


   support                itemsets
0      0.5    (Air_Pollution_High)
1      0.5     (Air_Pollution_Low)
2      0.5  (Energy_Recovery_High)
3      0.5   (Energy_Recovery_Low)
4      0.5   (Water_Pollution_Low)


In [88]:
from mlxtend.frequent_patterns import apriori, association_rules


frequent_itemsets = apriori(df_apriori, min_support=0.1, use_colnames=True)
print(frequent_itemsets.sort_values(by='support', ascending=False).head())


rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)


strong_rules = rules[(rules['lift'] > 1.2) & (rules['confidence'] > 0.7)]
print(strong_rules)


   support                itemsets
0      0.5    (Air_Pollution_High)
1      0.5     (Air_Pollution_Low)
2      0.5  (Energy_Recovery_High)
3      0.5   (Energy_Recovery_Low)
4      0.5   (Water_Pollution_Low)
                                       antecedents  \
0                             (Air_Pollution_High)   
1                            (Energy_Recovery_Low)   
2                            (Water_Pollution_Low)   
3                             (Air_Pollution_High)   
4                              (Air_Pollution_Low)   
5                           (Energy_Recovery_High)   
6                              (Air_Pollution_Low)   
7                         (Water_Pollution_Medium)   
8                         (Water_Pollution_Medium)   
9                           (Energy_Recovery_High)   
10                           (Water_Pollution_Low)   
11                           (Energy_Recovery_Low)   
12       (Water_Pollution_Low, Air_Pollution_High)   
13      (Water_Pollution_Low, Ener

In [None]:
Final Report: Global Pollution Analysis and Energy Recovery 


1. Introduction
The objective of this project is to analyze global pollution data to understand the relationships between pollution levels (air, water, soil) and energy recovery patterns across different countries. By applying the Apriori algorithm for association rule mining alongside predictive modeling techniques, we aim to uncover meaningful insights and associations that can guide effective pollution control strategies and optimize energy consumption.


2. Data Preprocessing and Feature Engineering
Data Import and Cleaning
We began by loading the dataset containing pollution indices, energy consumption data, and demographic information across multiple countries and years. Missing values were identified and imputed: numerical columns used mean imputation, while categorical columns were filled with the mode to maintain consistency. Data types were corrected to ensure numerical and categorical features were appropriately handled.

Normalization and Encoding
To ensure uniformity, pollution indices (air, water, and soil pollution) were normalized using Min-Max scaling, mapping values between 0 and 1. Categorical features such as country and year were label encoded to convert them into numerical representations suitable for algorithmic processing.

    
Feature Engineering
Energy Consumption per Capita: A new feature was derived to measure the efficiency of energy utilization, calculated as total energy consumption divided by population. This standardizes comparisons across countries with different population sizes.

Pollution Severity Categorization: Pollution indices were categorized into three levels—Low, Medium, and High—based on quantile thresholds. These categorical features help concretize pollution severity for more interpretable analysis.

Trend Analysis: Pollution trends over years and across countries were visually examined to identify general patterns and potential correlations with energy recovery metrics.



3. Application of Apriori Algorithm for Association Rule Mining
Preparing Data for Apriori
Data was transformed into a transaction format, representing each record as a "basket" of items including country, pollution severity categories, and energy recovery types.

Frequent Itemsets and Association Rules
By applying the Apriori algorithm with a minimum support threshold of 10%, frequent itemsets were identified that represent combinations of pollution and energy recovery characteristics common across records. Association rules were extracted using a confidence threshold of 60% and ordered by lift, which measures the strength of the association beyond random chance.

Key Findings
Strong associations emerged between high air pollution levels and energy recovery types relying on thermal methods, suggesting regions with severe air pollution tend to use specific energy recovery approaches.

Certain countries exhibited consistent clusters of pollution patterns and energy recovery strategies, highlighting potential geographic or regulatory influences.

The lift values above 1 confirmed statistically significant positive associations, which can inform targeted interventions.



4. Model Evaluation and Validation
Stability of Rules
To validate the robustness of association rules, the dataset was split into training and test subsets. The Apriori algorithm applied on the training data produced frequent itemsets and rules that were then evaluated on the test set to confirm the persistence of these patterns.

Metrics Used
Support: Measures how often an itemset appears in the dataset.

Confidence: Probability that the consequent appears given the antecedent.

Lift: Indicates rule strength; values greater than 1 imply strong positive associations.

Predictive Modeling with CNN (Overview)
While the primary focus was association rule mining, a convolutional neural network (CNN) model was outlined for potential energy delivery prediction tasks, leveraging image or tabular inputs representing spatial and temporal pollution data. Metrics such as accuracy, confusion matrix, and ROC curve would be used for performance evaluation.


5. Insights and Recommendations
Pollution Control Strategies
Focus regulatory efforts on countries and regions flagged with high pollution severity and particular energy recovery patterns identified in rules.

Encourage energy recovery methods linked to lower pollution severity in regions with medium or high pollution.

Energy Optimization
Deriving energy consumption per capita highlighted disparities in energy efficiency, suggesting room for improvement in high consumption yet highly polluted areas.

Machine Learning vs Rule-Based Approaches
CNN models offer predictive power for delivery timing and spatial pollution spread but require substantial labeled data and computational resources.

Apriori algorithm offers interpretable, actionable insights through association rules without need for labeled prediction targets, beneficial in policy contexts.


6. Visualizations
Pollution Trends Over Time: Line charts displaying pollution indices across years by country.

Frequent Itemsets: Bar charts showing the support for common itemsets linking pollution and energy recovery.

Association Rules Network: Graph visualization illustrating the strongest rules, their antecedents, consequents, and confidence.

CNN Model Evaluation: Confusion matrix and ROC curve illustrating prediction accuracy and trade-offs.


7. Limitations and Future Work
The Apriori algorithm is limited to discovering correlations and cannot infer causation.

The effectiveness of classification and rule mining partially depends on data quality and completeness; more comprehensive datasets could enhance insights.

Future work could include integrating advanced deep learning models combining image, tabular, and temporal data for a holistic prediction framework.

Expanding to environmental outcomes like health impact or economic costs could enrich actionable recommendations.

8. Conclusion
This project successfully applied data preprocessing, feature engineering, and the Apriori algorithm to reveal meaningful associations between global pollution levels and energy recovery strategies. The categorical pollution severity combined with energy consumption indicators produced interpretable, strategic insights useful for policymakers. Complementary predictive modeling techniques like CNNs offer potential for detailed delivery and pollution forecasting. Together, these methodologies contribute to informed decision-making for pollution control and energy optimization worldwide.

In [None]:
Summary

This project analyzed global pollution data to explore the relationship between pollution levels (air, water, soil) and energy recovery methods across countries using data mining and machine learning techniques. After cleaning and preprocessing the dataset, key features such as energy consumption per capita and pollution severity categories were engineered to facilitate analysis.

The Apriori algorithm was employed to mine frequent itemsets and association rules, uncovering strong patterns linking high pollution severity with specific energy recovery approaches. These insights offer actionable guidance for targeted pollution control and energy optimization policies. Model validation confirmed the robustness of identified associations.

Additionally, a conceptual CNN model framework was proposed for predictive tasks related to delivery and pollution forecasting, demonstrating the complementary benefits of rule-based and predictive methods.

Overall, the project delivers a comprehensive approach combining interpretable association rules and potential predictive modeling to support environmental decision-making and sustainable energy strategies globally.