In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import seaborn as sns

# Read the pollution dataset from the local CSV export and preview its first rows.
csv_path = r"C:\Users\kunal\Downloads\Global_Pollution_Analysis.csv"
df = pd.read_csv(csv_path)
print(df.head())


ModuleNotFoundError: No module named 'mlxtend'

In [None]:

# Report how many values are missing in each column before imputing.
print(df.isnull().sum())


# Numeric columns: replace missing values with the column mean.
# The filled Series is assigned back to the frame. Calling
# ``df[col].fillna(..., inplace=True)`` operates on an intermediate object:
# it emits a FutureWarning in pandas 2.x and leaves ``df`` unchanged once
# Copy-on-Write is active.
num_cols = ['air_pollution_index', 'water_pollution_index', 'soil_pollution_index', 'energy_consumption', 'energy_recovery', 'population']
for col in num_cols:
    df[col] = df[col].fillna(df[col].mean())


# Categorical columns: replace missing values with the most frequent value
# (the first entry returned by ``mode()``).
cat_cols = ['country', 'year']
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])


In [None]:
# Rescale the three pollution indices with a single MinMaxScaler fitted on
# those columns; the scaled values overwrite the original columns in ``df``.
pollution_cols = [
    'air_pollution_index',
    'water_pollution_index',
    'soil_pollution_index',
]
scaler = MinMaxScaler()
scaled_pollution = scaler.fit_transform(df[pollution_cols])
df[pollution_cols] = scaled_pollution


In [None]:
# Fit a separate LabelEncoder per column and store the resulting codes in
# new ``*_encoded`` columns; the fitted encoders stay available by name.
le_country = LabelEncoder()
le_year = LabelEncoder()

encoding_plan = (
    ('country', 'country_encoded', le_country),
    ('year', 'year_encoded', le_year),
)
for source_col, encoded_col, encoder in encoding_plan:
    df[encoded_col] = encoder.fit_transform(df[source_col])


In [None]:
# Energy consumption per person; undefined ratios are stored as 0.
# A zero population yields +/-inf (or NaN for 0/0), so infinities are first
# converted to NaN and then every NaN is filled with 0.
# The column is built in one assignment rather than calling
# ``fillna(inplace=True)`` on ``df[...]``, which acts on an intermediate
# object and does not write back to the frame under Copy-on-Write.
df['energy_consumption_per_capita'] = (
    (df['energy_consumption'] / df['population'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)


In [None]:
def categorize_pollution(val, low_threshold=0.33, high_threshold=0.66):
    """Map a pollution value to a severity label.

    Parameters
    ----------
    val : number
        Value to classify.
    low_threshold : number, default 0.33
        Values strictly below this are labelled ``'Low'``.
    high_threshold : number, default 0.66
        Values strictly below this (and not ``'Low'``) are labelled
        ``'Medium'``; everything else is ``'High'``.

    Returns
    -------
    str
        ``'Low'``, ``'Medium'`` or ``'High'``.

    Notes
    -----
    NaN compares False against both thresholds, so it falls through to
    ``'High'``; impute missing values before calling if that is not wanted.
    """
    if val < low_threshold:
        return 'Low'
    if val < high_threshold:
        return 'Medium'
    return 'High'

# Derive a severity label column from each pollution index column by applying
# ``categorize_pollution`` element-wise (target column -> source column).
severity_sources = {
    'air_pollution_severity': 'air_pollution_index',
    'water_pollution_severity': 'water_pollution_index',
}
for severity_col, index_col in severity_sources.items():
    df[severity_col] = df[index_col].apply(categorize_pollution)


In [None]:
# Split energy recovery into three quantile-based bins labelled Low / Medium / High.
recovery_labels = ['Low', 'Medium', 'High']
df['energy_recovery_category'] = pd.qcut(
    df['energy_recovery'],
    q=3,
    labels=recovery_labels,
)


In [None]:

def _row_to_transaction(row):
    """Return the air, water and energy-recovery item labels for one row."""
    air_item = 'Air_Pollution_' + row['air_pollution_severity']
    water_item = 'Water_Pollution_' + row['water_pollution_severity']
    # The recovery category is passed through str() before concatenation.
    recovery_item = 'Energy_Recovery_' + str(row['energy_recovery_category'])
    return [air_item, water_item, recovery_item]


# One transaction (a list of three item labels) per row of the frame.
transactions = df.apply(_row_to_transaction, axis=1).tolist()


# Encode the transactions as an item-indicator matrix and wrap it in a
# DataFrame whose columns are the item labels learned by the encoder.
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
df_apriori = pd.DataFrame(data=te_ary, columns=te.columns_)


In [None]:
# Frequent Itemsets: mine itemsets with support of at least 0.1 and print
# the leading rows when ordered by descending support.
frequent_itemsets = apriori(df_apriori, min_support=0.1, use_colnames=True)
itemsets_by_support = frequent_itemsets.sort_values(by='support', ascending=False)
print(itemsets_by_support.head())

# Association Rules: derive rules with confidence of at least 0.6 and print
# the leading rows of the selected columns when ordered by descending lift.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)
rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
rules_by_lift = rules[rule_columns].sort_values(by='lift', ascending=False)
print(rules_by_lift.head())


In [None]:
# Scatter of the mined rules: support on x, confidence on y, marker size by lift.
scatter_options = dict(
    x='support',
    y='confidence',
    size='lift',
    sizes=(20, 200),
    legend=False,
)
plt.figure(figsize=(10, 6))
sns.scatterplot(data=rules, **scatter_options)
plt.xlabel('Support')
plt.ylabel('Confidence')
plt.title('Association Rules: Support vs Confidence (size by Lift)')
plt.show()
