In [54]:
import pandas as pd
import numpy as np

# Load dataset
file_path = r"C:\Users\kunal\Downloads\Global_Pollution_Analysis.csv"
df = pd.read_csv(file_path)

# Initial data overview
print(df.info())
print(df.head())

# Handling missing data: Impute numerical with median, categorical with mode
for col in df.select_dtypes(include=np.number).columns:
    df[col].fillna(df[col].median(), inplace=True)

for col in df.select_dtypes(include='object').columns:
    df[col].fillna(df[col].mode()[0], inplace=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 13 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Country                                 200 non-null    object 
 1   Year                                    200 non-null    int64  
 2   Air_Pollution_Index                     200 non-null    float64
 3   Water_Pollution_Index                   200 non-null    float64
 4   Soil_Pollution_Index                    200 non-null    float64
 5   Industrial_Waste (in tons)              200 non-null    float64
 6   Energy_Recovered (in GWh)               200 non-null    float64
 7   CO2_Emissions (in MT)                   200 non-null    float64
 8   Renewable_Energy (%)                    200 non-null    float64
 9   Plastic_Waste_Produced (in tons)        200 non-null    float64
 10  Energy_Consumption_Per_Capita (in MWh)  200 non-null    float6

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [55]:
from sklearn.preprocessing import StandardScaler

pollution_cols = ['air_pollution_index', 'water_pollution_index', 'soil_pollution_index']
scaler = StandardScaler()
df[pollution_cols] = scaler.fit_transform(df[pollution_cols])


KeyError: "None of [Index(['air_pollution_index', 'water_pollution_index', 'soil_pollution_index'], dtype='object')] are in the [columns]"

In [None]:
from sklearn.preprocessing import LabelEncoder

le_country = LabelEncoder()
le_year = LabelEncoder()

df['country'] = le_country.fit_transform(df['country'])
df['year'] = le_year.fit_transform(df['year'])


In [None]:
# Energy consumption per capita
df['energy_per_capita'] = df['energy_consumption'] / df['population']

# Yearly pollution trend (change in pollution over years per country)
df = df.sort_values(['country', 'year'])
df['pollution_trend'] = df.groupby('country')['air_pollution_index'].diff().fillna(0)


In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

features = ['air_pollution_index', 'water_pollution_index', 'soil_pollution_index', 'energy_per_capita']

X = df[features]

# Elbow method to find optimal clusters
inertia = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(X)
    inertia.append(km.inertia_)

plt.plot(range(1, 11), inertia, marker='o')
plt.title("Elbow Method")
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.show()

# Fit KMeans with optimal k (e.g., k=3)
kmeans = KMeans(n_clusters=3, random_state=42)
df['kmeans_cluster'] = kmeans.fit_predict(X)


In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

sample_data = X.sample(n=500, random_state=42)  # sample for dendrogram if large data
linked = linkage(sample_data, method='ward')

plt.figure(figsize=(10, 7))
dendrogram(linked, truncate_mode='level', p=5)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample index or cluster size')
plt.ylabel('Distance')
plt.show()

from sklearn.cluster import AgglomerativeClustering

agg = AgglomerativeClustering(n_clusters=3)
df['hierarchical_cluster'] = agg.fit_predict(X)


In [None]:
from sklearn.model_selection import train_test_split

target = 'energy_recovered_gwh'
features_nn = ['air_pollution_index', 'water_pollution_index', 'soil_pollution_index', 'co2_emissions', 'industrial_waste']

X_nn = df[features_nn]
y_nn = df[target]

X_train, X_test, y_train, y_test = train_test_split(X_nn, y_nn, test_size=0.2, random_state=42)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_pred = model.predict(X_test).flatten()

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse:.4f}, MAE: {mae:.4f}, R^2: {r2:.4f}')

plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title("Model Loss During Training")
plt.xlabel("Epoch")
plt.ylabel("Loss (MSE)")
plt.legend()
plt.show()


In [None]:
 Reporting and Insights
Compare clustering results: Analyze cluster characteristics, showing pollution levels and energy recovery.

Interpret neural network predictions: Discuss model performance and potential improvements.

Actionable insights: Recommend pollution reduction policies based on clusters; forecast energy recovery impact.

Visualizations: Include elbow plots, dendrograms, cluster scatter plots, and training curves.

Final Deliverables
Jupyter Notebook (.ipynb) with all code and visualizations.

Embedded charts for clustering and neural network training.

Written report summarizing methodology, results, key findings, and practical recommendations.