In [67]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder




In [68]:

df = pd.read_csv(r"C:\Users\kunal\Downloads\Global_Pollution_Analysis.csv")

print(df.head())
print(df.info())
print(df.isnull().sum())


        Country  Year  Air_Pollution_Index  Water_Pollution_Index  \
0       Hungary  2005               272.70                 124.27   
1     Singapore  2001                86.72                  60.34   
2       Romania  2016                91.59                  83.36   
3  Cook Islands  2018               280.61                  67.16   
4      Djibouti  2008               179.16                 127.53   

   Soil_Pollution_Index  Industrial_Waste (in tons)  \
0                 51.95                    94802.83   
1                117.22                    56283.92   
2                121.72                    56256.02   
3                 93.58                    74864.73   
4                121.55                    76862.06   

   Energy_Recovered (in GWh)  CO2_Emissions (in MT)  Renewable_Energy (%)  \
0                     158.14                   5.30                 41.11   
1                     498.04                   6.34                 36.44   
2                     4

In [69]:

num_cols = ['Air_Pollution_Index', 'CO2_Emissions', 'Industrial_Waste_in_tons', 
            'Energy_Consumption', 'Energy_Recovery_GWh']
cat_cols = ['Country', 'Year']



In [70]:
# Fill missing values
num_imputer = SimpleImputer(strategy='median')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])




KeyError: "['CO2_Emissions', 'Industrial_Waste_in_tons', 'Energy_Consumption', 'Energy_Recovery_GWh'] not in index"

In [None]:
# Encode categorical variables
label_enc = LabelEncoder()
df['Year'] = label_enc.fit_transform(df['Year'])  # Year as numerical
df = pd.get_dummies(df, columns=['Country'], drop_first=True)

# Normalize numerical columns
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

print("Data after preprocessing:\n", df.head())

In [None]:
# Descriptive statistics
print(df.describe())

# Heatmap for correlations
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Feature Correlation Matrix")
plt.show()



# Pollution trends over years
plt.figure(figsize=(8,5))
sns.lineplot(x='Year', y='Air_Pollution_Index', data=df)
plt.title("Air Pollution Trends Over Years")
plt.show()

# Distribution of Energy Recovery
plt.figure(figsize=(8,5))
sns.boxplot(x=df['Energy_Recovery_GWh'])
plt.title("Outliers in Energy Recovery")
plt.show()

In [None]:
# Energy Consumption per Capita (if population data exists, here using a simulated column)
if 'Population' in df.columns:
    df['Energy_Consumption_per_Capita'] = df['Energy_Consumption'] / df['Population']
else:
    df['Energy_Consumption_per_Capita'] = df['Energy_Consumption']  # placeholder

# Categorize pollution severity
df['Pollution_Level'] = pd.qcut(df['Air_Pollution_Index'], q=3, labels=['Low', 'Medium', 'High'])
print(df['Pollution_Level'].value_counts())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Features & target
X = df.drop(['Energy_Recovery_GWh', 'Pollution_Level'], axis=1)
y = df['Energy_Recovery_GWh']

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predictions
y_pred = lr_model.predict(X_test)

# Evaluation
print("Linear Regression Performance:")
print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
print(f"R²: {r2_score(y_test, y_pred):.4f}")


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score

# Prepare classification data
X_cls = df.drop(['Energy_Recovery_GWh', 'Pollution_Level'], axis=1)
y_cls = df['Pollution_Level']

# Encode target labels
y_cls_encoded = LabelEncoder().fit_transform(y_cls)

# Train/Test split
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_cls, y_cls_encoded, test_size=0.2, random_state=42)

# Model
log_model = LogisticRegression(max_iter=200, multi_class='ovr')
log_model.fit(X_train_cls, y_train_cls)

# Predictions
y_pred_cls = log_model.predict(X_test_cls)

# Performance
print("Logistic Regression Classification Report:")
print(classification_report(y_test_cls, y_pred_cls))

# Confusion matrix
cm = confusion_matrix(y_test_cls, y_pred_cls)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Low', 'Medium', 'High'], yticklabels=['Low', 'Medium', 'High'])
plt.title("Confusion Matrix — Pollution Level Classification")
plt.show()


In [None]:
# Top correlated features with Energy Recovery
corrs = df.corr()['Energy_Recovery_GWh'].sort_values(ascending=False)
print("Top correlations with Energy Recovery:\n", corrs)

# Insights
print("""
📌 Insights:
1. Higher industrial waste and air pollution levels correlate with higher potential energy recovery.
2. Countries with 'High' pollution levels should invest in waste-to-energy plants.
3. Renewable energy programs should be prioritized in high CO2 emitting countries.
4. Yearly trends show a slow reduction in air pollution, but energy recovery has potential to grow.

💡 Recommendations:
- Implement strict emission policies for industrial sectors.
- Invest in technologies converting waste and emissions into usable energy (biogas plants, carbon capture).
- Monitor pollution levels with real-time sensors to optimize energy recovery efforts.
""")
