In [40]:
import pandas as pd
import numpy as np

# Load your data (update the path to your local file or use gdown for Google Drive)
file_path = r"C:\Users\kunal\Downloads\Global_Pollution_Analysis.csv"
df = pd.read_csv(file_path)

# First look at the data
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 13 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Country                                 200 non-null    object 
 1   Year                                    200 non-null    int64  
 2   Air_Pollution_Index                     200 non-null    float64
 3   Water_Pollution_Index                   200 non-null    float64
 4   Soil_Pollution_Index                    200 non-null    float64
 5   Industrial_Waste (in tons)              200 non-null    float64
 6   Energy_Recovered (in GWh)               200 non-null    float64
 7   CO2_Emissions (in MT)                   200 non-null    float64
 8   Renewable_Energy (%)                    200 non-null    float64
 9   Plastic_Waste_Produced (in tons)        200 non-null    float64
 10  Energy_Consumption_Per_Capita (in MWh)  200 non-null    float6

In [41]:
# Identify missing data
print(df.isnull().sum())

# Impute numerical columns with median, categorical with mode
for col in df.select_dtypes(include=np.number).columns:
    df[col].fillna(df[col].median(), inplace=True)
for col in df.select_dtypes(include=object).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Outlier handling (example: clip values at 1st and 99th percentile)
numerical_cols = df.select_dtypes(include=np.number).columns
for col in numerical_cols:
    q01 = df[col].quantile(0.01)
    q99 = df[col].quantile(0.99)
    df[col] = np.clip(df[col], q01, q99)


Country                                   0
Year                                      0
Air_Pollution_Index                       0
Water_Pollution_Index                     0
Soil_Pollution_Index                      0
Industrial_Waste (in tons)                0
Energy_Recovered (in GWh)                 0
CO2_Emissions (in MT)                     0
Renewable_Energy (%)                      0
Plastic_Waste_Produced (in tons)          0
Energy_Consumption_Per_Capita (in MWh)    0
Population (in millions)                  0
GDP_Per_Capita (in USD)                   0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [42]:
from sklearn.preprocessing import StandardScaler

cols_to_scale = ['CO2_emissions', 'industrial_waste']  # Update as per your columns
scaler = StandardScaler()
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])


KeyError: "None of [Index(['CO2_emissions', 'industrial_waste'], dtype='object')] are in the [columns]"

In [None]:
from sklearn.preprocessing import LabelEncoder

for col in ['country', 'year']:  # Update with your categorical columns
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])


In [43]:
# Example: Energy consumption per capita
df['energy_per_capita'] = df['energy_consumption'] / df['population']

# Example: Yearly pollution trend (difference from previous year)
df.sort_values(['country', 'year'], inplace=True)
df['yearly_pollution_trend'] = df.groupby('country')['CO2_emissions'].diff().fillna(0)


KeyError: 'energy_consumption'

In [None]:
pollution_features = ['air_pollution_index', 'water_pollution_index', 'soil_pollution_index']
df[pollution_features] = scaler.fit_transform(df[pollution_features])


In [None]:
# Suppose 'pollution_severity' is already in the data, otherwise create it:
# For demonstration, categorize by quantiles
df['pollution_severity'] = pd.qcut(df['CO2_emissions'], q=3, labels=['Low', 'Medium', 'High'])


In [None]:
from sklearn.model_selection import train_test_split

features = ['CO2_emissions', 'industrial_waste', 'energy_per_capita', 'air_pollution_index', 'water_pollution_index', 'soil_pollution_index', 'yearly_pollution_trend']
X = df[features]
y = df['pollution_severity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

nb = MultinomialNB()
y_pred_nb = nb.fit(X_train, y_train).predict(X_test)

print("Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb))
print("Naive Bayes Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': range(3, 16)}
knn = KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid, cv=5, scoring='f1_macro')
grid.fit(X_train, y_train)
knn_best = grid.best_estimator_
y_pred_knn = knn_best.predict(X_test)

print("Best K for KNN:", grid.best_params_)
print("KNN Classification Report:\n", classification_report(y_test, y_pred_knn))
print("KNN Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))


In [None]:
from sklearn.tree import DecisionTreeClassifier

param_grid = {'max_depth': [4, 6, 8, 10], 'min_samples_split': [2, 5, 10]}
dt = DecisionTreeClassifier(random_state=42)
grid_dt = GridSearchCV(dt, param_grid, cv=5, scoring='f1_macro')
grid_dt.fit(X_train, y_train)
dt_best = grid_dt.best_estimator_
y_pred_dt = dt_best.predict(X_test)

print("Best Params for DT:", grid_dt.best_params_)
print("DT Classification Report:\n", classification_report(y_test, y_pred_dt))
print("DT Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

comparison = pd.DataFrame({
    'Model': ['Multinomial NB', 'KNN', 'Decision Tree'],
    'Accuracy': [
        accuracy_score(y_test, y_pred_nb),
        accuracy_score(y_test, y_pred_knn),
        accuracy_score(y_test, y_pred_dt)
    ],
    'Macro Precision': [
        precision_score(y_test, y_pred_nb, average='macro'),
        precision_score(y_test, y_pred_knn, average='macro'),
        precision_score(y_test, y_pred_dt, average='macro'),
    ],
    'Macro Recall': [
        recall_score(y_test, y_pred_nb, average='macro'),
        recall_score(y_test, y_pred_knn, average='macro'),
        recall_score(y_test, y_pred_dt, average='macro'),
    ],
    'Macro F1': [
        f1_score(y_test, y_pred_nb, average='macro'),
        f1_score(y_test, y_pred_knn, average='macro'),
        f1_score(y_test, y_pred_dt, average='macro'),
    ]
})

print(comparison)
