<a href="https://colab.research.google.com/github/Nebil1/UNDP-FTL-AI/blob/main/Task_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Install & Import Libraries

In [2]:
!pip install pandas scikit-learn matplotlib seaborn



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, classification_report, confusion_matrix)

##Load the Data

In [4]:
file_url = "https://drive.google.com/uc?id=1zIk9JOdJEu9YF7Xuv2C8f2Q8ySfG3nHd"
df = pd.read_csv(file_url)

In [5]:
print("Starting shape:", df.shape)
df.head()

Starting shape: (165, 14)


Unnamed: 0,Country or Administrative area,Area [km2],Coast length [km],Rainfall [mm year -1],Factor L/A [-],Factor (L/A) *P [-],P[E] [%],MPW (metric tons year -1),M[E] (metric tons year -1),Ratio Me/MPW,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
0,Albania,28'486,362,1'117,0.01,14.0,1.56%,69'833,1'565,2.24%,,,,
1,Algeria,2'316'559,998,80,0.0004,0.0,0.09%,764'578,5'774,0.76%,,,,
2,Angola,1'247'357,1'600,1'025,0.001,1.0,0.09%,236'946,860,0.36%,,,,
3,Antigua and Barbuda,443,153,996,0.3,344.0,3.08%,627,2,0.29%,,,,
4,Argentina,2'779'705,4'989,567,0.002,1.0,0.26%,465'808,4'137,0.89%,,,,


Count missing per column

In [6]:
missing_counts = df.isna().sum()
print(missing_counts)

Country or Administrative area      2
Area [km2]                          2
Coast length [km]                   2
Rainfall [mm year -1]               2
Factor L/A [-]                      2
Factor (L/A) *P [-]                 2
P[E] [%]                            2
MPW (metric tons year -1)           2
M[E] (metric tons year -1)          2
Ratio Me/MPW                        2
Unnamed: 10                       165
Unnamed: 11                       165
Unnamed: 12                       165
Unnamed: 13                       165
dtype: int64


## Drop empty “Unnamed” columns


In [7]:
empty_cols = ['Unnamed: 10','Unnamed: 11','Unnamed: 12','Unnamed: 13']
df = df.drop(columns=empty_cols)
print("After dropping empty columns:", df.shape)

After dropping empty columns: (165, 10)


##Remove anything except digits, decimal point, or minus sign

In [8]:
# Helper to strip symbols → float
def clean_numeric(s):
    return (s.astype(str)
             .str.replace(r'[^0-9.\-]', '', regex=True)
             .replace('', np.nan)
             .astype(float))

In [9]:
# Clean columns with units/%
for c in ['P[E] [%]', 'Ratio Me/MPW', 'Area [km2]', 'Coast length [km]', 'Rainfall [mm year-1]', 'MPW (metric tons year -1)', 'Rainfall [mm year -1]']:
    if c in df: df[c] = clean_numeric(df[c])

##Convert & Impute the Main Target Column

In [10]:
# 4) Convert & impute main target
col = 'M[E] (metric tons year -1)'
df[col] = pd.to_numeric(df[col], errors='coerce')
df[col].fillna(df[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


Impute every remaining NaN with the 5-row rolling mean

In [11]:
numeric_cols = df.select_dtypes(include=[np.number]).columns
for c in numeric_cols:
    roll = df[c].rolling(window=5, center=True, min_periods=1).mean()
    df[c].fillna(roll, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna(roll, inplace=True)


Create a two‐class label using the median instead of a fixed 6008 cutoff

In [12]:
# 6) Create a two‐class label using the median instead of a fixed 6008 cutoff
col        = 'M[E] (metric tons year -1)'
median_val = df[col].median()
print("Median plastic load:", median_val)

# Use > median → 1 (high polluter), ≤ median → 0 (low polluter)
df['plastic_contribution'] = (df[col] > median_val).astype(int)

# Verify you now have two classes
print(df['plastic_contribution'].value_counts())

Median plastic load: 188.135593220339
plastic_contribution
0    126
1     39
Name: count, dtype: int64


##Label Creation

In [13]:
df['plastic_contribution'] = (df[col] <= 6008).astype(int)

##Features & target

In [14]:
X = df.drop([col, 'plastic_contribution', 'Country or Administrative area'], axis=1)\
       .select_dtypes(include=[np.number])
y = df['plastic_contribution']

##Split data to Train/Test set
- Split X and y into training (80%) and test (20%) sets,


In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X,                # all your features
    y,                # your labels (0/1)
    test_size=0.2,    # 20% held out for testing
    random_state=42,  # fixed seed for reproducibility
    stratify=y        # maintain class balance
)

In [16]:
print("X_train shape:", X_train.shape)
print("X_test  shape:", X_test.shape)
print("y_train balance:\n", y_train.value_counts(normalize=True))
print("y_test  balance:\n", y_test.value_counts(normalize=True))

X_train shape: (132, 8)
X_test  shape: (33, 8)
y_train balance:
 plastic_contribution
1    1.0
Name: proportion, dtype: float64
y_test  balance:
 plastic_contribution
1    1.0
Name: proportion, dtype: float64


##Feature Scaling
- .fit_transform(X_train) computes each feature’s mean & standard deviation on the training data and scales it (mean→0, std→1)
- .transform(X_test) applies t

- Prevents features with large numeric ranges from dominating the model.
- Ensures your test set is scaled consistently with the training set.

In [17]:
# Create a scaler object
scaler = StandardScaler()

# Fit on training data and transform it:
X_train_scaled = scaler.fit_transform(X_train)

# Transform test data using the same scaler:
X_test_scaled  = scaler.transform(X_test)

In [18]:
print("Feature means (train):", X_train_scaled.mean(axis=0))
print("Feature stds  (train):", X_train_scaled.std(axis=0))

Feature means (train): [ 1.51394049e-17  1.00929366e-17  3.02788098e-17  0.00000000e+00
  3.36431220e-17  3.36431220e-17  1.34572488e-17 -5.38289951e-17]
Feature stds  (train): [1. 1. 1. 1. 1. 1. 1. 1.]


In [19]:
print(df['plastic_contribution'].value_counts())

plastic_contribution
1    165
Name: count, dtype: int64


##Training the Model
- .fit(X_train_scaled, y_train)
    - Tells the algorithm to learn the relationship between your input features (X_train_scaled) and their labels (y_train).
    - After this step, model has “seen” the training examples and stored the learned coefficients.

In [20]:
# 1) Initialize the model
model = LogisticRegression(random_state=42)

# 2) Train (fit) on the scaled training data
model.fit(X_train_scaled, y_train)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)

##Evaluating the Model

Use the trained model to make predictions on unseen test data

In [None]:
y_pred = model.predict(X_test_scaled)

Calculate metrics comparing predictions to the true labels

In [None]:
acc   = accuracy_score(y_test, y_pred)    # overall correctness
prec  = precision_score(y_test, y_pred)   # when it predicts “low polluter,” how often it’s right
rec   = recall_score(y_test, y_pred)      # of all true “low polluter” rivers, how many we caught

visualize

In [None]:
print(f"Accuracy : {acc:.2f}")
print(f"Precision: {prec:.2f}")
print(f"Recall   : {rec:.2f}")

##Detailed breakdown

In [None]:
print("\nFull report:\n", classification_report(y_test, y_pred))

In [None]:
df.head()

In [None]:
plt.figure()
counts = df['plastic_contribution'].value_counts().sort_index()
plt.bar(counts.index.astype(str), counts.values)
plt.xlabel('Plastic Contribution (0=Low Polluter, 1=High Polluter)')
plt.ylabel('Count')
plt.title('Count of Rivers by Plastic Contribution')
plt.show()

In [None]:
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)
plt.figure()
plt.imshow(cm)
for (i, j), val in np.ndenumerate(cm):
    plt.text(j, i, str(val), ha='center', va='center')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# 3) ROC Curve & AUC
y_score = model.predict_proba(X_test_scaled)[:,1]  # probability of class “1” (Low Polluter)
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
plt.plot([0,1], [0,1], 'k--', label='Random Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()