<a href="https://colab.research.google.com/github/Nebil1/UNDP-FTL-AI/blob/main/Task_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install & Import Libraries

In [29]:
!pip install pandas scikit-learn matplotlib seaborn



In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, classification_report, confusion_matrix)

## Load the Data

In [31]:
# Direct-download link to the CSV hosted on Google Drive
file_url = "https://drive.google.com/uc?id=1zIk9JOdJEu9YF7Xuv2C8f2Q8ySfG3nHd"

# Read the remote CSV straight into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_url)

In [32]:
# Report the (rows, columns) of the freshly loaded frame, then preview it
n_rows, n_cols = df.shape
print("Starting shape:", (n_rows, n_cols))
df.head()

Starting shape: (165, 14)


Unnamed: 0,Country or Administrative area,Area [km2],Coast length [km],Rainfall [mm year -1],Factor L/A [-],Factor (L/A) *P [-],P[E] [%],MPW (metric tons year -1),M[E] (metric tons year -1),Ratio Me/MPW,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
0,Albania,28'486,362,1'117,0.01,14.0,1.56%,69'833,1'565,2.24%,,,,
1,Algeria,2'316'559,998,80,0.0004,0.0,0.09%,764'578,5'774,0.76%,,,,
2,Angola,1'247'357,1'600,1'025,0.001,1.0,0.09%,236'946,860,0.36%,,,,
3,Antigua and Barbuda,443,153,996,0.3,344.0,3.08%,627,2,0.29%,,,,
4,Argentina,2'779'705,4'989,567,0.002,1.0,0.26%,465'808,4'137,0.89%,,,,


Count the missing values in each column.

In [33]:
# Tally the NaN entries column by column
missing_counts = df.isnull().sum(axis=0)
print(missing_counts)

Country or Administrative area      2
Area [km2]                          2
Coast length [km]                   2
Rainfall [mm year -1]               2
Factor L/A [-]                      2
Factor (L/A) *P [-]                 2
P[E] [%]                            2
MPW (metric tons year -1)           2
M[E] (metric tons year -1)          2
Ratio Me/MPW                        2
Unnamed: 10                       165
Unnamed: 11                       165
Unnamed: 12                       165
Unnamed: 13                       165
dtype: int64


## Drop Completely Empty Columns

Four columns are entirely NaN, so they are removed.

In [34]:
# Detect the all-NaN columns instead of hard-coding their names: the cell then
# keeps working if the file's trailing empty columns change, and re-running it
# is a no-op rather than a KeyError on columns that were already dropped.
empty_cols = df.columns[df.isna().all()].tolist()
df = df.drop(columns=empty_cols)
print("After dropping empty columns:", df.shape)

After dropping empty columns: (165, 10)


## Strip Non-Numeric Characters

Remove everything except digits, the decimal point, and the minus sign, then convert to float.

In [35]:
# Helper to strip symbols → float
def clean_numeric(s):
    """Convert a Series of formatted number strings to floats.

    Every character other than digits, '.', and '-' is removed before
    parsing, so "1'117" becomes 1117.0 and "1.5%" becomes 1.5.

    Parameters
    ----------
    s : pandas.Series
        Values to convert; each one is stringified before cleaning.

    Returns
    -------
    pandas.Series
        Float Series on the same index. Entries that are empty after
        stripping (including NaN/None inputs), or that still do not parse as
        a number (e.g. a lone "-" placeholder), become NaN.
    """
    stripped = (s.astype(str)
                 .str.replace(r'[^0-9.\-]', '', regex=True)
                 .replace('', np.nan))
    # errors='coerce' yields NaN for leftovers such as "-" or "1.2.3", where a
    # plain astype(float) would raise ValueError and abort the whole column.
    return pd.to_numeric(stripped, errors='coerce').astype(float)

In [36]:
# Clean columns with units/%
# Names must match the CSV header exactly. The rainfall header is spelled
# "year -1" (with a space); the previous "year-1" spelling never matched, so
# that column was silently skipped and stayed as text.
# NOTE(review): 'MPW (metric tons year -1)' also holds apostrophe-formatted
# text (see the df.head() output) but is not cleaned here, so the later
# select_dtypes(include=[np.number]) leaves it out of X — confirm this is intended.
numeric_text_cols = [
    'P[E] [%]',
    'Ratio Me/MPW',
    'Area [km2]',
    'Coast length [km]',
    'Rainfall [mm year -1]',
]
for c in numeric_text_cols:
    if c in df:
        df[c] = clean_numeric(df[c])
    else:
        # Keep the best-effort behaviour, but make a skipped column visible.
        print(f"Skipping missing column: {c!r}")

## Convert & Impute the Main Target Column

In [None]:
# 4) Convert & impute main target
col = 'M[E] (metric tons year -1)'

# The raw values use an apostrophe as thousands separator (e.g. "1'565").
# pd.to_numeric(..., errors='coerce') would turn every such value into NaN and
# the mean-imputation below would then overwrite all emissions >= 1000, so the
# separators are stripped with clean_numeric first. The dtype guard keeps the
# cell safe to re-run once the column is already numeric.
if not pd.api.types.is_numeric_dtype(df[col]):
    df[col] = clean_numeric(df[col])

# Assign the result back instead of calling fillna(inplace=True) on df[col]:
# the in-place form acts on a temporary object and is not guaranteed to update df.
# NOTE(review): the missing-value counts show 2 NaNs in every column, so the
# rows imputed here may be entirely blank rows — consider dropping them instead.
df[col] = df[col].fillna(df[col].mean())

## Label Creation

In [None]:
# Binary label: 1 where the target column is at most 6008, otherwise 0
is_at_or_below_cutoff = df[col].le(6008)
df['plastic_contribution'] = is_at_or_below_cutoff.astype(int)

## Features & Target

In [40]:
# Columns that must not be used as predictors: the raw target, the label
# derived from it, and the country name.
non_feature_cols = [col, 'plastic_contribution', 'Country or Administrative area']

# Keep only the numeric columns that remain as the feature matrix
X = df.drop(columns=non_feature_cols).select_dtypes(include=[np.number])

# The 0/1 label is the prediction target
y = df['plastic_contribution']

KeyError: "['plastic_contribution'] not found in axis"

## Train/Test Split
- Split X and y into training (80%) and test (20%) sets.
- The model only "sees" the 80% training portion while it is being fit.
- The remaining 20% is kept completely unseen until evaluation.
- Stratifying on y keeps the proportion of high-/low-polluter examples the same in both the train and test sets.

In [None]:
# Split settings:
#   test_size=0.2    -> 20% of the rows are held out for testing
#   random_state=42  -> fixed seed so the split is reproducible
#   stratify=y       -> both splits keep the class proportions of y
split_options = dict(test_size=0.2, random_state=42, stratify=y)

# X holds the features, y the 0/1 labels
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Sizes of the two feature matrices
shape_report = {"X_train shape:": X_train, "X_test  shape:": X_test}
for heading, features in shape_report.items():
    print(heading, features.shape)

# Class proportions in each label vector
balance_report = {"y_train balance:\n": y_train, "y_test  balance:\n": y_test}
for heading, labels in balance_report.items():
    print(heading, labels.value_counts(normalize=True))

## Feature Scaling

In [None]:
# Fit the scaler on the training split only
scaler = StandardScaler().fit(X_train)

# Apply that same fitted scaler to both splits, so the test data is
# transformed with statistics computed from the training data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Per-feature mean and standard deviation of the scaled training data
train_feature_means = X_train_scaled.mean(axis=0)
train_feature_stds = X_train_scaled.std(axis=0)
print("Feature means (train):", train_feature_means)
print("Feature stds  (train):", train_feature_stds)