In [None]:
import os
import gc
import math
import warnings
from itertools import combinations

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
import plotly.colors as pc

from imblearn.over_sampling import SMOTE
from imblearn.ensemble import EasyEnsembleClassifier, BalancedRandomForestClassifier
from imblearn.pipeline import Pipeline as ImbPipeline

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import (
    train_test_split, GridSearchCV, KFold, StratifiedKFold,
    cross_validate, TimeSeriesSplit
)

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix, make_scorer,
    mean_absolute_error, mean_squared_error, r2_score)


from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from xgboost import XGBRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor, Pool

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, VotingClassifier)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import optuna


In [None]:
# Load the agri-food CO2 emission table; the bare name below renders a preview.
df = pd.read_csv("Agrofood_co2_emission.csv")
df

In [None]:
# Print each column's dtype and non-null count for the loaded frame.
df.info()

In [None]:
# Order rows by area, then chronologically, and renumber the index from zero
# so that per-area shifts below follow time order.
df = df.sort_values(by=['Area', 'Year'], ignore_index=True)

In [None]:
# Previous-row (one step back within the same area) temperature and emission.
# The first row of every area has no predecessor and therefore gets NaN.
for new_col, source_col in (('temp_lag1', 'Average Temperature °C'),
                            ('emission_lag1', 'total_emission')):
    df[new_col] = df.groupby('Area')[source_col].shift(1)

### Check for null values in each column

In [None]:
# Number of missing values in every column.
df.isnull().sum()

In [None]:
# Impute numeric gaps area by area.
# Step 1: carry the last observation forward, then back-fill the leading gaps.
#         `.ffill()` / `.bfill()` replace the deprecated `fillna(method=...)`.
# NOTE(review): `temp_lag1` / `emission_lag1` are numeric, so the back-fill also
# fills their first row per area with the following row's lag value — confirm
# this is intended before using those columns as model features.
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = (
    df.groupby("Area")[num_cols]
      .transform(lambda g: g.ffill().bfill()))
# Step 2: per-area median as a fallback for anything step 1 left empty.
df[num_cols] = (
    df.groupby("Area")[num_cols]
      .transform(lambda x: x.fillna(x.median())))
# Flag values that are still missing after the within-area imputation.
for col in num_cols:
    if df[col].isna().any():
        df[f"{col}_missing"] = df[col].isna().astype(int)
print(df.isnull().sum().sort_values(ascending=False).head(15))


In [None]:
# Lag feature names that must be left untouched by the global median fill.
lag_cols = [f"{kind}_lag{step}"
            for step in (1, 2, 3, 5)
            for kind in ("temp", "emission")]

# For every other numeric column that still has gaps: record where the gaps
# were (unless a flag column already exists) and fill with the column median.
for col in num_cols:
    if col in lag_cols or not df[col].isna().any():
        continue
    flag_name = f"{col}_missing"
    if flag_name not in df.columns:
        df[flag_name] = df[col].isna().astype(int)
    df[col] = df[col].fillna(df[col].median())

remaining_nulls = df.isnull().sum().sort_values(ascending=False)
print(remaining_nulls.head(10))

### Check for duplicate rows

In [None]:
# Count rows that duplicate an earlier row across all columns.
df.duplicated().sum()

In [None]:
# Summary statistics, transposed so that each described column becomes one row.
df.describe().T

In [None]:
# Distinct-value count for each of the listed dataset columns.
columns_to_profile = [
    'Area', 'Year',
    'Savanna fires', 'Forest fires', 'Crop Residues',
    'Rice Cultivation', 'Drained organic soils (CO2)',
    'Pesticides Manufacturing', 'Food Transport', 'Forestland',
    'Net Forest conversion', 'Food Household Consumption', 'Food Retail',
    'On-farm Electricity Use', 'Food Packaging',
    'Agrifood Systems Waste Disposal', 'Food Processing',
    'Fertilizers Manufacturing', 'IPPU', 'Manure applied to Soils',
    'Manure left on Pasture', 'Manure Management', 'Fires in organic soils',
    'Fires in humid tropical forests', 'On-farm energy use',
    'Rural population', 'Urban population',
    'Total Population - Male', 'Total Population - Female',
    'total_emission', 'Average Temperature °C',
]
df[columns_to_profile].nunique()

In [None]:
# Correlation heatmap over every column, with non-numeric columns one-hot encoded.
df_encoded = pd.get_dummies(df)
corr_matrix = df_encoded.corr()

fig_corr, ax_corr = plt.subplots(figsize=(16, 10))
sns.heatmap(
    corr_matrix,
    annot=False,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
    ax=ax_corr,
)
ax_corr.set_title('Correlation Heatmap of Agrofood CO2 Emission Dataset')
plt.show()

In [None]:
# Correlation heatmap restricted to the int64 / float64 columns.
df_numeric = df.select_dtypes(include=['int64', 'float64'])
corr_matrix = df_numeric.corr()

fig_num, ax_num = plt.subplots(figsize=(16, 10))
sns.heatmap(corr_matrix, annot=False, cmap='cividis', ax=ax_num)
ax_num.set_title('Correlation Matrix Heatmap (Original Numeric Columns Only)')
plt.show()

In [None]:
# Line plot of total emission summed over all areas for each year.
emission_per_year = df.groupby('Year')['total_emission'].sum()
emission_per_year.plot()

In [None]:
# Bar chart of the 20 areas with the largest total emission summed over all years.
emission_per_area = df.groupby('Area')['total_emission'].sum()
top_emitters = emission_per_area.sort_values(ascending=False).head(20)
top_emitters.plot.bar()

In [None]:
# Emission trajectories of the Russian Federation and China, one line per area.
two_areas = df[df['Area'].isin(['Russian Federation', 'China'])]
sns.lineplot(data=two_areas, x='Year', y='total_emission', hue='Area')

In [None]:
# Temperature trajectories of the Russian Federation and China, one line per area.
two_areas = df[df['Area'].isin(['Russian Federation', 'China'])]
sns.lineplot(data=two_areas, x='Year', y='Average Temperature °C', hue='Area')

In [None]:
# 2019 snapshot: emission vs. temperature, with reference lines at zero
# emission (vertical) and at the median temperature (horizontal).
df_2019 = df[df['Year'] == 2019]
median_temp_2019 = df_2019['Average Temperature °C'].median()

fig_2019, ax_2019 = plt.subplots(figsize=(20, 12))
ax_2019.axvline(x=0, color='red', lw=1, ls='--', alpha=1)
ax_2019.axhline(y=median_temp_2019, color='red', lw=1, ls='--', alpha=1)
sns.scatterplot(data=df_2019, x='total_emission', y='Average Temperature °C', ax=ax_2019)

In [None]:
# Plot-only copy so the quartile label does not leak into `df`.
df_plot = df.copy()
df_plot["Emission_level"] = pd.qcut(
    df_plot["total_emission"],
    q=4,
    labels=["Low", "Med‑Low", "Med‑High", "High"])
num_cols = df_plot.select_dtypes(include="number").columns.difference(["total_emission"])

# --- Grid 1: every numeric feature against total emission -------------------
sc_cols = 3
sc_rows = math.ceil(len(num_cols) / sc_cols)
fig_sc, axes_sc = plt.subplots(sc_rows, sc_cols, figsize=(sc_cols * 5, sc_rows * 4), dpi=110)
axes_sc = axes_sc.flatten()
for i, col in enumerate(num_cols):
    sns.scatterplot(
        data=df_plot, x=col, y="total_emission",
        hue="Emission_level", palette="Set2",
        s=25, alpha=0.6, edgecolor="none",
        ax=axes_sc[i], legend=False)
    axes_sc[i].set_title(f"{col} vs. Total Emission", fontsize=9)
    axes_sc[i].set_xlabel("")
    axes_sc[i].set_ylabel("")

# The panels are drawn with legend=False, so they hold no labelled artists and
# get_legend_handles_labels() would return nothing. Build the shared legend
# from proxy markers instead: seaborn assigns palette colours to the hue
# categories in category order, which is reproduced here.
labels = list(df_plot["Emission_level"].cat.categories)
handles = [
    plt.Line2D([], [], linestyle="", marker="o", markersize=6, color=color, alpha=0.6)
    for color in sns.color_palette("Set2", n_colors=len(labels))
]
fig_sc.legend(handles, labels, title="Emission level", loc="upper right", frameon=False)
for ax in axes_sc[len(num_cols):]:
    ax.set_visible(False)  # hide unused trailing panels
plt.tight_layout()
plt.show()

# --- Grid 2: distribution of every numeric feature --------------------------
hist_cols = 3
hist_rows = math.ceil(len(num_cols) / hist_cols)
fig_h, axes_h = plt.subplots(hist_rows, hist_cols, figsize=(hist_cols * 5, hist_rows * 3.5), dpi=110)
axes_h = axes_h.flatten()
for i, col in enumerate(num_cols):
    sns.histplot(df_plot[col].dropna(), kde=True, bins=30, ax=axes_h[i])
    axes_h[i].set_title(f"Distribution of {col}", fontsize=9)
    axes_h[i].set_xlabel("")
    axes_h[i].set_ylabel("")
for ax in axes_h[len(num_cols):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# Per-feature histograms split by emission quartile.
features = df_plot.select_dtypes(include="number").columns.difference(["total_emission"])
n_cols = 3
n_rows = math.ceil(len(features) / n_cols)
fig, axes = plt.subplots(
    n_rows, n_cols,
    figsize=(n_cols * 5, n_rows * 4),
    dpi=110,
    sharex=False, sharey=False)
axes = axes.flatten()
for i, col in enumerate(features):
    sns.histplot(
        data=df_plot,
        x=col,
        hue="Emission_level",
        kde=True,
        element="step",
        palette="Set2",
        ax=axes[i],
        legend=False)
    axes[i].set_title(col, fontsize=9)
    axes[i].set_xlabel("")
    axes[i].set_ylabel("")

# Panels are drawn with legend=False, so they expose no labelled artists and
# get_legend_handles_labels() would yield an empty legend. Use colour patches
# that follow seaborn's category-order palette assignment instead.
labels = list(df_plot["Emission_level"].cat.categories)
handles = [
    plt.Rectangle((0, 0), 1, 1, color=color)
    for color in sns.color_palette("Set2", n_colors=len(labels))
]
fig.legend(handles, labels, title="Emission level", loc="upper right", frameon=False)
for ax in axes[len(features):]:
    ax.set_visible(False)  # hide unused trailing panels
plt.tight_layout()
plt.show()

In [None]:
# Pairwise feature relationships coloured by emission quartile.
# `Emission_level` is non-numeric, so without `hue=` pairplot would silently
# ignore the column even though it is selected here.
sns.pairplot(
    df_plot[features.tolist() + ['Emission_level']],
    hue='Emission_level',
    palette='Set2',
)
plt.show()

In [None]:
# Box plots of every numeric feature (target and total emission excluded),
# grouped by emission quartile.
numeric_cols = df_plot.select_dtypes(include=['float64', 'int64']).columns.difference(
    ['total_emission', 'Average Temperature °C'])
n = len(numeric_cols)
n_cols = 3
n_rows = math.ceil(n / n_cols)

fig_box = plt.figure(figsize=(6 * n_cols, 3.5 * n_rows), dpi=110)
for position, col in enumerate(numeric_cols):
    ax_box = fig_box.add_subplot(n_rows, n_cols, position + 1)
    sns.boxplot(
        data=df_plot,
        x='Emission_level',
        y=col,
        palette='Set2',
        ax=ax_box,
    )
    ax_box.set_title(f'{col} vs. Emission_level', fontsize=9)
    ax_box.set_xlabel('')
    # Only the left-most panel of each row keeps a y-axis label.
    ax_box.set_ylabel(col if position % n_cols == 0 else '')
plt.tight_layout()
plt.show()


In [None]:
# Scatter plot for every pair of numeric features, coloured by emission quartile.
# NOTE(review): the number of panels grows quadratically with the number of
# numeric columns — consider restricting `numeric_cols` if rendering is slow.
df = df.copy()
df['Emission_level'] = pd.qcut(df['total_emission'], 4,
                               labels=['Low','Med‑Low','Med‑High','High'])
numeric_cols = [c for c in df.columns if df[c].dtype in ('float64', 'int64')]
numeric_cols = [c for c in numeric_cols if c not in ['total_emission', 'Average Temperature °C']]
pairs = list(combinations(numeric_cols, 2))
n_cols, n_rows = 5, math.ceil(len(pairs) / 5)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*4, n_rows*3))
axes = axes.flatten()
for i, (x, y) in enumerate(pairs):
    # `data=` is passed by keyword like every other seaborn call in this
    # notebook. Only the first panel draws a legend, so none has to be built
    # and removed on the remaining panels.
    sns.scatterplot(data=df, x=x, y=y, hue='Emission_level',
                    palette='Set2', alpha=0.5, s=15, ax=axes[i],
                    legend='auto' if i == 0 else False)
    axes[i].set_title(f'{x} vs {y}', fontsize=8)
for ax in axes[len(pairs):]:
    ax.set_visible(False)  # hide unused trailing panels
plt.tight_layout()
plt.show()

In [None]:
# Donut chart per numeric feature showing how its values split across three
# equal-width ranges (Low / Medium / High).
excluded = ['total_emission', 'Average Temperature °C']
numeric_cols = [
    name
    for name in df.select_dtypes(include=['int64', 'float64']).columns
    if name not in excluded
]
cols_per_row = 2
rows = (len(numeric_cols) + cols_per_row - 1) // cols_per_row
fig, axes = plt.subplots(rows, cols_per_row,
                         figsize=(cols_per_row * 7, rows * 5))
axes = axes.flatten()
palette = sns.color_palette('pastel', 3)

for panel, col in zip(axes, numeric_cols):
    observed = df[col].dropna()
    if observed.empty:
        # Nothing to draw for a column without data.
        panel.axis('off')
        continue
    binned = pd.cut(observed, bins=3, labels=['Low', 'Medium', 'High'])
    counts = binned.value_counts().sort_index()
    perc = counts / counts.sum() * 100
    wedges = panel.pie(
        counts,
        labels=None,
        colors=palette,
        startangle=90,
        wedgeprops=dict(width=0.4),
    )[0]
    panel.set_title(f'{col} Distribution', fontsize=11)
    panel.axis('equal')
    legend_labels = [f'{cat}: {p:.1f}%' for cat, p in zip(counts.index, perc)]
    panel.legend(
        wedges, legend_labels,
        title='Range',
        loc='center left',
        bbox_to_anchor=(1.05, 0.5),
        frameon=False,
    )

# Switch off panels left over when the grid is larger than the column list.
for ax in axes[len(numeric_cols):]:
    ax.axis('off')
plt.tight_layout()
plt.show()

In [None]:
# One radar chart per emission level: mean of every numeric feature.
# After the transpose, rows are features and columns are emission levels.
mean_vals = df.groupby('Emission_level')[numeric_cols].mean().T

for col, level_means in mean_vals.items():
    values = level_means.tolist()
    labels = mean_vals.index.tolist()

    # Repeat the first point at the end so the polygon closes.
    values = values + values[:1]
    labels = labels + labels[:1]

    radar_trace = go.Scatterpolar(
        r=values,
        theta=labels,
        fill='toself',
        name=f'Emission Level {col}',
        marker=dict(color='#636EFA'),
    )
    fig = go.Figure()
    fig.add_trace(radar_trace)
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True)),
        title=f'Average Traits for Emission Level {col}',
        showlegend=True,
    )
    fig.show()

In [None]:
# --- Modelling setup: build features X and target y for temperature regression
warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARNING)

tgt = "Average_Temperature_°C"

# Normalise column names, drop rows without target / emission, and put every
# area's rows in chronological order so the shifts below are true time lags.
df.columns = df.columns.str.replace(" ", "_")
df = df.dropna(subset=[tgt, "total_emission"])
df = df.sort_values(["Area", "Year"]).reset_index(drop=True)

# Fill remaining numeric gaps within each area.
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df.groupby("Area")[num_cols].transform(lambda g: g.ffill().bfill().fillna(g.median()))

# Integer id per area, used as the categorical feature.
le = LabelEncoder()
df["Area_enc"] = le.fit_transform(df["Area"].astype(str))

# Lagged target and emission values per area.
for lag in [1, 2, 3, 5]:
    df[f"temp_lag{lag}"] = df.groupby("Area_enc")[tgt].shift(lag)
    df[f"emit_lag{lag}"] = df.groupby("Area_enc")["total_emission"].shift(lag)

# Rolling mean of the target over the PREVIOUS three rows of the same area.
# The shift(1) is essential: a rolling window over the unshifted series would
# include the current row's temperature, i.e. the value being predicted.
df["temp_roll3"] = (
    df.groupby("Area_enc")[tgt]
      .transform(lambda s: s.shift(1).rolling(3, min_periods=2).mean())
)

# Rows without a complete lag history cannot be used.
df = df.dropna().reset_index(drop=True)

# TimeSeriesSplit cuts the rows positionally, so they must be ordered by time
# (not by area) for every validation fold to lie after its training fold.
df = df.sort_values(["Year", "Area"]).reset_index(drop=True)

# `Emission_level` (present only if the EDA cells ran) is a string-labelled
# quartile of total_emission: redundant as a feature and a non-numeric column
# among otherwise numeric model inputs.
X = df.drop(columns=[tgt, "Area"]).drop(columns=["Emission_level"], errors="ignore")
y = df[tgt]
cat_feats = ["Area_enc"]
ts_split = TimeSeriesSplit(n_splits=4)

def tune_model(model_name, n_trials=30):
    """Tune one gradient-boosting model with Optuna and time-series CV.

    Reads the notebook-level `X`, `y`, `ts_split` and `cat_feats`, and needs the
    `lgb` (lightgbm) and `xgb` (xgboost) module aliases to be imported.

    Parameters
    ----------
    model_name : str
        'lgb' for LightGBM, 'xgb' for XGBoost; any other value tunes CatBoost.
    n_trials : int, default 30
        Number of Optuna trials.

    Returns
    -------
    tuple
        (best mean CV RMSE, best parameters keyed by the Optuna suggestion
        names, e.g. 'lr', 'depth', 'l2' for CatBoost).
    """

    def suggest_params(trial):
        # Search space per model family; suggestion names are part of the
        # returned best-params dict and must stay stable.
        if model_name == 'lgb':
            return {
                'learning_rate': trial.suggest_float('lr', 0.01, 0.1, log=True),
                'num_leaves': trial.suggest_int('num_leaves', 31, 256),
                'max_depth': trial.suggest_int('max_depth', 4, 12),
                'feature_fraction': trial.suggest_float('ff', 0.6, 1.0),
                'bagging_fraction': trial.suggest_float('bf', 0.6, 1.0),
                'bagging_freq': 1,
                'objective': 'regression', 'metric': 'rmse', 'seed': 42
            }
        if model_name == 'xgb':
            return {
                'eta': trial.suggest_float('eta', 0.01, 0.1, log=True),
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'colsample_bytree': trial.suggest_float('colsample', 0.6, 1.0),
                'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'seed': 42
            }
        return {
            'learning_rate': trial.suggest_float('lr', 0.01, 0.1, log=True),
            'depth': trial.suggest_int('depth', 4, 10),
            'l2_leaf_reg': trial.suggest_int('l2', 1, 10),
            'random_seed': 42, 'verbose': False
        }

    def fold_rmse(params, tr_idx, va_idx):
        # Fit on one training fold and score RMSE on its validation fold.
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
        if model_name == 'lgb':
            dtr = lgb.Dataset(X_tr, y_tr, categorical_feature=cat_feats)
            dva = lgb.Dataset(X_va, y_va, categorical_feature=cat_feats, reference=dtr)
            # An explicit round budget is required: with the default of 100
            # rounds, a patience of 100 could never trigger early stopping.
            model = lgb.train(params, dtr, num_boost_round=1000, valid_sets=[dva],
                              callbacks=[lgb.early_stopping(100, verbose=False)])
            pred = model.predict(X_va)
        elif model_name == 'xgb':
            dtr = xgb.DMatrix(X_tr, y_tr)
            dva = xgb.DMatrix(X_va, y_va)
            model = xgb.train(params, dtr, num_boost_round=1000, evals=[(dva, 'val')],
                              early_stopping_rounds=50, verbose_eval=False)
            pred = model.predict(dva)
        else:
            model = CatBoostRegressor(**params)
            # Declare the categorical feature so the tuned model matches the
            # final CatBoost fit, which also passes cat_features.
            model.fit(X_tr, y_tr, cat_features=cat_feats, eval_set=(X_va, y_va),
                      early_stopping_rounds=50, verbose=False)
            pred = model.predict(X_va)
        return np.sqrt(mean_squared_error(y_va, pred))

    def objective(trial):
        params = suggest_params(trial)
        return np.mean([fold_rmse(params, tr_idx, va_idx)
                        for tr_idx, va_idx in ts_split.split(X)])

    # Seeded sampler so repeated runs explore the same trial sequence.
    study = optuna.create_study(direction='minimize',
                                sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=n_trials)
    return study.best_value, study.best_params
# Tune each model family in turn; results maps key -> (best CV RMSE, best params).
results = {}
for model_key in ('lgb', 'xgb', 'cat'):
    results[model_key] = tune_model(model_key)
    print(f"{model_key} best RMSE: {results[model_key][0]:.4f}")

# Bar chart of the best cross-validated RMSE per model.
model_keys = list(results)
best_scores = [results[key][0] for key in model_keys]
plt.bar(model_keys, best_scores)
plt.ylabel('CV RMSE')
plt.title('Model Comparison')
plt.show()


In [None]:
# --- Final CatBoost model: refit with the tuned parameters and evaluate -----
# Translate the Optuna suggestion names back to CatBoost argument names.
raw_best = results['cat'][1]
best_cat_params = {
    'learning_rate': raw_best['lr'],
    'depth':         raw_best['depth'],
    'l2_leaf_reg':   raw_best['l2'],
    'random_seed':   42,
    'verbose':       False}

# Chronological hold-out: the most recent ~20% of years form the test set.
# (The split variables were previously undefined at notebook scope, so this
# cell failed on a fresh kernel.)
cutoff_year = X["Year"].quantile(0.8)
is_train = X["Year"] <= cutoff_year
X_tr, y_tr = X[is_train], y[is_train]
X_te, y_te = X[~is_train], y[~is_train]
if X_tr.empty or X_te.empty:
    raise ValueError("Year-based hold-out produced an empty train or test set")

final_cat = CatBoostRegressor(
    **best_cat_params,
    iterations=2000,
    early_stopping_rounds=100,
    allow_writing_files=False,
    task_type='CPU')
# NOTE(review): the test set doubles as the early-stopping eval set, so the
# test metrics below are optimistic — consider a separate validation slice.
final_cat.fit(
    X_tr, y_tr,
    cat_features=cat_feats,
    eval_set=(X_te, y_te),
    use_best_model=True)

y_tr_pred = final_cat.predict(X_tr)
y_te_pred = final_cat.predict(X_te)
metrics = {
    'MAE_train':  mean_absolute_error(y_tr, y_tr_pred),
    'RMSE_train': np.sqrt(mean_squared_error(y_tr, y_tr_pred)),
    'R2_train':   r2_score(y_tr, y_tr_pred),
    'MAE_test':   mean_absolute_error(y_te, y_te_pred),
    'RMSE_test':  np.sqrt(mean_squared_error(y_te, y_te_pred)),
    'R2_test':    r2_score(y_te, y_te_pred),}
print("\nPerformance Metrics:")
print(pd.Series(metrics).round(4))

# Actual vs. predicted, with the identity line as reference.
plt.figure(figsize=(6,6))
plt.scatter(y_te, y_te_pred, alpha=0.5, label='Test')
plt.scatter(y_tr, y_tr_pred, alpha=0.2, label='Train')
min_v = min(y.min(), y_tr_pred.min(), y_te_pred.min())
max_v = max(y.max(), y_tr_pred.max(), y_te_pred.max())
plt.plot([min_v, max_v], [min_v, max_v], 'k--', linewidth=1)
plt.xlabel("Actual Temperature")
plt.ylabel("Predicted Temperature")
plt.title("Actual vs Predicted (Train & Test)")
plt.legend()
plt.tight_layout()
plt.show()

# Twenty most important features according to CatBoost.
fi = pd.DataFrame({
    'feature': X_tr.columns,
    'importance': final_cat.get_feature_importance(type='FeatureImportance')}).sort_values('importance', ascending=False).head(20)
plt.figure(figsize=(6,4))
plt.barh(fi['feature'], fi['importance'])
plt.gca().invert_yaxis()
plt.title("Top 20 Feature Importances - CatBoost")
plt.tight_layout()
plt.show()

final_cat.save_model("final_cat_model.cbm")
print("\nModel saved → final_cat_model.cbm")

In [None]:
# Reload the saved model and list which of its features are raw dataset
# columns (as opposed to engineered ones).
MODEL_PATH = "final_cat_model.cbm"
RAW_DATA_CSV = "Agrofood_co2_emission.csv"

model = CatBoostRegressor()
model.load_model(MODEL_PATH)

# Read only the header row and apply the same space -> underscore renaming
# that was used before training.
csv_header = pd.read_csv(RAW_DATA_CSV, nrows=0)
raw_cols = csv_header.columns.str.replace(" ", "_").tolist()

all_feats = list(model.feature_names_)
original_feats = [feat for feat in all_feats if feat in raw_cols]

print(f"{len(original_feats)}\n")
print("")
for feat in original_feats:
    print("-", feat)


In [None]:
# importlib.metadata is the standard-library replacement for the deprecated
# setuptools `pkg_resources` API.
from importlib import metadata

packages = [
    "streamlit",
    "pandas",
    "numpy",
    "matplotlib",
    "seaborn",
    "plotly",
    "imbalanced-learn",
    "scikit-learn",
    "xgboost",
    "lightgbm",
    "catboost",
    "joblib",
    "altair",
    "scipy",
    "python-dateutil",
    "statsmodels",
    "pyarrow",
    "openpyxl",
    "tqdm",
    "shap",
    "pillow",
    "typing-extensions",
    "pydeck",
    "validators",
    "watchdog",
    "gitpython",
    "pyyaml"
]


def freeze_packages(names):
    """Return ``name==version`` pins for the installed distributions in `names`.

    Distributions that are not installed are reported on stdout and skipped.
    """
    pinned = []
    for name in names:
        try:
            pinned.append(f"{name}=={metadata.version(name)}")
        except metadata.PackageNotFoundError:
            print(f"[!] Package '{name}' not found. Skipping.")
    return pinned


# Write one pinned requirement per line, in the order listed above.
with open("requirements.txt", "w") as f:
    f.writelines(f"{pin}\n" for pin in freeze_packages(packages))
