In [None]:
# Mount Google Drive inside the Colab runtime so the notebook can read data from,
# and write results to, Drive. "/content/gdrive/" is the mount point that the
# later os.chdir("/content/gdrive/MyDrive") call relies on.
from google.colab import drive

drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
import pandas as pd
import lightgbm as lgb
from hyperopt import fmin, hp, STATUS_OK, tpe, Trials
import csv
import pickle
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import numpy as np
import os
import random

In [None]:
# TRAINING

# Seed every random source once so that repeated runs are comparable

seed = 42

random.seed(seed)
np.random.seed(seed)
rstate = np.random.default_rng(seed)  # generator later passed to fmin as rstate

# Make the Drive root the working directory; `path` below is relative to it

os.chdir("/content/gdrive/MyDrive") # Change to your desired directory path

# Folder, relative to the working directory, that holds the input and output files

path = "GEO-AI Challenge for Cropland Mapping by ITU/" # Change to your desired directory path

# Index names; each one appears as the suffix of the feature column names

variables = ['DATT1', 'IRECI', 'NBR2', 'NDRE', 'NDVI', 'NDWI']

# Expected feature columns: a two-digit prefix (00..35) joined to each index name

exp_cols = [
    f"{step:02d}_{index_name}"
    for step in range(36)
    for index_name in variables
]

def preprocess_dataset(filename):

    """
    Read one training CSV and bring it to the expected column set.

    The file is read from `path + filename`, the metadata columns
    ("system:index", "COUNTRY", "ID", "Lat", "Lon", ".geo") are removed, and
    every name in `exp_cols` that the file does not provide is appended as an
    all-NaN column.

    Args:
    - filename (str): CSV file name, relative to the module-level `path`.

    Returns:
    - DataFrame: The cleaned data.
    """

    unwanted = ["system:index", "COUNTRY", "ID", "Lat", "Lon", ".geo"]
    frame = pd.read_csv(path + filename).drop(columns=unwanted)

    # Append each expected column the file lacks, filled with NaN

    for expected in exp_cols:
        if expected not in frame.columns:
            frame[expected] = np.nan

    return frame

def order_columns(data, index_names=None):

    """
    Reorder the feature columns index by index and place "Target" last.

    For each index name, the columns whose name contains that index name are
    taken in sorted order; the groups follow the order of the index names and
    the "Target" column is appended at the end. Columns that match no index
    name are left out. The input frame is not modified.

    Args:
    - data (DataFrame): The input data; must contain a "Target" column.
    - index_names (list of str, optional): Index names to group by.
      Defaults to the module-level `variables` list.

    Returns:
    - DataFrame: New frame with the ordered feature columns followed by "Target".

    Raises:
    - KeyError: If `data` has no "Target" column.
    """

    names = variables if index_names is None else index_names

    # Sort once; filtering an already sorted list keeps every group sorted.
    # "Target" is excluded so it can only ever appear once, at the end.

    sorted_cols = sorted(col for col in data.columns if col != "Target")

    ordered_columns = [col for name in names for col in sorted_cols if name in col]

    # Pick features and target in one indexing operation. Assigning "Target"
    # onto a column subset afterwards raises pandas' SettingWithCopyWarning.

    return data[ordered_columns + ["Target"]].copy()

# Iran + Sudan training file: read, clean, then put the columns in model order

fail_iran_sudan = order_columns(preprocess_dataset("S2_TRAIN_MULTI_INDICES_IRAN_SUDAN.csv"))

# Afghanistan training file: same treatment

fail_afghanistan = order_columns(preprocess_dataset("S2_TRAIN_MULTI_INDICES_AFGHANISTAN.csv"))

# Stack both sets row-wise. Row labels from each file are kept as they are;
# the cross-validation code further down indexes this frame by position (iloc).

fail = pd.concat([fail_iran_sudan, fail_afghanistan])

# Function to create an output file for storing run results

def outputs_run_fun(output_dir=None):

    """
    Create (or overwrite) the CSV log of an optimization run and write its header.

    Args:
    - output_dir (str, optional): Prefix the file name is appended to by plain
      string concatenation, so it must end with a path separator.
      Defaults to the module-level `path`.

    Returns:
    - str: Path of the created "OUTPUTS.CSV" file.
    """

    base = path if output_dir is None else output_dir

    output = base + "OUTPUTS.CSV"

    header = ["ITERATION","F1_SCORE_TRAIN","F1_SCORE_TEST","num_leaves","learning_rate","max_depth","min_child_samples",
              "colsample_bytree","subsample","n_estimators","reg_alpha","reg_lambda","subsample_for_bin"]

    # Mode "w" truncates a log left by an earlier run. The context manager
    # closes the handle even if writing the header fails.

    with open(output, "w", newline="") as open_output:
        csv.writer(open_output).writerow(header)

    return output

# Define the hyperparameter search space

space = {
    # Entries drawn with hp.quniform (step 1) are cast to int inside objective_fun
    "num_leaves": hp.quniform("num_leaves", 5, 50, 1),
    "learning_rate": hp.loguniform("learning_rate", -5, 0),
    "max_depth": hp.quniform("max_depth", 2, 10, 1),
    "min_child_samples": hp.quniform("min_child_samples", 5, 30, 1),
    # Column / row sampling fractions
    "colsample_bytree": hp.uniform("colsample_bytree", 0.1, 0.75),
    "subsample": hp.uniform("subsample", 0.1, 0.75),
    "n_estimators": hp.quniform("n_estimators", 10, 100, 1),
    # Regularization strengths
    "reg_alpha": hp.uniform("reg_alpha", 0, 1),
    "reg_lambda": hp.uniform("reg_lambda", 0, 1),
    "subsample_for_bin": hp.quniform("subsample_for_bin", 10, 50, 1),
}

# Define the objective function for hyperparameter optimization

def objective_fun(params):

    """
    Hyperopt objective: cross-validate one LightGBM configuration and log it.

    Increments the global `iteration` counter, runs a 5-fold stratified
    cross-validation on the global `fail` frame, and appends one row (iteration,
    mean train/test weighted F1, hyperparameters) to the CSV at the global
    `output` path.

    Args:
    - params (dict): Sampled hyperparameters, keyed as in `space`.

    Returns:
    - dict: {"loss": negative mean test F1-score, "status": STATUS_OK}.
    """

    global iteration

    iteration += 1

    # Convert hyperparameters to their appropriate data types

    params = {
        'num_leaves': int(params['num_leaves']),
        'learning_rate': params['learning_rate'],
        'max_depth': int(params['max_depth']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': params['colsample_bytree'],
        'subsample': params['subsample'],
        'n_estimators': int(params['n_estimators']),
        "reg_alpha": params["reg_alpha"],
        "reg_lambda": params["reg_lambda"],
        "subsample_for_bin": int(params["subsample_for_bin"])
    }

    # Initialize LightGBM model with the given parameters
    # NOTE(review): LightGBM documents bagging_fraction (alias subsample) as only
    # taking effect when bagging_freq > 0; subsample_freq is not set here, so
    # confirm whether `subsample` actually influences training.

    model = lgb.LGBMClassifier(**params, random_state = seed, verbose = -1)

    # Define a stratified K-fold cross-validation

    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = seed)

    # Split features and target once instead of on every fold

    features = fail.drop("Target", axis=1)
    target = fail.Target

    train_f1_scores = []
    test_f1_scores = []

    # Perform cross-validation (positional indexing: row labels may repeat)

    for train_idx, test_idx in skf.split(features, target):

        x_train, y_train = features.iloc[train_idx], target.iloc[train_idx]
        x_test, y_test = features.iloc[test_idx], target.iloc[test_idx]

        # Train the model; fit() is called afresh on each fold's training part

        model.fit(x_train, y_train)

        # Calculate F1-score for training set

        train_predict = model.predict(x_train)
        train_f1_scores.append(f1_score(y_train, train_predict, average="weighted"))

        # Calculate F1-score for test set

        test_predict = model.predict(x_test)
        test_f1_scores.append(f1_score(y_test, test_predict, average="weighted"))

    # Calculate average F1-scores for training and testing sets

    avg_train_f1_score = sum(train_f1_scores) / len(train_f1_scores)
    avg_test_f1_score = sum(test_f1_scores) / len(test_f1_scores)

    # Log the results and hyperparameters into the CSV file. The context manager
    # closes the handle after every trial, so rows reach disk promptly and no
    # file descriptor is leaked per evaluation.

    with open(output, "a", newline="") as open_output:

        write_output = csv.writer(open_output)

        write_output.writerow([iteration,avg_train_f1_score,avg_test_f1_score,params["num_leaves"],params["learning_rate"],params["max_depth"],
                           params["min_child_samples"],params["colsample_bytree"],params["subsample"],
                           params["n_estimators"],params["reg_alpha"],params["reg_lambda"],params["subsample_for_bin"]])

    # hyperopt minimizes the loss, hence the negated F1-score

    return {"loss":-avg_test_f1_score,"status":STATUS_OK}

# Function to run hyperparameter optimization

def runs_fun(evals_num):

    """
    Run the TPE hyperparameter search over `space`.

    Resets the two globals that objective_fun relies on (`output`, the path
    of a freshly created log file, and `iteration`, the trial counter) and
    then hands control to hyperopt's fmin.

    Args:
    - evals_num (int): Maximum number of evaluations passed to fmin.

    Returns:
    - The value returned by fmin (the best hyperparameters found).
    """

    global output
    global iteration

    # Fresh log file and trial counter for this run

    output = outputs_run_fun()
    iteration = 0

    return fmin(
        fn=objective_fun,
        space=space,
        algo=tpe.suggest,
        max_evals=evals_num,
        trials=Trials(),
        rstate=rstate,
    )

# Run the search with a budget of 2000 evaluations; best_params holds what fmin returns

best_params = runs_fun(2000)

100%|██████████| 2000/2000 [21:27<00:00,  1.55trial/s, best loss: -0.9393250724332699]


In [None]:
# Cast the integer-valued hyperparameters to int before building the model

best_params.update({
    name: int(best_params[name])
    for name in ("max_depth", "min_child_samples", "n_estimators", "num_leaves", "subsample_for_bin")
})

# Build the final LightGBM classifier from the tuned hyperparameters

best_model = lgb.LGBMClassifier(**best_params, random_state=seed, verbose=-1)

# Features are every column except "Target"; the target is that column

x_train = fail.drop("Target", axis=1)
y_train = fail.Target

# Fit on the full training frame (no hold-out at this stage)

best_model.fit(x_train, y_train)

# Persist the fitted model next to the data

with open(path + "BEST_MODEL_MULTI_INDICE_LGBM.pkl", "wb") as f:
    pickle.dump(best_model, f)

In [None]:
# PREDICTING

def preprocess_data(path, filename):

    """
    Read one test CSV and return its model-ready features together with the IDs.

    The "ID" column is kept aside, the metadata columns are dropped, every name
    in `exp_cols` missing from the file is appended as an all-NaN column, and
    the columns are arranged index by index (in the order of `variables`),
    sorted within each index.

    Args:
    - path (str): Prefix the file name is appended to.
    - filename (str): CSV file name.

    Returns:
    - tuple: (DataFrame of ordered feature columns, Series of IDs).
    """

    raw = pd.read_csv(path + filename)
    ids = raw.ID
    data = raw.drop(columns=["system:index", "COUNTRY", "Lat", "Lon", ".geo", "ID"])

    # Append each expected column the file lacks, filled with NaN

    for expected in exp_cols:
        if expected not in data.columns:
            data[expected] = np.nan

    # Group the sorted column names by the index name they contain

    sorted_cols = sorted(data.columns)
    ordered_columns = []
    for variable in variables:
        ordered_columns.extend(col for col in sorted_cols if variable in col)

    return data[ordered_columns], ids

# Index names and expected columns used by preprocess_data

variables = ['DATT1', 'IRECI', 'NBR2', 'NDRE', 'NDVI', 'NDWI']

exp_cols = [
    f"{step:02d}_{index_name}"
    for step in range(36)
    for index_name in variables
]

# Iran-Sudan test set: build features, predict, attach the IDs

fail_iran_sudan, IDs_iran_sudan = preprocess_data(path, "S2_TEST_MULTI_INDICES_IRAN_SUDAN.csv")

iran_sudan_pred = pd.DataFrame(
    best_model.predict(fail_iran_sudan), columns=["Target"]
).assign(ID=IDs_iran_sudan)

# Afghanistan test set: same steps

fail_afghanistan, IDs_afghanistan = preprocess_data(path, "S2_TEST_MULTI_INDICES_AFGHANISTAN.csv")

afghanistan_pred = pd.DataFrame(
    best_model.predict(fail_afghanistan), columns=["Target"]
).assign(ID=IDs_afghanistan)

# Stack both prediction tables and align them with the sample submission by ID

predictions = pd.concat([iran_sudan_pred, afghanistan_pred])

submission = pd.read_csv(path + "SampleSubmission.csv")

# After the merge, Target_x comes from the sample submission and Target_y from
# the predictions; keep only the predicted one, under the name "Target"

sub_merged = submission.merge(predictions, on="ID", how="left")
sub_merged = sub_merged.drop("Target_x", axis=1)
sub_merged = sub_merged.rename(columns={"Target_y": "Target"})

sub_merged.to_csv(path + "Submission_MULTI_INDICES_LGBM_SKF.csv", index=False)