In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read Train CSV
X_train_preprocessed = pd.read_csv("X_train_preprocessed.csv", index_col='row_index')
y_train_preprocessed = pd.read_csv("y_train_preprocessed.csv", index_col='row_index')
X_test_preprocessed = pd.read_csv("X_test_preprocessed.csv", index_col='row_index')

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_validation, y_train, y_validation = train_test_split(X_train_preprocessed, y_train_preprocessed, test_size=0.2, random_state=42, stratify=y_train_preprocessed)

In [4]:
import lightgbm as lgb
from sklearn.metrics import f1_score, accuracy_score

# Define hyperparameters for the LGBMClassifier
n_estimators = 200     # Number of boosting rounds (trees)
learning_rate = 0.015  # Step size shrinkage
max_depth = 20         # Maximum tree depth
subsample = 0.8        # Fraction of samples used for training each tree
colsample_bytree = 0.8 # Fraction of features used for each tree
random_state = 42      # Random state for reproducibility

# Initialize the LGBMClassifier with GPU support
model = lgb.LGBMClassifier(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    random_state=random_state,
    device='gpu',          # Enable GPU acceleration
    objective='multiclass',  # Set objective for multiclass classification
    num_class=3,           # Set the number of classes in your target variable
    metric='multi_logloss',  # Use multi-class log loss as the evaluation metric
    num_threads=-1         # Use all available threads for training
)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the training and test sets
y_pred_train = model.predict(X_train)  # Predictions on training data
y_pred_test = model.predict(X_validation)  # Predictions on test data

# Evaluate the model on the training set
f1_train = f1_score(y_train, y_pred_train, average='weighted')
accuracy_train = accuracy_score(y_train, y_pred_train)

# Evaluate the model on the test set
f1_test = f1_score(y_validation, y_pred_test, average='weighted')
accuracy_test = accuracy_score(y_validation, y_pred_test)

# Print results
print("Training Performance:")
print(f"F1 Score (Train): {f1_train:.4f}")
print(f"Accuracy (Train): {accuracy_train:.4f}")

print("\nTesting Performance:")
print(f"F1 Score (Test): {f1_test:.4f}")
print(f"Accuracy (Test): {accuracy_test:.4f}")


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 8373
[LightGBM] [Info] Number of data points in the train set: 2264252, number of used features: 48
[LightGBM] [Info] Using GPU Device: gfx90c, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 42 dense feature groups (95.01 MB) transferred to GPU in 0.084220 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score -1.662003
[LightGBM] [Info] Start training from score -1.502564
[LightGBM] [Info] Start training from score -1.471996
[LightGBM] [Info] Start training from score -1.581638
[LightGBM] [Info] Start training from score -1.880079
Training Performance:
F1 Score (Train): 0.4608
Accuracy (Train): 0.4623

Testing Performance:
F1 Score (Test): 0.4588
Accuracy (Test): 0.4604


In [None]:
def encode_y(df):
    forward = {'Very Low': 0, 'Low': 1, 'Average': 2, 'High': 3, 'Very High': 4}
    backward = {v: k for k, v in forward.items()}
    df['piezo_groundwater_level_category'] = df['piezo_groundwater_level_category'].map(forward)
    return df

def decode_y(df):
    forward = {'Very Low': 0, 'Low': 1, 'Average': 2, 'High': 3, 'Very High': 4}
    backward = {v: k for k, v in forward.items()}
    df['piezo_groundwater_level_category'] = df['piezo_groundwater_level_category'].map(backward)
    return df

In [None]:
# Decode prediction to csv

predictions_df = pd.DataFrame(y_pred_test, index=X_test_preprocessed.index, columns=['piezo_groundwater_level_category'])
y_pred = decode_y(predictions_df)

# Save predictions to CSV
predictions_df.to_csv('predictionsLightGBM_XX.csv', index_label="row_index")
