In [1]:
import pandas as pd
import numpy as np
import logging
import warnings
warnings.filterwarnings('ignore')
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import optuna
from sklearn.metrics import cohen_kappa_score



# Output folders under /kaggle/working. Each one is created right after it is
# named; exist_ok=True keeps this cell safe to re-run.
log_dir = "/kaggle/working/logs"
os.makedirs(log_dir, exist_ok=True)

model_dir = "/kaggle/working/models"
os.makedirs(model_dir, exist_ok=True)

# Route INFO-and-above records from the root logger to a file inside log_dir,
# each line stamped with time and severity.
log_file_path = os.path.join(log_dir, "training_LightGBM.log")
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename=log_file_path,
)

# Root logger handle; the later cells log through it.
logger = logging.getLogger()
logger.info("Logging and saving models example initialized.")

In [2]:
# Read the competition CSVs: `df` holds the scored training essays, `test` the
# essays that are predicted for the submission file.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv"
)
test = pd.read_csv(
    filepath_or_buffer="/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv"
)

In [3]:
# Text vectorization: TF-IDF over character n-grams of length 1-6, keeping at
# most 50,000 features.
# - The on/off switches are real booleans: recent scikit-learn releases
#   validate parameter types and reject the ints (1) that were passed before.
# - `stop_words` is omitted because scikit-learn only applies it when
#   analyzer == 'word'; with analyzer='char' it was silently unused.
vec = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 6),
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    max_features=50000,
)

# NOTE(review): the vocabulary and IDF weights are fitted on every training
# essay, including the rows that are held out for validation further down, so
# the validation score is likely slightly optimistic.
X = vec.fit_transform(df['full_text'])
features = vec.transform(test['full_text'])

# Map scores to zero-based class labels (score - 1); the submission step adds
# 1 back to the predicted class index.
df['label'] = df['score'] - 1
y = df['label']

In [4]:
# Seed shared by the split below and by the LightGBM parameters in later cells
SEED = 42

# Hold out 20% of the rows for validation, stratified on the label.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

# Wrap both parts as LightGBM datasets; the validation set is created with the
# training set passed as its `reference`.
train_data = lgb.Dataset(data=train_X, label=train_y)
valid_data = lgb.Dataset(data=test_X, label=test_y, reference=train_data)

In [5]:
# Hyperparameter optimization with Optuna
logger.info("Training started...")

def objective(trial):
    """Train one LightGBM candidate and score it on the validation split.

    Args:
        trial: Optuna trial used to sample `num_leaves`, `max_depth` and
            `learning_rate`; the remaining parameters are fixed.

    Returns:
        Quadratic-weighted Cohen's kappa between `test_y` and the argmax class
        predictions for `test_X`. If training or scoring raises, the failure
        is logged with its traceback and ``float('-inf')`` is returned, so the
        trial ranks below every successful one and the study keeps running.
    """
    params = {
        'objective': 'multiclass',
        'num_class': 6,
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'device_type': 'gpu',
        'num_leaves': trial.suggest_int('num_leaves', 31, 35),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'seed': SEED,
        'deterministic': True
    }

    try:
        # Up to 200 boosting rounds, stopped early once the validation metric
        # has not improved for 10 rounds.
        model = lgb.train(
            params,
            train_data,
            num_boost_round=200,
            valid_sets=[valid_data],
            callbacks=[lgb.early_stopping(stopping_rounds=10)]
        )

        # Per-class scores -> predicted class index per row.
        y_pred = model.predict(test_X, num_iteration=model.best_iteration)
        y_pred = np.argmax(y_pred, axis=1)
        score = cohen_kappa_score(test_y, y_pred, weights='quadratic')

        logger.info(f"Trial {trial.number}: Params: {params} | Score: {score:.4f}")
        return score
    except Exception as e:
        # logger.exception logs at ERROR level like before but also records
        # the traceback, so a failing trial can be diagnosed from the log file.
        logger.exception(f"Trial {trial.number} failed with exception: {e}")
        return float('-inf')

# Run Optuna study
optuna.logging.set_verbosity(optuna.logging.WARNING)  # Suppress excessive logs

# Seed the sampler explicitly. Without it create_study() falls back to an
# unseeded default sampler, so the suggested hyperparameters (and therefore
# the selected model) differed from run to run even though SEED is used
# everywhere else.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=5)

# Log best results (to the log file and to the cell output)
best_score = study.best_value
best_params = study.best_params
logger.info(f"Best trial score: {best_score:.4f}")
logger.info(f"Best parameters: {best_params}")
print(f"Best trial score: {best_score}")
print(f"Best parameters: {best_params}")

# Train final model with best parameters. The search only sampled a few keys,
# so the fixed settings are merged back into `best_params` in place first.
best_params.update({
    'objective': 'multiclass',
    'num_class': 6,
    'metric': 'multi_logloss',
    'device_type': 'gpu',  # Ensure GPU usage
    'seed': SEED,
    'deterministic': True,
})

# Same training budget as in the search: up to 200 rounds, stopping once the
# validation metric has not improved for 10 rounds.
model = lgb.train(
    params=best_params,
    train_set=train_data,
    num_boost_round=200,
    valid_sets=[valid_data],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 8945045
[LightGBM] [Info] Number of data points in the train set: 13845, number of used features: 50000
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 1445 dense feature groups (19.12 MB) transferred to GPU in 0.016031 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score -2.626925
[LightGBM] [Info] Start training from score -1.298729
[LightGBM] [Info] Start training from score -1.013698
[LightGBM] [Info] Start training from score -1.483383
[LightGBM] [Info] Start training from score -2.881527
[LightGBM] [Info] Start training from score -4.707366
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[54]	valid_0's multi_logloss: 0.983733
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 8945045
[LightGBM] [Info] Number of data points in the train set: 13845, number of used features: 50000
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGB

In [6]:
# Persist the trained booster as a text file inside model_dir and record the
# location in the training log.
model_file_path = os.path.join(model_dir, "lightgbm_model.txt")
model.save_model(filename=model_file_path)
logger.info(f"Model saved to: {model_file_path}")

# To reload the model in a later session, build a booster from the same file:
#   lgb.Booster(model_file=model_file_path)

In [7]:
def _predict_labels(booster, matrix):
    """Return the argmax class index per row, predicted at the booster's best iteration."""
    class_scores = booster.predict(matrix, num_iteration=booster.best_iteration)
    return np.argmax(class_scores, axis=1)


# Evaluate final model and Submit: quadratic-weighted kappa on the held-out split
y_pred = _predict_labels(model, test_X)
final_score = cohen_kappa_score(test_y, y_pred, weights='quadratic')
print("Final Cohen's Kappa Score:", final_score)

# Predict the competition test essays and undo the label shift
# (class index + 1 = score).
test_pred = _predict_labels(model, features)
test['score'] = test_pred + 1

# Write the submission file with the id and predicted score columns
test[['essay_id', 'score']].to_csv(path_or_buf='submission.csv', index=False)

Final Cohen's Kappa Score: 0.7241582888082739
