In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ef-msu-2024-comp-2/sample_submission.csv
/kaggle/input/ef-msu-2024-comp-2/train.csv
/kaggle/input/ef-msu-2024-comp-2/test.csv


# Downloading data

In [2]:
train_data = pd.read_csv('/kaggle/input/ef-msu-2024-comp-2/train.csv')
test_data = pd.read_csv('/kaggle/input/ef-msu-2024-comp-2/test.csv')
sample_submission = pd.read_csv('/kaggle/input/ef-msu-2024-comp-2/sample_submission.csv')
preprocessed_train_data = train_data.copy()
preprocessed_train_data.Review = preprocessed_train_data.Review.fillna('')
preprocessed_test_data = test_data.copy()
preprocessed_test_data.Review = preprocessed_test_data.Review.fillna('')
preprocessed_test_data = preprocessed_test_data.drop('id', axis=1)

# Import specific libraries

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler
from sentence_transformers import SentenceTransformer

# Feature engineering

In [None]:
model = SentenceTransformer('nli-roberta-base-v2')
sentences = [i for i in preprocessed_train_data.Review]
embeddings = model.encode(sentences)

# Split the data

In [6]:
X_train, X_test, y_train, y_test = train_test_split(embeddings, 
                                                    preprocessed_train_data.label, 
                                                    test_size=0.3, 
                                                    stratify=preprocessed_train_data.label,
                                                    random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42, stratify=y_train)

# LightGBM

## Bayesian optimization

In [8]:
def objective(trial):

    param = {
        "objective": "binary",
        "metric": "AUC",
        "verbosity": -1,
        "n_jobs": -1,
        "random_state": 42,
        "is_unbalance": True,
        "subsample": trial.suggest_float("subsample", 0.01, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        "boosting_type": 'gbdt',
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 10, 100),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 0.9),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.8, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_depth": trial.suggest_int('max_depth', 5, 30),
        'max_bin': trial.suggest_int('max_bin', 100,200),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),
    }
    
    dtrain = lgb.Dataset(X_train, label=y_train)

    gbm = lgb.train(param, dtrain)
    y_pred = gbm.predict(X_val)
    auc = roc_auc_score(y_val, y_pred)
    return auc

In [9]:
sampler = TPESampler(n_startup_trials=10, seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=25)

In [10]:
parameters = study.best_params
parameters['objective'] = "binary"
parameters['metric'] = "auc"
parameters['verbosity'] = -1
parameters['n_jobs'] = -1
parameters['random_state'] = 42
parameters['is_unbalance'] = True

result:

In [11]:
# parameters = {
#  'subsample': 0.14738224003629247,
#  'learning_rate': 0.08178932526821595,
#  'lambda_l1': 5.496552066193578e-06,
#  'lambda_l2': 1.0135663854176003e-08,
#  'num_leaves': 69,
#  'feature_fraction': 0.2637013343359327,
#  'bagging_fraction': 0.9748303281684237,
#  'bagging_freq': 5,
#  'min_child_samples': 23,
#  'max_depth': 18,
#  'max_bin': 187,
#  'min_data_in_leaf': 87,
#  'objective': 'binary',
#  'metric': 'auc',
#  'verbosity': -1,
#  'n_jobs': -1,
#  'random_state': 42,
#  'is_unbalance': True
# }

### Validation check after Bayesian optimization

In [12]:
# dtrain = lgb.Dataset(X_train, label=y_train)

# lgbm_model_train = lgb.train(params=parameters, 
#                             train_set=dtrain
#                           )
# y_pred_lgb = lgbm_model_train.predict(X_val)
# roc_auc_score(y_val, y_pred_lgb)

In [13]:
# y_pred_test_lgb = lgbm_model_train.predict(X_test)
# roc_auc_score(y_test, y_pred_test_lgb)

### Final learning after Bayesian optimization

In [None]:
new_sentences = [i for i in preprocessed_test_data.Review]
new_embeddings = model.encode(new_sentences)

In [15]:
dtrain = lgb.Dataset(embeddings, label=preprocessed_train_data.label)

lgbm_model_test = lgb.train(params=parameters, 
                            train_set=dtrain
                          )
y_pred_lgb = lgbm_model_test.predict(new_embeddings)

# Sample submission

In [16]:
submission = pd.DataFrame()
submission["id"] = test_data.id
submission["sentiment"] = y_pred_lgb
submission

Unnamed: 0,id,sentiment
0,0,0.052825
1,1,0.923337
2,2,0.169880
3,3,0.997365
4,4,0.004642
...,...,...
26346,26346,0.004381
26347,26347,0.019261
26348,26348,0.926437
26349,26349,0.979679


In [17]:
submission.to_csv("submission_baseline.csv", index=False)