In [1]:
pip install sklearn

Note: you may need to restart the kernel to use updated packages.




In [3]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-win_amd64.whl (1.0 MB)
     ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
     --------------- ------------------------ 0.4/1.0 MB 12.9 MB/s eta 0:00:01
     ---------------------------------- ----- 0.9/1.0 MB 11.2 MB/s eta 0:00:01
     ---------------------------------------- 1.0/1.0 MB 9.3 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.5
Note: you may need to restart the kernel to use updated packages.




In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import f1_score

# Load the training data
train_df = pd.read_csv('train.csv')

# Preprocess the text data using CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(train_df['text'])
y = train_df['label']

# Split the data into training and validation sets
train_feature, val_feature, train_target, val_target = train_test_split(X, y, test_size=0.2)

# Convert the feature matrices to float64
train_feature = train_feature.astype('float64')
val_feature = val_feature.astype('float64')

# Train a LightGBM model
params = {
    'objective': 'multiclass',
    'num_class': 8,
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt'
}

d_train = lgb.Dataset(train_feature, label=train_target)
d_val = lgb.Dataset(val_feature, label=val_target, reference=d_train)

model = lgb.train(params, d_train, valid_sets=[d_val], num_boost_round=100)

# Load the test data
test_df = pd.read_csv('test.csv')

# Preprocess the test data using the same vectorizer
test_feature = vectorizer.transform(test_df['text'])

# Convert the feature matrix to float64
test_feature = test_feature.astype('float64')

# Make predictions on the test data
y_pred = model.predict(test_feature)

# Convert the predicted probabilities to class labels
y_pred_labels = y_pred.argmax(axis=1)

# Evaluate the validation set using macro F1 score
val_pred_labels = model.predict(val_feature).argmax(axis=1)
val_f1_score = f1_score(val_target, val_pred_labels, average='macro')

# Print the validation F1 score
print("Validation F1 score:", val_f1_score)

# Save the submission file
submission_df = pd.DataFrame({'id': test_df['id'], 'label': y_pred_labels})
submission_df.to_csv('submission.csv', index=False)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 131419
[LightGBM] [Info] Number of data points in the train set: 37919, number of used features: 22503
[LightGBM] [Info] Start training from score -1.212598
[LightGBM] [Info] Start training from score -1.464686
[LightGBM] [Info] Start training from score -1.617620
[LightGBM] [Info] Start training from score -1.664571
[LightGBM] [Info] Start training from score -2.958943
[LightGBM] [Info] Start training from score -3.827824
[LightGBM] [Info] Start training from score -5.140530
[LightGBM] [Info] Start training from score -5.449457
[1]	valid_0's multi_logloss: 1.33551
[2]	valid_0's multi_logloss: 1.18755
[3]	valid_0's multi_logloss: 1.07303
[4]	valid_0's multi_logloss: 0.983041
[5]	valid_0's multi_logloss: 0.910062
[6]	valid_0's multi_logloss: 0.848431
[7]	valid_0's multi_logloss: 0.796606
[8]	valid_0's multi_logloss: 0.752445
[9]	valid_0's multi_logloss: 0.714457
[10]	valid_0's multi_logloss: 0.6802
[1