In [1]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.4-py3-none-win_amd64.whl (89.1 MB)
     ---------------------------------------- 0.0/89.1 MB ? eta -:--:--
     ---------------------------------------- 0.3/89.1 MB 10.6 MB/s eta 0:00:09
     ---------------------------------------- 0.8/89.1 MB 9.6 MB/s eta 0:00:10
      --------------------------------------- 1.1/89.1 MB 9.1 MB/s eta 0:00:10
      --------------------------------------- 1.5/89.1 MB 8.2 MB/s eta 0:00:11
      --------------------------------------- 1.9/89.1 MB 8.7 MB/s eta 0:00:10
     - -------------------------------------- 2.3/89.1 MB 8.1 MB/s eta 0:00:11
     - -------------------------------------- 2.7/89.1 MB 8.1 MB/s eta 0:00:11
     - -------------------------------------- 3.0/89.1 MB 7.9 MB/s eta 0:00:11
     - -------------------------------------- 3.4/89.1 MB 8.0 MB/s eta 0:00:11
     - -------------------------------------- 3.7/89.1 MB 8.2 MB/s eta 0:00:11
     - -------------------------------------- 3.8/89.1 MB



In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import f1_score

# Read the labelled training set and the unlabelled test set from disk.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# partition identical from one run to the next.
train_text, val_text, train_label, val_label = train_test_split(
    train_df['text'],
    train_df['label'],
    test_size=0.2,
    random_state=42,
)

# Bag-of-words features: the vocabulary is learned from the training split
# only, then the same fitted vectorizer encodes every split.
vectorizer = CountVectorizer()
vectorizer.fit(train_text)
train_feature, val_feature, test_feature = (
    vectorizer.transform(texts)
    for texts in (train_text, val_text, test_df['text'])
)

# Wrap the sparse features and their labels in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(train_feature, label=train_label)
dval = xgb.DMatrix(val_feature, label=val_label)

# Booster settings for multi-class classification.
params = {
    'objective': 'multi:softmax',  # predict() yields a class index, not probabilities
    'num_class': 8,  # NOTE(review): presumably labels are coded 0..7 -- confirm against train.csv
    'max_depth': 6,
    'eta': 0.3,
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bytree': 1
}

# Train for up to 1000 rounds, scoring the validation split each round.
# Training halts once the validation metric has failed to improve for 10
# consecutive rounds; progress is logged every 50 rounds.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dval, 'eval')],
    early_stopping_rounds=10,
    verbose_eval=50,
)

# Make predictions on the test data and save them to a CSV file.
# Training ran with early stopping, so the booster may still hold rounds that
# were fitted after the best validation score; restrict prediction to the
# rounds up to and including the best one.
test_pred = model.predict(
    xgb.DMatrix(test_feature),
    iteration_range=(0, model.best_iteration + 1),
)
# The multi:softmax objective returns class indices as floats; cast them so
# the CSV holds integer labels (e.g. 3 rather than 3.0).
test_df['label'] = test_pred.astype(int)
test_df[['id', 'label']].to_csv('submission.csv', index=False)

[0]	eval-mlogloss:1.54705
[50]	eval-mlogloss:0.37543
[100]	eval-mlogloss:0.31992
[150]	eval-mlogloss:0.29708
[200]	eval-mlogloss:0.28441
[250]	eval-mlogloss:0.27790
[300]	eval-mlogloss:0.27316
[350]	eval-mlogloss:0.27090
[356]	eval-mlogloss:0.27080
