In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-4.27.1-py3-none-any.whl (6.7 MB)
     ---------------------------------------- 0.0/6.7 MB ? eta -:--:--
      --------------------------------------- 0.1/6.7 MB 2.2 MB/s eta 0:00:04
     - -------------------------------------- 0.3/6.7 MB 3.3 MB/s eta 0:00:02
     -- ------------------------------------- 0.4/6.7 MB 2.8 MB/s eta 0:00:03
     --- ------------------------------------ 0.7/6.7 MB 3.4 MB/s eta 0:00:02
     ---- ----------------------------------- 0.8/6.7 MB 3.4 MB/s eta 0:00:02
     ----- ---------------------------------- 0.9/6.7 MB 3.1 MB/s eta 0:00:02
     ----- ---------------------------------- 0.9/6.7 MB 2.8 MB/s eta 0:00:03
     ----- ---------------------------------- 1.0/6.7 MB 2.5 MB/s eta 0:00:03
     ------ --------------------------------- 1.1/6.7 MB 2.7 MB/s eta 0:00:03
     ------- -------------------------------- 1.2/6.7 MB 2.6 MB/s eta 0:00:03
     -------- ------------------------------- 1.4/6.7 MB 2.8 MB/s



In [2]:
pip install Dataset

Note: you may need to restart the kernel to use updated packages.Collecting Dataset
  Downloading dataset-1.6.0-py2.py3-none-any.whl (18 kB)
Collecting alembic>=0.6.2
  Downloading alembic-1.10.2-py3-none-any.whl (212 kB)
     ---------------------------------------- 0.0/212.2 kB ? eta -:--:--
     -------------------- ----------------- 112.6/212.2 kB 6.8 MB/s eta 0:00:01
     ------------------------- ------------ 143.4/212.2 kB 1.7 MB/s eta 0:00:01
     -------------------------------------- 212.2/212.2 kB 1.6 MB/s eta 0:00:00
Collecting banal>=1.0.1
  Downloading banal-1.0.6-py2.py3-none-any.whl (6.1 kB)
Collecting sqlalchemy<2.0.0,>=1.3.2
  Downloading SQLAlchemy-1.4.46-cp39-cp39-win_amd64.whl (1.6 MB)
     ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
     -- ------------------------------------- 0.1/1.6 MB 2.2 MB/s eta 0:00:01
     -------- ------------------------------- 0.3/1.6 MB 3.4 MB/s eta 0:00:01
     ----------- ---------------------------- 0.5/1.6 MB 



In [2]:
pip install --upgrade datasets

Collecting datasetsNote: you may need to restart the kernel to use updated packages.





  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
     ---------------------------------------- 0.0/469.0 kB ? eta -:--:--
     ------- ------------------------------- 92.2/469.0 kB 2.6 MB/s eta 0:00:01
     -------------- ----------------------- 174.1/469.0 kB 1.7 MB/s eta 0:00:01
     ----------------------------- -------- 358.4/469.0 kB 2.5 MB/s eta 0:00:01
     --------------------------------- ---- 409.6/469.0 kB 2.3 MB/s eta 0:00:01
     -------------------------------------- 469.0/469.0 kB 2.3 MB/s eta 0:00:00
Collecting fsspec[http]>=2021.11.1
  Downloading fsspec-2023.3.0-py3-none-any.whl (145 kB)
     ---------------------------------------- 0.0/145.4 kB ? eta -:--:--
     ---------------- ---------------------- 61.4/145.4 kB 3.2 MB/s eta 0:00:01
     -------------------------------------  143.4/145.4 kB 1.7 MB/s eta 0:00:01
     -------------------------------------- 145.4/145.4 kB 1.7 MB/s eta 0:00:00
Collecting pyarrow>=6.0.0
  Downloading pyarrow-11.0.0-cp39-cp39-w

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Normalize text with CountVectorizer's preprocessor.
# NOTE: only the preprocessing step is used here (with these settings it
# lowercases the text); `stop_words` is applied by the analyzer at tokenization
# time, so no stop words are removed by this step.
vectorizer = CountVectorizer(stop_words='english')
# Build the preprocessor once instead of once per row.
text_preprocessor = vectorizer.build_preprocessor()
# split() + join() collapses every run of whitespace into a single space.
train_data['text'] = train_data['text'].apply(lambda x: ' '.join(text_preprocessor(x).split()))
test_data['text'] = test_data['text'].apply(lambda x: ' '.join(text_preprocessor(x).split()))

# Encode labels as consecutive integer class ids
label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['label'])

# Split train_data into train (90%) and validation (10%) sets
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

# Tokenize text data using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_data(data, tokenizer):
    """Tokenize the 'text' column of ``data`` and return the tokenizer's output.

    Args:
        data: Object whose ``'text'`` entry supports ``.tolist()``
            (e.g. a pandas DataFrame with a ``text`` column).
        tokenizer: Callable invoked with the list of texts plus the padding,
            truncation, length and tensor-format options below.

    Returns:
        Whatever ``tokenizer`` returns for the given texts.
    """
    texts = data['text'].tolist()
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'max_length': 512,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **tokenizer_options)

# Encode the three splits in the same order: train, validation, test.
train_encodings, val_encodings, test_encodings = [
    encode_data(split_frame, tokenizer)
    for split_frame in (train_data, val_data, test_data)
]

# Create PyTorch Dataset
class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping of feature name -> indexable collection of
            per-example values (tensors or nested lists). Must contain an
            ``'input_ids'`` entry, which defines the dataset length.
        labels: Optional indexable collection of labels aligned with the
            encodings. When ``None`` (e.g. for test data) items carry no
            ``'labels'`` key.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a tensor that does not share storage with it."""
        if torch.is_tensor(value):
            # torch.tensor(<tensor>) emits a copy-construct UserWarning on every
            # call; clone().detach() is the recommended equivalent.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx`` (plus 'labels' if set)."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the 'input_ids' entry."""
        # Key access works for plain dicts as well as tokenizer outputs.
        return len(self.encodings['input_ids'])

train_dataset = NewsDataset(train_encodings, train_data['label'].tolist())
val_dataset = NewsDataset(val_encodings, val_data['label'].tolist())
test_dataset = NewsDataset(test_encodings)  # no labels for the test split

# Load BERT model
# Size the classification head from the fitted label encoder instead of a
# hard-coded count, so the head always matches the classes found in train.csv.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
)

# Set up Trainer and TrainingArguments
def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with a single key ``'f1_macro'``.
    """
    logits, labels = eval_pred
    predicted_classes = np.asarray(logits).argmax(axis=-1)
    macro_f1 = f1_score(labels, predicted_classes, average='macro')
    return {'f1_macro': macro_f1}

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='steps',
    # Without an explicit eval_steps the Trainer falls back to logging_steps,
    # i.e. a full validation pass every 10 steps. Evaluate on the same cadence
    # as checkpointing so load_best_model_at_end compares saved checkpoints.
    eval_steps=500,
    save_steps=500,
    save_total_limit=1,
    load_best_model_at_end=True,
    # Matches the key returned by compute_metrics.
    metric_for_best_model='f1_macro',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model on the training split
trainer.train()

# Run inference on the test split and take the highest-scoring class per row
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions.argmax(axis=1)

# Map class ids back to the original label values and write the submission
predicted_label_names = label_encoder.inverse_transform(predicted_labels)
submission = pd.DataFrame({
    'id': test_data['id'],
    'label': predicted_label_names,
})
submission.to_csv('submission.csv', index=False)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]