In [None]:
!pip install transformers torch
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

import os
# Print the full path of every file found below the Kaggle input directory
input_root = '/kaggle/input'
for folder, _, names in os.walk(input_root):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

In [2]:
# Read the train and test CSV files of the competition into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    )
)

In [3]:
# Replace every missing cell in both frames with the placeholder 'unknown'
train = train.fillna('unknown')
test = test.fillna('unknown')

In [4]:
# Combine 'keyword', 'location', and 'text' into one column.
# The untouched tweet is preserved in 'text_raw' the first time this cell runs, and
# 'text' is always rebuilt from it, so re-running the cell does not prepend the
# keyword and location a second time.
for frame in (train, test):
    if 'text_raw' not in frame.columns:
        frame['text_raw'] = frame['text']
    frame['text'] = frame['keyword'] + " " + frame['location'] + " " + frame['text_raw']

In [None]:
# Instantiate the pretrained tokenizer for the checkpoint used in this notebook
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

In [6]:
# Tokenization function
def tokenize_function(examples, max_length=128):
    """Encode the "text" entries of ``examples`` with the notebook-level tokenizer.

    Without an explicit ``max_length``, ``padding="max_length"`` pads every
    sequence to the model's maximum input size, which makes each short tweet as
    expensive as a full-length document. The default therefore caps sequences
    at 128 tokens; longer inputs are truncated.
    NOTE(review): 128 is assumed to be enough for keyword + location + tweet —
    confirm against the actual token-length distribution.

    Args:
        examples: Mapping with a "text" key holding the string(s) to encode.
        max_length: Length every sequence is padded/truncated to. Pass ``None``
            to fall back to the tokenizer's own maximum (the previous behavior).

    Returns:
        The object returned by calling the tokenizer on ``examples["text"]``.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [7]:
# Encode both splits; each frame is passed to the tokenizer helper as a dict of column lists
train_columns = train.to_dict(orient='list')
train_encodings = tokenize_function(train_columns)
test_columns = test.to_dict(orient='list')
test_encodings = tokenize_function(test_columns)

In [8]:
# Dataset class
class TweetDataset(Dataset):
    """Dataset that serves pre-computed encodings, optionally with labels.

    ``encodings`` maps feature names to per-example sequences and must contain
    the key 'input_ids' (its length defines the dataset size). ``labels`` is an
    optional per-example sequence; when given, each item gains a 'labels' entry.
    """

    def __init__(self, encodings, labels=None):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        # Compare against None explicitly instead of truth-testing the labels container
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(int(self.labels[idx]))
        return sample

In [9]:
# Wrap the encodings as datasets; only the training split is given target labels
train_labels = train['target'].values
train_dataset = TweetDataset(encodings=train_encodings, labels=train_labels)
test_dataset = TweetDataset(encodings=test_encodings)

In [10]:
# Train/validation split (80% / 20%)
# If this cell is re-run, `train_dataset` is already the Subset produced by a previous
# split; go back to the dataset it was drawn from so the split is always taken over
# the complete training data instead of shrinking with every execution.
full_train_dataset = (
    train_dataset.dataset
    if isinstance(train_dataset, torch.utils.data.Subset)
    else train_dataset
)
train_size = int(0.8 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size
# A dedicated, seeded generator makes the split identical across kernel restarts
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    full_train_dataset, [train_size, val_size], generator=split_generator
)

In [None]:
# Load the pretrained checkpoint with a sequence-classification head for two labels
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

In [12]:
# Hyperparameters used by the data loaders, the optimizer and the training loop
batch_size, learning_rate, epochs = 16, 2e-5, 3

In [13]:
# Batch iterators: the training split is shuffled, the validation split keeps its order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Optimizer & scheduler
# The AdamW class shipped with transformers is deprecated in favour of the PyTorch
# implementation. eps and weight_decay are passed explicitly with the values the
# transformers class used by default, so the optimisation hyperparameters stay the same.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0
)
# The training loop steps the scheduler once per batch, so the schedule spans
# (batches per epoch) * (number of epochs) steps, with no warm-up phase.
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
# Use the GPU when CUDA reports one as available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')
model.to(device)

In [16]:
# Training loop: one optimizer step and one scheduler step per mini-batch
model.train()
for epoch in range(epochs):
    for batch in train_loader:
        # Drop the gradients accumulated by the previous step
        optimizer.zero_grad()

        # Transfer every tensor of the batch to the selected device
        inputs = {}
        for name, tensor in batch.items():
            inputs[name] = tensor.to(device)

        # Forward pass; the loss is read from the model output
        loss = model(**inputs).loss

        # Backpropagate, then advance the optimizer and the learning-rate schedule
        loss.backward()
        optimizer.step()
        scheduler.step()

In [17]:
# Evaluation: collect predicted and true labels over the validation loader
model.eval()
val_predictions, val_true = [], []
with torch.no_grad():
    for batch in val_loader:
        # Transfer every tensor of the batch to the selected device
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the predicted class is the index of the largest logit
        logits = model(**batch).logits
        predicted = logits.argmax(dim=-1)

        val_predictions.extend(predicted.cpu().numpy())
        val_true.extend(batch['labels'].cpu().numpy())

In [None]:
# Score the validation predictions against the true labels
f1 = f1_score(y_true=val_true, y_pred=val_predictions)
print(f"F1 Score: {f1}")

In [19]:
# Predict on test data
# Switch the model to evaluation mode here as well, so this cell gives the same
# predictions even when the training cell was the last one to touch the model.
model.eval()
test_loader = DataLoader(test_dataset, batch_size=batch_size)
test_predictions = []
with torch.no_grad():
    for batch in test_loader:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass (the test dataset is built without labels, so only logits are used)
        outputs = model(**batch)

        # Predicted class = index of the largest logit
        preds = torch.argmax(outputs.logits, dim=-1)
        test_predictions.extend(preds.cpu().numpy())

In [20]:
# Build the submission table from the test ids and predictions, then write it to CSV
submission_columns = {'id': test['id'], 'target': test_predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv('my_submission.csv', index=False)