In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

In [9]:
# Competition test set: later cells tokenize its 'text' column and use 'id'
# for the submission file.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
test_df = pd.read_csv(
    filepath_or_buffer='/Users/merlesteffen/Documents/GitHub/Disaster_Tweets_Classification/Data/test.csv'
)

In [10]:
# Labeled data: supplies the 'text' inputs and 'target' labels used below.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
train_df = pd.read_csv(
    filepath_or_buffer='/Users/merlesteffen/Documents/GitHub/Disaster_Tweets_Classification/Data/train.csv'
)

# Split Labeled Data into Train and Hold-out Sets

In [11]:
# Hold out 20% of the labeled rows for evaluation; the fixed seed keeps the
# split reproducible. Note that df_test is this hold-out slice of train_df,
# not the separately loaded test_df.
df_train, df_test = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
)

# BERT

## Tokenization

In [12]:
# Tokenizer belonging to the 'bert-base-uncased' checkpoint fine-tuned below.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

In [13]:
# Encode the train and hold-out splits with identical settings: sequences are
# truncated at 512 tokens and padding is enabled.
train_encodings, test_encodings = (
    tokenizer(frame['text'].tolist(), truncation=True, padding=True, max_length=512)
    for frame in (df_train, df_test)
)

## Creating Datasets

In [19]:
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves tokenizer encodings as per-sample tensor dicts.

    Args:
        encodings: Mapping from feature name (e.g. ``'input_ids'``) to a
            sequence holding one entry per sample.
        labels: Optional sequence with one label per sample. Leave as ``None``
            for unlabeled data (inference); items then carry no ``'labels'`` key.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, plus ``'labels'`` when labels exist."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the sample count: from the labels, or from the first feature if unlabeled."""
        if self.labels is not None:
            return len(self.labels)
        # Unlabeled data: every feature holds one entry per sample, so any of
        # them gives the length; an empty mapping counts as zero samples.
        return len(next(iter(self.encodings.values()), ()))

In [21]:
# Pair each split's encodings with the matching 'target' labels.
train_dataset = TextDataset(
    encodings=train_encodings,
    labels=df_train['target'].tolist(),
)
test_dataset = TextDataset(
    encodings=test_encodings,
    labels=df_test['target'].tolist(),
)

## Fine-Tuning BERT

In [22]:
# Pretrained BERT encoder with a sequence-classification head; the head's
# weights are newly initialized and get trained during fine-tuning.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Fine-tuning hyperparameters
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # Schedule
    num_train_epochs=3,
    warmup_steps=500,
    # Batch sizes per device
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Regularization
    weight_decay=0.01,
)

In [24]:
# Bundle model, hyperparameters and training data in a Trainer.
# No eval_dataset is passed; evaluation is done separately via trainer.predict.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)


* 'schema_extra' has been renamed to 'json_schema_extra'


In [25]:
# Fine-tune the model on train_dataset. The returned TrainOutput (global step,
# training loss, runtime metrics) is displayed as the cell result.
trainer.train()

Step,Training Loss


TrainOutput(global_step=2286, training_loss=0.38680574712477955, metrics={'train_runtime': 475.585, 'train_samples_per_second': 38.416, 'train_steps_per_second': 4.807, 'total_flos': 788654832890400.0, 'train_loss': 0.38680574712477955, 'epoch': 3.0})

# Make Predictions

In [None]:
# Run the fine-tuned model over the hold-out split.
predictions = trainer.predict(test_dataset)

# The highest-scoring class in each row becomes the predicted label.
pred_labels = predictions.predictions.argmax(axis=1)

# Evaluate Model

In [None]:
# Share of hold-out tweets whose predicted label equals the true 'target'.
accuracy = accuracy_score(y_true=df_test['target'], y_pred=pred_labels)
print(f"Accuracy: {accuracy}")

In [None]:
# F1 score of the hold-out predictions.
# The label column is 'target' — the same column the accuracy cell and the
# dataset construction read; a 'target_column' column is never created.
f1 = f1_score(df_test['target'], pred_labels)
print(f"F1 Score: {f1}")

# Train on the Entire Training Set

## Tokenization

In [27]:
# Re-encode for the final run: all labeled tweets (train_df) and the competition
# test set (test_df), with the same tokenizer settings as before.
# This rebinds train_encodings / test_encodings from the earlier split.
train_encodings, test_encodings = (
    tokenizer(frame['text'].tolist(), truncation=True, padding=True, max_length=512)
    for frame in (train_df, test_df)
)

## Convert to Tensors and Create Datasets

In [None]:
# Convert training data to tensors
train_seq = torch.tensor(train_encodings['input_ids'])
train_mask = torch.tensor(train_encodings['attention_mask'])
# The label column of train_df is 'target' (the column the earlier split-based
# cells read); 'target_column' does not exist.
train_y = torch.tensor(train_df['target'].tolist())

# Convert test data to tensors
test_seq = torch.tensor(test_encodings['input_ids'])
test_mask = torch.tensor(test_encodings['attention_mask'])

# Build each dataset as a sequence of per-sample feature dicts instead of
# TensorDataset tuples: Trainer's default collator batches samples by key and
# feeds them to the model as keyword arguments, which tuple samples cannot
# provide. A list already satisfies the map-style dataset protocol
# (__getitem__ / __len__), so it also works with DataLoader.
train_data = [
    {'input_ids': ids, 'attention_mask': mask, 'labels': label}
    for ids, mask, label in zip(train_seq, train_mask, train_y)
]
# Test samples carry no 'labels' key; they are only used for prediction.
test_data = [
    {'input_ids': ids, 'attention_mask': mask}
    for ids, mask in zip(test_seq, test_mask)
]

## Create DataLoader

In [None]:
# Batch size shared by the DataLoaders here and the TrainingArguments below
batch_size = 8

# One loader per dataset; shuffling is left at the DataLoader default.
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size)


## Train the model

In [None]:
# Reload the pretrained checkpoint so the final model is fine-tuned afresh on
# the full labeled set rather than continuing from the split-based run.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

# Training configuration; settings not listed here keep their TrainingArguments defaults.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
)

# Trainer for the full-data run
trainer = Trainer(model=model, args=training_args, train_dataset=train_data)

In [None]:
# Fine-tune the freshly loaded model on train_data (built from all of train_df).
trainer.train()

## Make Predictions

In [None]:
# Score the competition test samples with the model trained on all labeled data.
predictions = trainer.predict(test_data)

# The highest-scoring class in each row becomes the predicted label.
pred_labels = predictions.predictions.argmax(axis=1)

# Store the labels alongside their tweets; this adds a column to test_df in place.
test_df['predicted_label'] = pred_labels

# Create Submission File

In [None]:
# Keep the tweet id and expose the prediction under the column name 'target'.
submission_df = test_df[['id', 'predicted_label']].rename(
    columns={'predicted_label': 'target'}
)

# Write the submission without the DataFrame index.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
submission_df.to_csv(
    '/Users/merlesteffen/Documents/GitHub/Disaster_Tweets_Classification/Data/submissions/submission.csv',
    index=False,
)