In [1]:
import pandas as pd
# Load the tweet dataset from the working directory, then end the cell on
# `.columns` so the notebook displays which fields are available.
data = pd.read_csv(filepath_or_buffer="Disaster.csv")
data.columns

Index(['Name', 'UserName', 'Timestamp', 'Verified', 'Tweets', 'Comments',
       'Retweets', 'Likes', 'Impressions', 'Tags', 'Tweet Link', 'Tweet ID',
       'Disaster'],
      dtype='object')

In [2]:
# Pull the tweet text and the disaster label out of the frame as plain
# Python lists (one entry per row, in row order).
texts, labels = (data[column].tolist() for column in ('Tweets', 'Disaster'))

In [3]:
# Class distribution of the disaster labels
data['Disaster'].value_counts()

Disaster
Drought       770
Wildfire      540
Earthquake    500
Floods        436
Hurricanes    178
Tornadoes     135
Name: count, dtype: int64

In [4]:
# Create a mapping dictionary for disaster types
# (string label -> integer class id used as the training target)
disaster_mapping = {
    'Drought': 0,
    'Wildfire': 1,
    'Earthquake': 2,
    'Floods': 3,
    'Hurricanes': 4,
    'Tornadoes': 5
}

# Apply the mapping to the Disaster column.
# `Series.map` with a dict yields NaN for every value that is not a key, so
# running this cell a second time (when the column already holds the integer
# codes) would wipe out every label. Only map while the column still holds
# the raw, non-numeric labels, which makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data['Disaster']):
    data['Disaster'] = data['Disaster'].map(disaster_mapping)

In [5]:
# Class distribution again, to inspect the column after the mapping cell
data['Disaster'].value_counts()

Disaster
0    770
1    540
2    500
3    436
4    178
5    135
Name: count, dtype: int64

In [6]:
import torch
from sklearn.model_selection import train_test_split
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

# Load the model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Size the classification head from the label mapping instead of a
# hard-coded 6, so the head and the label ids cannot drift apart if a
# disaster type is added to or removed from `disaster_mapping`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(disaster_mapping)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Data Preparation: Select 1000 random samples from the dataset.
# The request is capped at the number of available rows because `sample`
# raises when asked for more rows than exist (sampling without replacement).
data = data.sample(n=min(1000, len(data)), random_state=42)

# Split the data into train and test sets.
# The classes are imbalanced (770 Drought rows vs. 135 Tornadoes rows in the
# full dataset), so stratify on the label to keep every disaster type
# represented in both splits in the same proportions.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['Tweets'],
    data['Disaster'],
    test_size=0.2,
    random_state=42,
    stratify=data['Disaster'],
)

In [8]:
# Tokenize the texts: the train split first, then the test split, each with
# truncation and padding enabled.
train_encodings, test_encodings = (
    tokenizer(split.tolist(), truncation=True, padding=True)
    for split in (train_texts, test_texts)
)

In [9]:
# Convert to torch tensors
class DisasterDataset(torch.utils.data.Dataset):
    """Dataset that pairs per-example encodings with their labels.

    Args:
        encodings: mapping from feature name to an indexable sequence holding
            one entry per example.
        labels: indexable sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field, followed by the label
        under the ``'labels'`` key.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [10]:
# Wrap each split's encodings and labels (as plain lists) in a dataset
train_dataset = DisasterDataset(encodings=train_encodings, labels=train_labels.tolist())
test_dataset = DisasterDataset(encodings=test_encodings, labels=test_labels.tolist())

# Create a DataLoader over the training set: shuffled mini-batches of 16
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
)