# Twitter Dataset

## import libraries

In [1]:
import torch
import numpy as np
import pandas as pd
import spacy
from torch.utils.data import Dataset
from transformers import EvalPrediction, Trainer, TrainingArguments, DistilBertTokenizerFast
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Load SpaCy model
nlp = spacy.load('en_core_web_sm')

# Load the tokenizer
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


In [2]:
class TweetDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = self.preprocess_text(texts)
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Check if we have labels
        if self.labels is not None:
            label = self.labels[idx]
            return {
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'labels': torch.tensor(label, dtype=torch.long),
            }
        else:  # Return only inputs for test data
            return {
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
            }


    def preprocess_text(self, texts):
        preprocessed_texts = []
        for text in texts:
            # Apply SpaCy pipeline on the text
            doc = nlp(text)
            # Lemmatize the text and join the words back into a single string
            lemma_text = " ".join([token.lemma_ for token in doc])
            preprocessed_texts.append(lemma_text)
        return preprocessed_texts


def compute_metrics(eval_pred: EvalPrediction):
    labels = eval_pred.label_ids
    preds = np.argmax(eval_pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


## Load The Data

In [3]:
# Load the CSV data
train_df = pd.read_csv('../data/twitter/train.csv')
test_df = pd.read_csv('../data/twitter/test.csv')
print(train_df.head(),'\n\n', test_df.head())

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN  Ruby AK  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1   

    id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan


In [4]:
# Handle missing values by filling with a placeholder value
train_df = train_df.fillna('None')
test_df = test_df.fillna('None')
print(train_df.head(),'\n\n', test_df.head())

   id keyword location                                               text  \
0   1    None     None  Our Deeds are the Reason of this #earthquake M...   
1   4    None     None             Forest fire near La Ronge Sask. Canada   
2   5    None     None  All residents asked to 'shelter in place' are ...   
3   6    None     None  13,000 people receive #wildfires evacuation or...   
4   7    None  Ruby AK  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1   

    id keyword location                                               text
0   0    None     None                 Just happened a terrible car crash
1   2    None     None  Heard about #earthquake is different cities, s...
2   3    None     None  there is a forest fire at spot pond, geese are...
3   9    None     None           Apocalypse lighting. #Spokane #wildfires
4  11    None     None      Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
# Combine train and test data
combined_df = pd.concat([train_df, test_df])
print(combined_df.head())

   id keyword location                                               text  \
0   1    None     None  Our Deeds are the Reason of this #earthquake M...   
1   4    None     None             Forest fire near La Ronge Sask. Canada   
2   5    None     None  All residents asked to 'shelter in place' are ...   
3   6    None     None  13,000 people receive #wildfires evacuation or...   
4   7    None  Ruby AK  Just got sent this photo from Ruby #Alaska as ...   

   target  
0     1.0  
1     1.0  
2     1.0  
3     1.0  
4     1.0  


In [6]:
# One-hot encode the 'keyword' field
combined_df = pd.get_dummies(combined_df, columns=['keyword'])
print(combined_df.head())

   id location                                               text  target  \
0   1     None  Our Deeds are the Reason of this #earthquake M...     1.0   
1   4     None             Forest fire near La Ronge Sask. Canada     1.0   
2   5     None  All residents asked to 'shelter in place' are ...     1.0   
3   6     None  13,000 people receive #wildfires evacuation or...     1.0   
4   7  Ruby AK  Just got sent this photo from Ruby #Alaska as ...     1.0   

   keyword_None  keyword_ablaze  keyword_accident  keyword_aftershock  \
0          True           False             False               False   
1          True           False             False               False   
2          True           False             False               False   
3          True           False             False               False   
4          True           False             False               False   

   keyword_airplane%20accident  keyword_ambulance  ...  keyword_weapons  \
0                      

In [7]:
# Split back into train and test data
train_df = combined_df[:len(train_df)]
test_df = combined_df[len(train_df):]
print(train_df.head(),'\n\n', test_df.head())

   id location                                               text  target  \
0   1     None  Our Deeds are the Reason of this #earthquake M...     1.0   
1   4     None             Forest fire near La Ronge Sask. Canada     1.0   
2   5     None  All residents asked to 'shelter in place' are ...     1.0   
3   6     None  13,000 people receive #wildfires evacuation or...     1.0   
4   7  Ruby AK  Just got sent this photo from Ruby #Alaska as ...     1.0   

   keyword_None  keyword_ablaze  keyword_accident  keyword_aftershock  \
0          True           False             False               False   
1          True           False             False               False   
2          True           False             False               False   
3          True           False             False               False   
4          True           False             False               False   

   keyword_airplane%20accident  keyword_ambulance  ...  keyword_weapons  \
0                      

In [8]:
# train_df = train_df.copy()
# test_df = test_df.copy()
train_df.loc[:, 'input'] = train_df.apply(lambda row: f"{row['text']}", axis=1)
test_df.loc[:, 'input'] = test_df.apply(lambda row: f"{row['text']}", axis=1)

print(train_df.head(),'\n\n', test_df.head())

   id location                                               text  target  \
0   1     None  Our Deeds are the Reason of this #earthquake M...     1.0   
1   4     None             Forest fire near La Ronge Sask. Canada     1.0   
2   5     None  All residents asked to 'shelter in place' are ...     1.0   
3   6     None  13,000 people receive #wildfires evacuation or...     1.0   
4   7  Ruby AK  Just got sent this photo from Ruby #Alaska as ...     1.0   

   keyword_None  keyword_ablaze  keyword_accident  keyword_aftershock  \
0          True           False             False               False   
1          True           False             False               False   
2          True           False             False               False   
3          True           False             False               False   
4          True           False             False               False   

   keyword_airplane%20accident  keyword_ambulance  ...  keyword_whirlwind  \
0                    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.loc[:, 'input'] = train_df.apply(lambda row: f"{row['text']}", axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.loc[:, 'input'] = test_df.apply(lambda row: f"{row['text']}", axis=1)


In [9]:
# Extract 'input' as input and 'target' as labels from the train data
train_texts = train_df['input'].tolist()
train_labels = train_df['target'].tolist()
print(train_texts[:5],'\n\n', train_labels[:5])


['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all', 'Forest fire near La Ronge Sask. Canada', "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected", '13,000 people receive #wildfires evacuation orders in California ', 'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school '] 

 [1.0, 1.0, 1.0, 1.0, 1.0]


In [10]:
# For the test data, we only need the 'input' column
test_texts = test_df['input'].tolist()
print(test_texts[:5],'\n\n')


['Just happened a terrible car crash', 'Heard about #earthquake is different cities, stay safe everyone.', 'there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all', 'Apocalypse lighting. #Spokane #wildfires', 'Typhoon Soudelor kills 28 in China and Taiwan'] 




In [11]:
# Split the train data into train and validation subsets
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2)

# Print the sizes of the training and validation subsets
print(f"Training set size: {len(train_texts)}")
print(f"Validation set size: {len(val_texts)}")


Training set size: 6090
Validation set size: 1523


In [12]:
# Initialize the datasets
train_dataset = TweetDataset(train_texts, train_labels, tokenizer)
val_dataset = TweetDataset(val_texts, val_labels, tokenizer)
test_dataset = TweetDataset(test_texts, None, tokenizer)  # No labels for the test data


## Load Bert

In [20]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # num_labels=2 for binary classification


### Split the data into training and validation sets


In [27]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=1e-5)


In [1]:
from transformers import Trainer, TrainingArguments

# Define the training arguments
training_args = TrainingArguments(
    output_dir='../models/results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    evaluation_strategy='epoch',     # evaluation is performed at the end of each epoch
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics      # function to compute metrics
)

# Train the model
trainer.train()


KeyboardInterrupt: 

In [25]:
# from transformers import Trainer, TrainingArguments

# training_args = TrainingArguments(
#     output_dir='../models/results',  # output directory
#     num_train_epochs=3,              # total number of training epochs
#     per_device_train_batch_size=16,  # batch size per device during training
#     per_device_eval_batch_size=64,   # batch size for evaluation
#     warmup_steps=500,                # number of warmup steps for learning rate scheduler
#     weight_decay=0.01,               # strength of weight decay
#     logging_dir='../models/logs',    # directory for storing logs
#     learning_rate=1e-5,              # learning rate
# )

# trainer = Trainer(
#     model=model,                     # the instantiated 🤗 Transformers model to be trained
#     args=training_args,              # training arguments, defined above
#     train_dataset=train_dataset,     # training dataset
#     eval_dataset=val_dataset,        # evaluation dataset
# )


In [22]:
# training_args = TrainingArguments(
#     output_dir='../models/results',
#     num_train_epochs=3,  # reduce from 5 to 3
#     per_device_train_batch_size=32,  # increase from 16 to 32
#     gradient_accumulation_steps=2,  # new line for gradient accumulation
#     per_device_eval_batch_size=64,
#     warmup_steps=500,
#     weight_decay=0.01,
#     learning_rate=3e-5,  # increase from 2e-5 to 3e-5
#     logging_dir='../models/logs',
#     logging_steps=10,  # Log every 10 steps
#     evaluation_strategy="epoch",  # Evaluation and Save happens at every epoch
#     save_strategy="epoch",  # Save the model after every epoch
#     load_best_model_at_end=True,
#     #fp16=True,  # new line for mixed precision training
# )

# # Initialize the Trainer with the correct datasets
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,  # Use validation dataset for evaluation
#     compute_metrics=compute_metrics,
# )

In [None]:
# trainer.train()


Plot confusion matrix

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

conf_matrix = confusion_matrix(train_texts, train_preds, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix,
                              display_labels=[0, 1])
disp.plot()

plt.show()

### Submission Output to CSV file


In [None]:
# Making predictions on test data
test_predictions = trainer.predict(test_dataset)

# We take the output class with the highest probability
test_preds = np.argmax(test_predictions.predictions, axis=1)

# Prepare a DataFrame with test IDs and predictions
submission_df = pd.DataFrame({
    'id': test_df['id'],  # Assuming that 'test_df' is the DataFrame with your test data
    'target': test_preds
})

# Save the DataFrame into a CSV file
submission_df.to_csv('submission.csv', index=False)


In the above script, `compute_metrics` is a function that calculates precision, recall, f1, and accuracy. This function is then passed to the `Trainer`, which uses it to compute these metrics after each evaluation.

`TrainingArguments` is initialized with parameters that dictate when logging and saving should occur. `logging_steps=10` means that the training loss is logged every 10 steps, and `evaluation_strategy="epoch"` and `save_strategy="epoch"` mean that an evaluation and a model save are performed after every epoch, respectively.

By setting `load_best_model_at_end=True`, the `Trainer` will automatically load the best model in terms of evaluation loss when the training is finished.

This assumes you want to perform evaluation and save the model after every epoch and log every 10 steps, but you might want to adjust these values depending on the size of your dataset and the resources you have available.




Hyperparameters in a machine learning model are parameters whose values are set before the learning process begins and they play a crucial role in the performance of the model. 

In the context of the Hugging Face's `Trainer`, here are some important hyperparameters and how to set them:

1. **Learning Rate**: This is one of the most important hyperparameters. If it's too high, training may diverge; if it's too low, training may be too slow or get stuck in a poor local minimum. You can set it in `TrainingArguments` with the `learning_rate` argument.

2. **Batch Size**: This is the number of samples that will be propagated through the network at once. A larger batch size allows you to leverage hardware optimizations and train your model faster, but it might also result in worse generalization performance. You can set it in `TrainingArguments` with the `per_device_train_batch_size` and `per_device_eval_batch_size` arguments.

3. **Number of Epochs**: This is the number of times the entire dataset is passed forward and backward through the network. You can set it in `TrainingArguments` with the `num_train_epochs` argument.

4. **Weight Decay**: This is a regularization technique that helps prevent your model from overfitting. You can set it in `TrainingArguments` with the `weight_decay` argument.

Here's an example of setting these hyperparameters:

```python
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,  # from 3 to 5
    per_device_train_batch_size=32,  # from 16 to 32
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=2e-5,  # added this line to set learning rate
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
```

In this example, the number of epochs is increased to 5, the training batch size is increased to 32, and the learning rate is set to 2e-5. 

Hyperparameter tuning can be a complex process and usually involves trial and error. It might be beneficial to perform a systematic hyperparameter search or optimization using tools like Optuna or Ray Tune, especially for larger and more complex models and datasets.