***Download and Preprocess Data***

In [1]:
import pandas as pd

In [None]:
# Read only the tweet text and its 0/1 target column from the hosted CSV.
csv_url = "https://github.com/laxmimerit/All-CSV-ML-Data-Files-Download/raw/refs/heads/master/twitter_disaster_tweets.csv"
df = pd.read_csv(csv_url, usecols=['text', 'target'])
df.head()

In [5]:
# Shuffle the rows with a fixed seed so that a fresh Restart & Run All yields
# the same row order, and therefore the same seeded train/test split later on.
# reset_index(drop=True) discards the old index so rows are numbered 0..n-1.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# Class balance check: how many tweets carry each target value.
print(df['target'].value_counts())
df.head()

target
0    4342
1    3271
Name: count, dtype: int64


Unnamed: 0,text,target
0,Serephina the Siren &lt;3 http://t.co/k6UEtsnLHT,0
1,LLF TALK WORLD NEWS U.S. in record hurricane ...,1
2,@XGN_Infinity @Ronin_Carbon HAHAH Mutual host ...,0
3,What's up man?,0
4,@Im2aD I was going to tell him but you were bo...,0


***Load Data with Hugging Face Datasets Library***

In [9]:
# Rename the label column from "target" to "labels" — presumably because that
# is the column name the Trainer/model expects; verify against the HF docs.
df = df.rename(columns={"target": "labels"})

In [10]:
from datasets import Dataset

# Wrap the shuffled DataFrame (columns "text" and "labels") in a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)

In [11]:
# Hold out 20% of the rows for evaluation. The result is keyed by 'train' and
# 'test' (both are used when building the Trainer); seed=42 fixes which rows
# land in each split for a given input order.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [13]:
# Human-readable class names for the two label ids. The reverse lookup is
# derived from the forward one so the two mappings cannot drift apart.
id2label = {0: "general", 1: "disaster"}
label2id = {name: idx for idx, name in id2label.items()}

***Data Tokenisation***

In [14]:
from transformers import AutoTokenizer
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f'Using device: {device}')

# Checkpoint id shared by the tokenizer here and the classification model
# loaded later, so both come from the same pretrained source.
model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'
# use_fast=True requests the fast tokenizer implementation for this checkpoint.
tokeniser = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)


Using device: cuda


In [16]:
def tokenize_function(batch):
    """Tokenise the ``text`` field of ``batch`` with the notebook-level tokeniser.

    Truncation is enabled with a cap of 100 tokens, and padding is turned on
    (per the Transformers padding docs, ``padding=True`` pads to the longest
    sequence in the batch it is given).

    Args:
        batch: mapping with a ``'text'`` entry holding the texts to encode.

    Returns:
        Whatever the tokeniser returns for those texts.
    """
    return tokeniser(
        batch['text'],
        padding=True,
        truncation=True,
        max_length=100,
    )

# Tokenise both splits. Per the datasets docs, batch_size=None hands each split
# to tokenize_function as a single batch, so padding=True pads to the longest
# sequence of that whole split (capped at max_length=100).
tokenised_dataset = dataset.map(tokenize_function, batched=True, batch_size=None)

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

***Model Evaluation Function*** (from Hugging Face)

In [17]:
import evaluate
import numpy as np

# Load the 'accuracy' metric from the evaluate library; compute_metrics below calls it.
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Score a batch of model outputs with the notebook-level accuracy metric.

    Args:
        eval_pred: a pair that unpacks into (per-class scores, reference labels).

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row.
    """
    scores, references = eval_pred
    # Pick the highest-scoring class index along axis 1 for every example.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained checkpoint with a classification head sized to the label
# set, and attach the id<->name mappings to the model configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)
# Move the weights onto the device selected earlier (GPU when available).
model = model.to(device)

In [20]:
# Fine-tuning configuration: five epochs, batches of 32 per device for both
# training and evaluation, with an evaluation pass at the end of every epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` and Trainer's `tokenizer` to `processing_class` — confirm
# against the installed version before upgrading.
batch_size = 32
args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    learning_rate=1e-5,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

# The 'test' split doubles as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    tokenizer=tokeniser,
    train_dataset=tokenised_dataset['train'],
    eval_dataset=tokenised_dataset['test'],
)




In [21]:
# Run fine-tuning; as the cell's last expression, the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.494748,0.786605
2,No log,0.475881,0.791202
3,0.506900,0.452591,0.805647
4,0.506900,0.441476,0.807617
5,0.506900,0.439691,0.807617


TrainOutput(global_step=955, training_loss=0.4653286918919748, metrics={'train_runtime': 484.5142, 'train_samples_per_second': 62.846, 'train_steps_per_second': 1.971, 'total_flos': 71633368245600.0, 'train_loss': 0.4653286918919748, 'epoch': 5.0})

In [22]:
# Score the fine-tuned model on the Trainer's eval_dataset (the 'test' split)
# and display the resulting metrics dict.
trainer.evaluate()

{'eval_loss': 0.4396907687187195,
 'eval_accuracy': 0.8076165462902167,
 'eval_runtime': 4.0213,
 'eval_samples_per_second': 378.733,
 'eval_steps_per_second': 11.936,
 'epoch': 5.0}

***Model Save and Load for Inference***

In [23]:
# Persist the fine-tuned model to a local directory; the same directory is
# uploaded to S3 further down.
trainer.save_model('tinybert_tweet_disaster_model')

***Upload Model to AWS S3***

In [24]:
import boto3

# Create an S3 client. No credentials are passed here, so boto3 has to resolve
# them from its own configuration — presumably the environment or ~/.aws; verify.
s3 = boto3.client('s3')

# Destination bucket for the uploads below.
# NOTE(review): the name says "imdb" although this notebook trains a tweet
# disaster classifier — confirm this is the intended bucket.
bucket_name = "tinybert-imdb-model-bucket-7429"

import os

def upload_directory(directory_path, s3_prefix, bucket_name=bucket_name):
    """Upload every file under ``directory_path`` to S3 beneath ``s3_prefix``.

    The directory tree is walked recursively. Each file's key is ``s3_prefix``
    joined with the file's path relative to ``directory_path``, always using
    forward slashes. One line is printed per uploaded file.

    Args:
        directory_path: local directory whose contents are uploaded.
        s3_prefix: key prefix under which the files are stored.
        bucket_name: destination bucket. The default is the value the
            module-level ``bucket_name`` had when this function was defined;
            reassigning that variable later does not change the default.

    Raises:
        NotADirectoryError: if ``directory_path`` is not an existing directory.
    """
    # os.walk() ignores listing errors by default, so a missing or mistyped
    # directory would otherwise "succeed" without uploading anything.
    if not os.path.isdir(directory_path):
        raise NotADirectoryError(f"Not a directory: '{directory_path}'")

    for root, _, files in os.walk(directory_path):
        for file in files:
            # Normalise Windows separators so the printed path and the key
            # both use forward slashes.
            file_path = os.path.join(root, file).replace('\\', '/')
            relpath = os.path.relpath(file_path, directory_path)
            s3_key = os.path.join(s3_prefix, relpath).replace('\\', '/')
            s3.upload_file(file_path, bucket_name, s3_key)
            print(f"Uploaded '{file_path}' to '{s3_key}'")

# Mirror the saved model directory into the bucket under models/<same directory name>.
upload_directory('tinybert_tweet_disaster_model', 'models/tinybert_tweet_disaster_model')

Uploaded 'tinybert_tweet_disaster_model/config.json' to 'models/tinybert_tweet_disaster_model/config.json'
Uploaded 'tinybert_tweet_disaster_model/model.safetensors' to 'models/tinybert_tweet_disaster_model/model.safetensors'
Uploaded 'tinybert_tweet_disaster_model/special_tokens_map.json' to 'models/tinybert_tweet_disaster_model/special_tokens_map.json'
Uploaded 'tinybert_tweet_disaster_model/tokenizer.json' to 'models/tinybert_tweet_disaster_model/tokenizer.json'
Uploaded 'tinybert_tweet_disaster_model/tokenizer_config.json' to 'models/tinybert_tweet_disaster_model/tokenizer_config.json'
Uploaded 'tinybert_tweet_disaster_model/training_args.bin' to 'models/tinybert_tweet_disaster_model/training_args.bin'
Uploaded 'tinybert_tweet_disaster_model/vocab.txt' to 'models/tinybert_tweet_disaster_model/vocab.txt'
