<a href="https://colab.research.google.com/github/GrizzlyToast/ML_Practise/blob/main/BERT_tokenizer_TwitterSentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import userdata
import os

# Copy the Kaggle credentials from Colab's secret store into the process
# environment; each secret is exported under its own name.
for secret_name in ("KAGGLE_KEY", "KAGGLE_USERNAME"):
    os.environ[secret_name] = userdata.get(secret_name)

In [None]:
# Fetch the Sentiment140 archive with the Kaggle CLI.
! kaggle datasets download -d kazanova/sentiment140
# Extract the archive quietly (-qq).
! unzip -qq sentiment140.zip
# Rename the extracted CSV to the shorter name the loading cell reads.
! mv training.1600000.processed.noemoticon.csv train.csv

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 91% 74.0M/80.9M [00:00<00:00, 331MB/s]
100% 80.9M/80.9M [00:00<00:00, 335MB/s]


# Importing Dependencies

In [None]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

In [90]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers

from transformers import BertTokenizerFast, BertModel, DataCollatorWithPadding
from datasets import Dataset

# Data Preprocessing

In [None]:
# The CSV is read without a header row (header=None), so the column names
# are supplied explicitly, in file order.
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv(
    "train.csv",
    names=cols,
    header=None,
    encoding="latin1",
    engine="python",
)

In [None]:
# Keep only the label and the tweet text. Reassigning (rather than
# inplace=True) together with errors="ignore" makes this cell safe to re-run:
# the original in-place drop raised KeyError once the columns were gone.
data = data.drop(columns=["id", "date", "query", "user"], errors="ignore")

In [None]:
# Preview the first five rows (head() defaults to n=5).
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [None]:
def clean_tweet(tweet):
    """Clean a raw tweet by removing markup, mentions, URLs and special characters.

    Steps, in order:
      1. Parse the text with BeautifulSoup (lxml parser) and keep only the text.
      2. Replace @mentions with a space.
      3. Replace http/https URLs with a space.
      4. Replace every character other than ASCII letters and . ! ? ' with a space.
      5. Collapse runs of spaces into a single space.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned tweet. A single leading or trailing space left behind by
        the substitutions is not stripped.
    """
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Handles may contain underscores; without "_" here, "@foo_bar" would
    # leave "_bar" behind in the text.
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Character ranges are spelled "A-Za-z": the range "A-z" also covers the
    # ASCII characters between "Z" and "a" ([ \ ] ^ _ `). "_" is listed
    # explicitly so URLs containing underscores are still removed whole.
    tweet = re.sub(r"https?://[A-Za-z0-9./_]+", ' ', tweet)
    # Keep only letters and basic sentence punctuation.
    tweet = re.sub(r"[^A-Za-z.!?']", ' ', tweet)
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [None]:
# Run every tweet in the text column through clean_tweet, preserving row order.
data_clean = list(map(clean_tweet, data.text))

In [84]:
# Replace the raw tweets with their cleaned versions.
data['text'] = data_clean
# Remap the sentiment value 4 to 1 so the labels are 0/1.
# NOTE(review): 4 is presumably the "positive" class; confirm against the dataset docs.
# The update goes through .loc instead of writing into `.values`: the array
# returned by `.values` is only a writable view of the frame in some pandas
# configurations, and is read-only under Copy-on-Write.
data.loc[data['sentiment'] == 4, 'sentiment'] = 1
data_labels = data['sentiment'].values
data.head(5)

Unnamed: 0,labels,text
0,0,Awww that's a bummer. You shoulda got David C...
1,0,is upset that he can't update his Facebook by ...
2,0,I dived many times for the ball. Managed to s...
3,0,my whole body feels itchy and like its on fire
4,0,no it's not behaving at all. i'm mad. why am ...


In [85]:
# Rename the target column to 'labels', then wrap the frame in a
# Hugging Face Dataset and print its summary.
data = data.rename({'sentiment': 'labels'}, axis="columns")
dataset = Dataset.from_pandas(data)
print(dataset)

Dataset({
    features: ['labels', 'text'],
    num_rows: 1600000
})


# Tokenization

In [92]:
# Single checkpoint name shared by the model and the tokenizer.
bert_checkpoint = 'bert-base-uncased'

model = BertModel.from_pretrained(bert_checkpoint)
tokenizer = BertTokenizerFast.from_pretrained(bert_checkpoint)
# The collator pads each batch with this tokenizer and returns TensorFlow tensors.
# NOTE(review): BertModel looks like the PyTorch class while the collator is
# configured for TF; confirm which framework the model is meant to use.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Show the model configuration and the tokenizer's init arguments.
print(model.config)
print(tokenizer.init_kwargs)

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dtype": "float32",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.57.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

{'do_lower_case': True, 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'tokenize_chinese_chars': True, 'strip_accents': None, 'model_max_length': 512, 'name_or_path': 'bert-base-uncased'}


In [None]:
# Tokenizer Example
# `sentence` was never defined in the notebook, so this cell failed with a
# NameError on a fresh kernel. Use the first cleaned tweet as the sample.
sentence = data["text"].iloc[0]
# Tokenize once and reuse the result for both printouts.
encoded = tokenizer(sentence)
print(sentence)
print(encoded)
# Map the ids back to word pieces to show how the sentence was split.
print(tokenizer.convert_ids_to_tokens(encoded['input_ids']))

 Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D
{'input_ids': [101, 22091, 2860, 2860, 2008, 1005, 1055, 1037, 26352, 5017, 1012, 2017, 2323, 2050, 2288, 2585, 12385, 1997, 2353, 2154, 2000, 2079, 2009, 1012, 1040, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'aw', '##w', '##w', 'that', "'", 's', 'a', 'bum', '##mer', '.', 'you', 'should', '##a', 'got', 'david', 'carr', 'of', 'third', 'day', 'to', 'do', 'it', '.', 'd', '[SEP]']


In [93]:
def tokenize_batch(batch):
    """Tokenize the 'text' column of a batch, truncating over-long sequences."""
    return tokenizer(batch['text'], truncation=True)


# batched=True hands the function many rows per call instead of one.
dataset = dataset.map(tokenize_batch, batched=True)

Map:   0%|          | 0/1600000 [00:00<?, ? examples/s]

In [94]:
# Build a shuffled tf.data pipeline of 32-example batches; the collator pads
# each batch as it is assembled.
# NOTE(review): only input_ids and labels are forwarded; the tokenizer also
# produces attention_mask. Confirm the downstream model does not need it.
all_batched = dataset.to_tf_dataset(
    columns=["input_ids", "labels"],
    batch_size=32,
    shuffle=True,
    collate_fn=data_collator,
)

In [95]:
# Pull one batch from the pipeline to inspect its structure.
first_batch = next(iter(all_batched))
first_batch

{'labels': <tf.Tensor: shape=(32,), dtype=int64, numpy=
 array([1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 0, 1, 1, 1])>,
 'input_ids': <tf.Tensor: shape=(32, 37), dtype=int64, numpy=
 array([[  101,  1998,  3393, ...,     0,     0,     0],
        [  101,  1045,  3246, ...,     0,     0,     0],
        [  101, 14916,  2666, ...,     0,     0,     0],
        ...,
        [  101,  6343,  2323, ...,     0,     0,     0],
        [  101,  2204,  2851, ...,     0,     0,     0],
        [  101, 10166,  2074, ...,     0,     0,     0]])>}