In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import re
import spacy

from collections import Counter, defaultdict
from transformers import AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models import Word2Vec

from sklearn.manifold import TSNE

from tqdm import tqdm

# Load the spaCy pipeline once at module level; preprocess() below reads this global.
# One-time setup if the model is not installed yet (uncomment and run once):
# spacy.cli.download("en_core_web_lg")
nlp = spacy.load('en_core_web_lg')

### Basic Preprocessing

In [2]:
def read_csv(filename):
    """Load a comma-separated file and keep only its last two columns.

    The first column of the file is consumed as the index while reading and
    is then discarded, so the returned frame has a fresh 0..n-1 index.

    Args:
        filename: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        pd.DataFrame: The last two columns of the file.
    """
    frame = pd.read_csv(filename, sep=',', encoding='utf-8', index_col=0)
    last_two_columns = frame.iloc[:, -2:]
    return last_two_columns.reset_index(drop=True)


def preprocess(text):
    """Clean a raw text string and return its lemmatized, stop-word-free form.

    Steps: drop URLs, replace every non-letter with a space, lowercase, drop
    one-letter words, collapse whitespace, then run the module-level spaCy
    pipeline ``nlp`` and keep the lemma of every token that is neither a stop
    word nor pure whitespace.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: Space-joined lemmas (empty string when nothing survives).
    """
    # remove URLs (raw strings avoid invalid-escape warnings for \S and \s)
    text = re.sub(r'http\S*', ' ', text)

    # remove non-alphabetic
    text = re.sub(r"[^a-zA-Z]", " ", text)

    # make lowercase
    text = text.lower()

    # remove one character words; word boundaries also catch a lone letter at
    # the start/end of the text and runs of consecutive single letters, which a
    # whitespace-consuming pattern misses
    text = re.sub(r"\b[a-z]\b", " ", text)

    # collapse whitespace runs and trim, so spaCy receives no leading or
    # trailing blanks
    text = re.sub(r"\s+", " ", text).strip()

    # tokenize, lemmatize, remove stop words; whitespace tokens are skipped so
    # they cannot leak into the result as stray spaces
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc if not (token.is_stop or token.is_space)]
    return " ".join(lemmas)

In [7]:
# Load the training tweets, then append a cleaned version of each tweet's text
# as a new 'clean_text' column.
tweets_train = read_csv('train.csv')
tweets_train = tweets_train.assign(clean_text=tweets_train['text'].apply(preprocess))


In [10]:
# Sanity check: show the first rows, including the derived 'clean_text' column.
tweets_train.head()


Unnamed: 0,text,target,clean_text
0,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake allah forgive
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...
3,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order cal...
4,Just got sent this photo from Ruby #Alaska as ...,1,get send photo ruby alaska smoke wildfire pour...


In [12]:
class BPE():
    """Byte-Pair Encoding: Subword-based tokenization algorithm.

    Words come from the pre-tokenizer of a pretrained tokenizer and are split
    into single characters. Training repeatedly merges the most frequent
    adjacent symbol pair until the vocabulary reaches ``vocab_size`` or no pair
    is left to merge. ``tokenize`` replays the learned merges in the order they
    were learned.
    """

    def __init__(self, corpus, vocab_size):
        """Initialize BPE tokenizer.

        Args:
            corpus: Iterable of text strings used to learn the merges.
            vocab_size (int): Target vocabulary size; the vocabulary holds the
                "</w>" entry, the base characters and one entry per merge.
        """
        self.corpus = corpus
        self.vocab_size = vocab_size
        # Only the pre-tokenizer of this pretrained tokenizer is used (to split
        # text into words); its own vocabulary plays no role here.
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self._reset_state()

    def _reset_state(self):
        """Clear everything learned so far so training starts from scratch."""
        self.word_freqs = defaultdict(int)
        self.splits = {}
        self.merges = {}
        self.alphabet = []
        self.vocab = []

    def train(self):
        """Train BPE tokenizer.

        Any previously learned state is discarded first, so calling this method
        repeatedly does not double-count word frequencies.

        Returns:
            dict: Mapping ``(left, right) -> merged symbol`` in learning order.
        """
        self._reset_state()
        self._compute_word_frequencies()
        self._compute_base_alphabet()
        self._initialize_vocab()

        while len(self.vocab) < self.vocab_size:
            pair_freqs = self._compute_pair_freqs()
            if not pair_freqs:
                # Every word is already a single symbol: nothing is left to
                # merge, so stop instead of calling max() on an empty dict.
                break
            best_pair = max(pair_freqs, key=pair_freqs.get)
            merged_symbol = best_pair[0] + best_pair[1]
            self._merge_pair(*best_pair)
            self.merges[best_pair] = merged_symbol
            self.vocab.append(merged_symbol)

        return self.merges

    def _pre_tokenize(self, text):
        """Split ``text`` into words with the pretrained pre-tokenizer (offsets dropped)."""
        words_with_offsets = self.tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
        return [word for word, _offset in words_with_offsets]

    def _compute_word_frequencies(self):
        """Compute the frequencies of each word in the corpus."""
        for text in self.corpus:
            for word in self._pre_tokenize(text):
                self.word_freqs[word] += 1

    def _compute_base_alphabet(self):
        """Compute the base vocabulary of all characters in the corpus."""
        alphabet = set()
        for word in self.word_freqs:
            alphabet.update(word)
        self.alphabet = sorted(alphabet)

    def _initialize_vocab(self):
        """Initialize vocabulary with base characters and special token."""
        # "</w>" only occupies a vocabulary slot; no method of this class ever
        # emits it as a token.
        self.vocab = ["</w>"] + self.alphabet.copy()
        self.splits = {word: list(word) for word in self.word_freqs}

    def _compute_pair_freqs(self):
        """Compute the frequency of each pair.

        Returns:
            defaultdict: ``(left, right) -> summed frequency`` over all words,
            weighted by how often each word occurs in the corpus.
        """
        pair_freqs = defaultdict(int)
        for word, freq in self.word_freqs.items():
            symbols = self.splits[word]
            # zip over a one-symbol word yields nothing, so such words are
            # skipped implicitly.
            for pair in zip(symbols, symbols[1:]):
                pair_freqs[pair] += freq
        return pair_freqs

    @staticmethod
    def _apply_merge(symbols, a, b):
        """Return a copy of ``symbols`` with each adjacent ``a, b`` replaced by ``a + b``.

        Matches are resolved left to right and do not overlap.
        """
        merged = []
        i = 0
        n = len(symbols)
        while i < n:
            if i + 1 < n and symbols[i] == a and symbols[i + 1] == b:
                merged.append(a + b)
                i += 2
            else:
                merged.append(symbols[i])
                i += 1
        return merged

    def _merge_pair(self, a, b):
        """Merge the given pair."""
        for word in self.word_freqs:
            symbols = self.splits[word]
            if len(symbols) > 1:
                self.splits[word] = self._apply_merge(symbols, a, b)

    def tokenize(self, text):
        """Tokenize a given text with trained BPE tokenizer.

        Args:
            text (str): Text to split into subword tokens.

        Returns:
            list[str]: Subword tokens of all words, in order. Before training
            (no merges learned) every word is split into single characters.
        """
        splits_per_word = [list(word) for word in self._pre_tokenize(text)]

        # Replay merges in the order they were learned (dict insertion order).
        for a, b in self.merges:
            splits_per_word = [self._apply_merge(symbols, a, b) for symbols in splits_per_word]

        return [token for symbols in splits_per_word for token in symbols]

In [13]:
# Instantiate BPE with the preprocessed text data; vocab_size caps the learned
# vocabulary and can be adjusted.
bpe_tokenizer = BPE(tweets_train['clean_text'], vocab_size=3000)

# Train the BPE tokenizer
bpe_tokenizer.train()

# Apply the trained BPE tokenizer to every cleaned tweet
tweets_train['tokenized_text'] = tweets_train['clean_text'].apply(bpe_tokenizer.tokenize)

# Display the original and tokenized text for the first rows only: printing
# every row of the frame floods the notebook output.
for idx, row in tweets_train[['clean_text', 'tokenized_text']].head(10).iterrows():
    print(f"Original Text: {row['clean_text']}")
    print(f"Tokenized Text: {row['tokenized_text']}")
    print("\n" + "="*50 + "\n")  # Just for better separation in output


Original Text: deed reason earthquake allah forgive
Tokenized Text: ['de', 'ed', 'reason', 'earthquake', 'allah', 'for', 'give']


Original Text: forest fire near la ronge sask canada
Tokenized Text: ['forest', 'fire', 'near', 'la', 'ron', 'ge', 's', 'ask', 'canada']


Original Text: resident ask shelter place notify officer evacuation shelter place order expect
Tokenized Text: ['resident', 'ask', 'shel', 'ter', 'place', 'not', 'ify', 'officer', 'evacuation', 'shel', 'ter', 'place', 'order', 'expect']


Original Text:   people receive wildfire evacuation order california
Tokenized Text: ['people', 're', 'ce', 'ive', 'wildfire', 'evacuation', 'order', 'california']


Original Text: get send photo ruby alaska smoke wildfire pour school
Tokenized Text: ['get', 'send', 'photo', 'ru', 'by', 'alaska', 'smoke', 'wildfire', 'po', 'ur', 'school']


Original Text:   rockyfire update california hwy close direction lake county fire cafire wildfire
Tokenized Text: ['rocky', 'fire', 'update', 'calif

In [16]:
# Display the frame to inspect the 'tokenized_text' column next to 'clean_text'.
tweets_train

Unnamed: 0,text,target,clean_text,tokenized_text
0,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake allah forgive,"[de, ed, reason, earthquake, allah, for, give]"
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[forest, fire, near, la, ron, ge, s, ask, canada]"
2,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...,"[resident, ask, shel, ter, place, not, ify, of..."
3,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order cal...,"[people, re, ce, ive, wildfire, evacuation, or..."
4,Just got sent this photo from Ruby #Alaska as ...,1,get send photo ruby alaska smoke wildfire pour...,"[get, send, photo, ru, by, alaska, smoke, wild..."
...,...,...,...,...
7608,Two giant cranes holding a bridge collapse int...,1,giant crane hold bridge collapse nearby home,"[giant, crane, hold, bridge, collapse, nearby,..."
7609,@aria_ahrary @TheTawniest The out of control w...,1,aria ahrary thetawniest control wild fire ca...,"[ar, ia, ah, r, ary, the, ta, w, ni, est, cont..."
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,utc km volcano hawaii,"[utc, km, volcano, hawaii]"
7611,Police investigating after an e-bike collided ...,1,police investigate bike collide car little por...,"[police, investigate, bi, ke, collide, car, li..."


In [14]:
def get_tfidf_matrix(df, vectorizer):
    """Turn a TF-IDF matrix into a dense DataFrame with one column per feature.

    Args:
        df: Object exposing ``todense()``. Despite the name it is used as a
            matrix here, presumably the sparse output of the vectorizer
            (confirm against the caller).
        vectorizer: Object exposing ``get_feature_names_out()``; its result
            provides the column labels.

    Returns:
        pd.DataFrame: Dense values labelled with the feature names.
    """
    # Column labels come from the vectorizer that produced the matrix
    feature_names = vectorizer.get_feature_names_out()

    # Densify, then wrap in a labelled DataFrame
    dense_values = df.todense()
    return pd.DataFrame(dense_values, columns=feature_names)


# The sequences are in the format ['word1', 'word2', 'word3', ...]; map them to embedding indices
def string2embedding_idx(text_sequence, model):
    """Map tokens to their indices in the model's embedding vocabulary.

    Tokens missing from ``model.wv.key_to_index`` are skipped, so the result
    may be shorter than the input. Only out-of-vocabulary tokens are ignored;
    any other problem (for example a model without ``wv``) raises instead of
    being silently swallowed.

    Args:
        text_sequence: Iterable of string tokens.
        model: Object exposing ``wv.key_to_index`` (token -> index mapping).

    Returns:
        list: Indices of the known tokens, in input order.
    """
    key_to_index = model.wv.key_to_index
    return [key_to_index[token] for token in text_sequence if token in key_to_index]

In [15]:
class TweetsDataset(torch.utils.data.Dataset):
    """Dataset yielding ``(sequence, label)`` tensor pairs from a DataFrame.

    Columns are addressed by position: the label is read from column 1 and the
    sequence from the last column when ``word2vec_model == 'skipgram'``,
    otherwise from the second-to-last column. Presumably those two trailing
    columns hold integer index sequences (confirm against the frame that is
    passed in).
    """

    def __init__(self, df, word2vec_model):
        """Store the source frame and the name selecting the sequence column."""
        self.df = df
        self.word2vec_model = word2vec_model

    def __len__(self):
        """Return the number of rows in the frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``idx``-th row as ``(long sequence tensor, float32 label tensor of shape (1,))``."""
        # Pick the positional column that holds the sequence for this model
        if self.word2vec_model == 'skipgram':
            sequence_col = -1
        else:
            sequence_col = -2

        token_ids = self.df.iloc[idx, sequence_col]
        target = self.df.iloc[idx, 1]

        # Sequence becomes a tensor of dtype long
        features = torch.tensor(token_ids, dtype=torch.long)

        # Label becomes a float32 tensor with a leading dimension of size 1
        label = torch.unsqueeze(torch.tensor(target, dtype=torch.float32), 0)

        return features, label