In [None]:
import datasets
from datasets import load_dataset_builder, load_dataset
import os

### Step 1: Load the dataset
- A small sentiment-analysis dataset (55 training examples, each labelled `positive` or `negative`).

In [19]:
# Load the sentiment-analysis dataset; later cells read its 'train' split
# through the 'text' and 'label' columns.
sentiment_dataset = load_dataset("javalove93/sentiment-analysis-dataset")

Generating train split: 100%|██████████| 55/55 [00:00<00:00, 2053.01 examples/s]


In [40]:
import re
# First training example, used to try out the sentence-splitting regex.
text = sentiment_dataset['train']['text'][0]

# Split by both '.' and '!'
sentences = re.split(r'[.!]', text)
# NOTE(review): this slice is not the last expression of the cell, so it is
# evaluated but never displayed.
sentences[:-1]

# Last expression of the cell: displays the label of every training example.
sentiment_dataset['train']['label']

['positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'negative',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative']

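**Matching label strategy**
- Split each sequence into single sentences.
- Regroup the sentences that share the same label and pair each one with the next: the goal is to capture the relationship between sentences of the same sentiment.
- There are two sentiments here: positive and negative.
- The pairs are stored as a list of tuples (a list of two-element lists would work as well).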

In [None]:
from typing import Tuple # for cool decorations of functions
MAX_LEN = 64  # default maximum number of characters kept per sentence

# Splitting initial sequences
def create_sequences(dataset: "datasets.DatasetDict", max_len: int = MAX_LEN) -> Tuple[list, list]:
    """Split every training text into sentences and group them by sentiment.

    Each text of the 'train' split is cut after every '.' or '!' (the
    punctuation mark stays attached to its sentence, and whitespace that
    follows a separator is left at the start of the next sentence). Every
    sentence is then truncated to `max_len` characters.

    Args:
        dataset: Mapping with a 'train' entry exposing parallel 'text' and
            'label' columns. The annotation is quoted so that it is not
            evaluated when the function is defined.
        max_len: Maximum number of characters kept per sentence.

    Returns:
        A `(seq_pos, seq_neg)` tuple: the sentences taken from texts labelled
        'positive', and the sentences taken from all other texts, both in
        dataset order.
    """
    train_split = dataset['train']
    seq_pos = []
    seq_neg = []
    for text, label in zip(train_split['text'], train_split['label']):
        # The capturing group keeps the separators, so `parts` alternates
        # fragment, separator, fragment, ... and always ends with whatever
        # follows the last separator (an empty string when the text ends
        # with punctuation).
        parts = re.split(r'([.!])', text)
        sentences = [body + sep for body, sep in zip(parts[0::2], parts[1::2])]

        # Keep a trailing sentence that has no closing punctuation instead of
        # dropping it; a blank tail carries no content and is skipped.
        tail = parts[-1]
        if tail.strip():
            sentences.append(tail)

        # Truncation applies to every sentence of every text.
        sentences = [sentence[:max_len] for sentence in sentences]

        if label == 'positive':
            seq_pos.extend(sentences)
        else:
            seq_neg.extend(sentences)
    return seq_pos, seq_neg

# Sentences of the whole training split, grouped by sentiment label.
seq_pos, seq_neg = create_sequences(sentiment_dataset)

# Generating pairs
def generate_pairs(sequences: list) -> list:
    """Pair every item of `sequences` with the item that follows it.

    Returns a list of `(current, next)` tuples in order; it is empty when
    `sequences` holds fewer than two items.
    """
    followers = sequences[1:]
    # zip stops at the shorter input, so the last item never starts a pair.
    return list(zip(sequences, followers))

# Consecutive-sentence pairs, built separately for each sentiment so that a
# pair never mixes a positive and a negative sentence.
pairs_pos = generate_pairs(seq_pos)
pairs_neg = generate_pairs(seq_neg)

# Merged views over both sentiments: positive entries first, then negative.
sentences = seq_pos + seq_neg
pairs = pairs_pos + pairs_neg
# Last expression of the cell: displays the full list of pairs.
pairs

[('I love this movie!', " It's amazing."),
 (" It's amazing.", 'What a great experience!'),
 ('What a great experience!', ' Highly recommended.'),
 (' Highly recommended.', "This is the best book I've ever read."),
 ("This is the best book I've ever read.", "I'm so happy with my purchase!"),
 ("I'm so happy with my purchase!", 'I had a fantastic time!'),
 ('I had a fantastic time!', 'Absolutely loved it!'),
 ('Absolutely loved it!', 'This is incredible!'),
 ('This is incredible!', "I'm very impressed with the performance."),
 ("I'm very impressed with the performance.", "I can't wait to try it again!"),
 ("I can't wait to try it again!", 'Excellent service and friendly staff.'),
 ('Excellent service and friendly staff.',
  'Highly satisfied with the results.'),
 ('Highly satisfied with the results.', 'This is a must-see!'),
 ('This is a must-see!', 'It was a wonderful evening.'),
 ('It was a wonderful evening.', 'I highly recommend this service.'),
 ('I highly recommend this service.',

### Step 2: Tokenization
- Train a WordPiece tokenizer on the extracted sentences to produce BERT inputs.

In [None]:
import tqdm
from tokenizers import BertWordPieceTokenizer
from pathlib import Path
from transformers import BertTokenizer
# Creating batches
batch_size = 30


def create_batches(batch_size : int,sentences : list, output_dir : str = './data'):
    """Write `sentences` to numbered text files of at most `batch_size` lines.

    Files are named `text_0.txt`, `text_1.txt`, ... inside `output_dir`; each
    one holds one sentence per line, encoded as UTF-8.

    Args:
        batch_size: Number of sentences written to each file.
        sentences: Sentences to write, in order.
        output_dir: Directory receiving the files. It is created when missing.
    """
    target_dir = Path(output_dir)
    # A fresh checkout has no data directory yet, and open() does not create
    # missing parent directories.
    target_dir.mkdir(parents=True, exist_ok=True)

    def _write_batch(lines: list, index: int) -> None:
        # One sentence per line, no trailing newline.
        with open(target_dir / f'text_{index}.txt', 'w', encoding='utf-8') as file:
            file.write('\n'.join(lines))

    text_data = []
    file_count = 0
    for sentence in tqdm.tqdm(sentences):
        text_data.append(sentence)

        if len(text_data) == batch_size:
            _write_batch(text_data, file_count)
            text_data = []
            file_count += 1

    # Flush the remainder only when there is one: an unconditional write would
    # leave an empty file whenever the sentence count is a multiple of
    # batch_size (or when there are no sentences at all).
    if text_data:
        _write_batch(text_data, file_count)

create_batches(batch_size,sentences)

# Every .txt file found under ./data is used as tokenizer training input.
# NOTE(review): this also picks up files left by earlier runs — confirm that
# ./data only contains the batches written above.
paths = [str(x) for x in Path('./data').glob('**/*.txt')]

# Training the tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True
)

tokenizer.train( 
    files=paths,
    vocab_size=30_000, 
    min_frequency=5,
    limit_alphabet=1000, 
    wordpieces_prefix='##',
    special_tokens=['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]']
    )

# save_model() expects the target directory to exist. makedirs with
# exist_ok=True creates it on a fresh run and is a no-op on later runs, so the
# cell survives Restart & Run All.
os.makedirs('./bert-it-1', exist_ok=True)
tokenizer.save_model('./bert-it-1', 'bert-it')
# Reload the trained vocabulary as a transformers tokenizer.
tokenizer = BertTokenizer.from_pretrained('./bert-it-1/bert-it-vocab.txt', local_files_only=True)

#### TODO : review tokenizer

100%|██████████| 59/59 [00:00<00:00, 36634.19it/s]







