In [31]:
import pandas as pd
import numpy as np
from tqdm.autonotebook import tqdm

In [32]:
# Replace 'training.csv' with the location of your training file
# Read in the dataset into a Pandas DataFrame
label_text = pd.read_csv('../data/sentiment_training_data/training.1600000.processed.noemoticon.csv',
                   sep=',',
                   header=None,
                   encoding='latin')

label_text = label_text[[0, 5]]

# Assign proper column names to labels
label_text.columns = ['label', 'text']

# Convert labels to range 0-1                                        
label_text['label'] = [0 if x==0 else 1 for x in label_text['label']]

# Assign proper column names to labels
label_text.head()

Unnamed: 0,label,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [33]:
import re

hashtags = re.compile(r"^#\S+|\s#\S+")
mentions = re.compile(r"^@\S+|\s@\S+")
urls = re.compile(r"https?://\S+")

def process_text(text):
    text = hashtags.sub(' hashtag', text)
    text = mentions.sub(' entity', text)
    return text.strip().lower()
  
def match_expr(pattern, string):
    return not pattern.search(string) == None

def get_data_wo_urls(dataset):
    link_with_urls = dataset.text.apply(lambda x: match_expr(urls, x))
    return dataset[[not e for e in link_with_urls]]

In [68]:
label_text['text'] = [process_text(elem) for elem in tqdm(label_text['text'])]

HBox(children=(IntProgress(value=0, max=1600000), HTML(value='')))




In [69]:
label_text

Unnamed: 0,label,text
0,0,"entity http://twitpic.com/2y1zl - awww, that's..."
1,0,is upset that he can't update his facebook by ...
2,0,entity i dived many times for the ball. manage...
3,0,my whole body feels itchy and like its on fire
4,0,"entity no, it's not behaving at all. i'm mad. ..."
...,...,...
1599995,1,just woke up. having no school is the best fee...
1599996,1,thewdb.com - very cool to hear old walt interv...
1599997,1,are you ready for your mojo makeover? ask me f...
1599998,1,happy 38th birthday to my boo of alll time!!! ...


In [34]:
from sklearn.model_selection import train_test_split
TRAIN_SIZE = 0.75
VAL_SIZE = 0.05
dataset_count = len(label_text)

df_train_val, df_test = train_test_split(label_text, test_size=1-TRAIN_SIZE-VAL_SIZE, random_state=42)
df_train, df_val = train_test_split(df_train_val, test_size=VAL_SIZE / (VAL_SIZE + TRAIN_SIZE), random_state=42)

print("TRAIN size:", len(df_train))
print("VAL size:", len(df_val))
print("TEST size:", len(df_test))

TRAIN size: 1200000
VAL size: 80000
TEST size: 320000


In [35]:
# df_train = get_data_wo_urls(df_train)
# df_train.head()

In [36]:
df_train = df_train.reset_index(drop=True)

In [37]:
df_train['id'] = df_train.index

In [38]:
df_train['alpha'] = 'a'

In [39]:
df_train = df_train[['id', 'label', 'alpha', 'text']]

In [40]:
df_train.sample(frac=1.0).reset_index(drop=True).to_csv('../data/sentiment_training_data/train.tsv', sep='\t', index=None, header=None)
df_val.to_csv('../data/sentiment_training_data/dev.tsv', sep='\t', index=None, header=None)
df_test.to_csv('../data/sentiment_training_data/test.tsv', sep='\t', index=None, header=None)

In [23]:
from __future__ import absolute_import, division, print_function

import csv
import os
import sys
import logging

logger = logging.getLogger()
csv.field_size_limit(2147483647) # Increase CSV reader's field limit incase we have long text.


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.
        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r", encoding="utf-8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                if sys.version_info[0] == 2:
                    line = list(unicode(cell, 'utf-8') for cell in line)
                lines.append(line)
            return lines


class BinaryClassificationProcessor(DataProcessor):
    """Processor for binary classification dataset."""

    def get_train_examples(self, data_dir):
        """See base class."""
        df = pd.read_csv(os.path.join(data_dir, "train.tsv"), sep='\t')
#         return self._create_examples(
#             self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line[3]
            label = line[1]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples

In [55]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


def convert_example_to_feature(example_row):
    # return example_row
    example, label_map, max_seq_length, tokenizer, output_mode = example_row

    tokens_a = tokenizer.tokenize(example.text_a)

    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[:(max_seq_length - 2)]

    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_ids = [0] * len(tokens)

    if tokens_b:
        tokens += tokens_b + ["[SEP]"]
        segment_ids += [1] * (len(tokens_b) + 1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    if output_mode == "classification":
        label_id = label_map[example.label]
    elif output_mode == "regression":
        label_id = float(example.label)
    else:
        raise KeyError(output_mode)

    return InputFeatures(input_ids=input_ids,
                         input_mask=input_mask,
                         segment_ids=segment_ids,
                         label_id=label_id)

In [4]:
import torch
import pickle
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.nn import CrossEntropyLoss, MSELoss

from tqdm import tqdm_notebook, trange
import os
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

from multiprocessing import Pool, cpu_count
from tools import *

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
device

device(type='cpu')

In [10]:
# The input data dir. Should contain the .tsv files (or other data files) for the task.
DATA_DIR = "../data/sentiment_training_data/"

# Bert pre-trained model selected in the list: bert-base-uncased, 
# bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased,
# bert-base-multilingual-cased, bert-base-chinese.
BERT_MODEL = 'bert-base-cased'

# The name of the task to train.I'm going to name this 'yelp'.
TASK_NAME = 'yelp'

# The output directory where the fine-tuned model and checkpoints will be written.
OUTPUT_DIR = f'outputs/{TASK_NAME}/'

# The directory where the evaluation reports will be written to.
REPORTS_DIR = f'reports/{TASK_NAME}_evaluation_report/'

# This is where BERT will look for pre-trained models to load parameters from.
CACHE_DIR = 'cache/'

# The maximum total input sequence length after WordPiece tokenization.
# Sequences longer than this will be truncated, and sequences shorter than this will be padded.
MAX_SEQ_LENGTH = 128

TRAIN_BATCH_SIZE = 24
EVAL_BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1
RANDOM_SEED = 42
GRADIENT_ACCUMULATION_STEPS = 1
WARMUP_PROPORTION = 0.1
OUTPUT_MODE = 'classification'

CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"

In [11]:
output_mode = OUTPUT_MODE

cache_dir = CACHE_DIR

In [12]:
if os.path.exists(REPORTS_DIR) and os.listdir(REPORTS_DIR):
        REPORTS_DIR += f'/report_{len(os.listdir(REPORTS_DIR))}'
        os.makedirs(REPORTS_DIR)
if not os.path.exists(REPORTS_DIR):
    os.makedirs(REPORTS_DIR)
    REPORTS_DIR += f'/report_{len(os.listdir(REPORTS_DIR))}'
    os.makedirs(REPORTS_DIR)


In [13]:
if os.path.exists(OUTPUT_DIR) and os.listdir(OUTPUT_DIR):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(OUTPUT_DIR))
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [25]:
processor = BinaryClassificationProcessor()
# train_examples = processor.get_train_examples(DATA_DIR)

In [26]:
import pandas as pd

In [45]:
df_train.head()

Unnamed: 0,id,label,alpha,text
0,0,0,a,World hunger hits one billion people. So Sad ...
1,1,0,a,@KendraWilkinson I can't watch it until next w...
2,2,1,a,just woke up
3,3,1,a,"In the garden with soph, fi, craig, and sarah ..."
4,4,1,a,@ngeeling can't wait to see your new hairstyle...


In [44]:
label_list = df_train['label'].values

In [46]:
train_examples = df_train['text'].values

In [48]:
train_examples_len = len(train_examples)
num_labels = len(label_list)

In [49]:
num_train_optimization_steps = int(
    train_examples_len / TRAIN_BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS) * NUM_TRAIN_EPOCHS

In [50]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

100%|██████████| 213450/213450 [00:00<00:00, 775724.90B/s]


In [51]:
label_map = {label: i for i, label in enumerate(label_list)}
train_examples_for_processing = [(example, label_map, MAX_SEQ_LENGTH, tokenizer, OUTPUT_MODE) for example in train_examples]

In [57]:
process_count = cpu_count() - 1
if __name__ ==  '__main__':
    print(f'Preparing to convert {train_examples_len} examples..')
    print(f'Spawning {process_count} processes..')
    with Pool(process_count) as p:
        train_features = list(tqdm_notebook(p.imap(convert_example_to_feature, train_examples_for_processing), total=train_examples_len))

Preparing to convert 1200000 examples..
Spawning 7 processes..


HBox(children=(IntProgress(value=0, max=1200000), HTML(value='')))

AttributeError: 'str' object has no attribute 'text_a'

In [None]:
convert_example_to_feature

In [30]:
train_examples.head()

Unnamed: 0,0,1
0,0,vacation part of summer = over for now.
1,1,entity ha! nice.... skippy works too
2,0,entity glad to see you're keeping up jk when t...
3,0,friggin luck for force india. just when i tht ...
4,1,entity a good game is one where i can't walk f...


In [21]:
!ls ../data/sentiment_training_data/

dev.tsv
test.tsv
train.tsv
training.1600000.processed.noemoticon.csv


IndexError: list index out of range

In [None]:
train_examples_len = len(train_examples)

In [73]:
from BertLibrary import BertFTModel

ModuleNotFoundError: No module named 'BertLibrary'

In [37]:
train.head()

Unnamed: 0,text,label
1,is upset that he can't update his Facebook by ...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0
5,@Kwesidei not the whole crew,0
7,@LOLTrish hey long time no see! Yes.. Rains a...,0


In [38]:
train['label'].value_counts()

0    560651
4    559144
Name: label, dtype: int64

In [41]:
train.shape

(1119795, 2)

In [40]:
valid

Unnamed: 0,text,label


In [39]:
valid['label'].value_counts()

Series([], Name: label, dtype: int64)

In [24]:
train.tail()

Unnamed: 0,text,label
1119995,"Just come back from my sis' BBQ, fed, watered ...",4
1119996,Peace Twittizens...enjoy the rest of ur day......,4
1119997,@kaps69 Yes - I want to lick Su-Bo's muff!,4
1119998,@anniegxxx yeah theres not a word to describe...,4
1119999,Summer help - Hire a Teenager - Start your sea...,4


## BERT

In [44]:
from pytorch_pretrained_bert.tokenization import BertTokenizer

In [42]:
args = {
    "max_seq_length": 512,
    "do_lower_case": True,
    "train_batch_size": 8,
    "learning_rate": 6e-5,
    "num_train_epochs": 12.0,
    "warmup_proportion": 0.002,
    "local_rank": -1,
    "gradient_accumulation_steps": 1,
    "fp16": True,
    "loss_scale": 128
}

In [45]:
tokenizer = BertTokenizer.from_pretrained(BERT_PRETRAINED_PATH,
                                         do_lower_case=args['do_lower_case'])

NameError: name 'BERT_PRETRAINED_PATH' is not defined

In [29]:
import torch
# import apex
from pytorch_pretrained_bert.tokenization import BertTokenizer
from fast_bert.data import BertDataBunch
from fast_bert.learner import BertLearner
from fast_bert.metrics import accuracy

bert_model = 'bert-base-uncased'

TypeError: Class advice impossible in Python3.  Use the @implementer class decorator instead.