### The aim of this notebook is to load, analyze and preprocess raw data  (clean if needed).

In [1]:
import pandas as pd

In [2]:
# Load raw data

In [3]:
raw_data = pd.read_json("../data/raw/Graduate - HEADLINES dataset (2019-06).json", lines=True)

In [4]:
raw_data.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [5]:
# Get basic information about data set

In [6]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26709 entries, 0 to 26708
Data columns (total 2 columns):
headline        26709 non-null object
is_sarcastic    26709 non-null int64
dtypes: int64(1), object(1)
memory usage: 417.4+ KB


In [7]:
# Prepare data to tokenization

In [8]:
sentence = raw_data["headline"]
result = raw_data["is_sarcastic"]

In [9]:
# Break words into tokens, add CLS--start and SEP--end tokens, convert tokens into unique id

In [10]:
import torch
import transformers as ppb

In [11]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [12]:
tokenized = sentence.apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [13]:
tokenized_data = pd.concat((tokenized, result), axis=1)

In [14]:
tokenized_data.to_json("../data/interim/Graduate - HEADLINES dataset (2019-06)_TOKENIZED.json", orient='records', lines=True)

In [15]:
# Create padded matrix with the same number of columns

In [16]:
import numpy as np

In [17]:
max_len = 0
for row in tokenized.values:
    if len(row) > max_len:
        max_len = len(row)

padded = np.array([row + [0] * (max_len - len(row)) for row in tokenized.values])

In [18]:
padded.shape

(26709, 66)

In [19]:
# Create data packages, this step is realized in google cloud instance

In [24]:
number_of_rows = padded.shape[0]

size_of_packages = 500
number_of_packages = round(number_of_rows / size_of_packages, 0) + 1
start = 0
end = 500

In [25]:
for pack in range(int(number_of_packages)):
    
    print("Epoch: {} / {}".format(pack+1, number_of_packages))
    
    if number_of_rows < size_of_packages:
        size_of_packages = number_of_rows
    
    padded_set = padded[start: end, :]
    labels_set = np.array(result[start: end]).reshape(-1, 1)
    
    input_ids = torch.tensor(np.array(padded_set))
    
    with torch.no_grad():
        last_hidden_states = model(input_ids)
        
    features = last_hidden_states[0][:,0,:].numpy()
    ready_pack = np.concatenate((features, labels_set), axis = 1)
    np.save("../data/interim/distilBERT_output/pack_{}.npy".format(pack), ready_pack)
    
    number_of_rows = number_of_rows - size_of_packages
    start += size_of_packages
    end += size_of_packages
    
    print(start, end)
    print(size_of_packages)

Epoch: 1 / 54.0
500 1000
500
Epoch: 2 / 54.0
1000 1500
500
Epoch: 3 / 54.0
1500 2000
500
Epoch: 4 / 54.0
2000 2500
500
Epoch: 5 / 54.0
2500 3000
500
Epoch: 6 / 54.0
3000 3500
500
Epoch: 7 / 54.0
3500 4000
500
Epoch: 8 / 54.0
4000 4500
500
Epoch: 9 / 54.0
4500 5000
500
Epoch: 10 / 54.0
5000 5500
500
Epoch: 11 / 54.0
5500 6000
500
Epoch: 12 / 54.0
6000 6500
500
Epoch: 13 / 54.0
6500 7000
500
Epoch: 14 / 54.0
7000 7500
500
Epoch: 15 / 54.0
7500 8000
500
Epoch: 16 / 54.0
8000 8500
500
Epoch: 17 / 54.0
8500 9000
500
Epoch: 18 / 54.0
9000 9500
500
Epoch: 19 / 54.0
9500 10000
500
Epoch: 20 / 54.0
10000 10500
500
Epoch: 21 / 54.0
10500 11000
500
Epoch: 22 / 54.0
11000 11500
500
Epoch: 23 / 54.0
11500 12000
500
Epoch: 24 / 54.0
12000 12500
500
Epoch: 25 / 54.0
12500 13000
500
Epoch: 26 / 54.0
13000 13500
500
Epoch: 27 / 54.0
13500 14000
500
Epoch: 28 / 54.0
14000 14500
500
Epoch: 29 / 54.0
14500 15000
500
Epoch: 30 / 54.0
15000 15500
500
Epoch: 31 / 54.0
15500 16000
500
Epoch: 32 / 54.0
16000 1