In [20]:
! pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.2.1-cp310-cp310-macosx_10_9_x86_64.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.1 threadpoolctl-3.1.0


In [21]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Read each JSON Lines split (train / dev / test) into its own pandas DataFrame
data_train, data_valid, data_test = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        './data/train.jsonl',
        './data/dev.jsonl',
        './data/test.jsonl',
    )
)

In [5]:
# Preview the first five rows of the training split
data_train.head(n=5)

Unnamed: 0,text,label,label_text
0,"a stirring , funny and finally transporting re...",4,very positive
1,apparently reassembled from the cutting-room f...,1,negative
2,they presume their audience wo n't sit still f...,1,negative
3,the entire movie is filled with deja vu moments .,2,neutral
4,this is a visually stunning rumination on love...,3,positive


| Text Label   |      Label      |
|----------|:-------------:|
| very negative |  0 |
| negative|    1   |
| neutral | 2 |
| positive | 3 |
| very positive | 4 |

In [7]:
# Initialize the BERT tokenizer from the pretrained 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 54.0MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 11.2kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 299kB/s]


In [8]:
seq_len = 512 # maximum number of tokens per sequence that BERT accepts

In [12]:
def tokenize(dataset):
    """Tokenize the ``'text'`` column of ``dataset`` with the notebook's BERT tokenizer.

    Args:
        dataset: DataFrame-like object whose ``'text'`` column holds the raw
            strings to encode.

    Returns:
        The tokenizer output as NumPy arrays (``return_tensors='np'``). Every
        sequence is truncated or padded to exactly ``seq_len`` tokens and
        includes the special tokens.
    """
    # Encode the dataset that was passed in. The previous version always read
    # data_train here, so the validation and test splits silently received the
    # training tokens.
    tokens = tokenizer(
        dataset['text'].tolist(),
        max_length=seq_len,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='np',
    )
    return tokens

In [31]:
# Run the tokenize helper once per split
train_tokens, valid_tokens, test_tokens = (
    tokenize(split) for split in (data_train, data_valid, data_test)
)
# Each result is separated into {input_ids, token_type_ids, attention_mask};
# input_ids and attention_mask are the features we keep.

In [34]:
NUM_CLASSES = 5

#One hot encode since we have more than 2 classes
def one_hot_encode(dataset, num_classes=None):
    """Return a one-hot matrix built from the ``'label'`` column of ``dataset``.

    Args:
        dataset: DataFrame-like object with an integer ``'label'`` column.
        num_classes: Width of the encoding. Defaults to ``NUM_CLASSES``
            (looked up at call time) when omitted.

    Returns:
        A float ``numpy.ndarray`` of shape ``(len(dataset), num_classes)`` in
        which row ``i`` has a 1 in the column given by the i-th label and 0
        elsewhere.

    Raises:
        IndexError: If a label falls outside ``[-num_classes, num_classes)``
            or the labels are not integers.
    """
    if num_classes is None:
        num_classes = NUM_CLASSES

    # Work on the raw values so rows are addressed by position; looking labels
    # up through the Series index breaks when the index is not 0..n-1.
    labels = np.asarray(dataset['label'])
    num_samples = len(labels)
    encoding = np.zeros((num_samples, num_classes))

    # An empty label array may not have an integer dtype, so skip the indexed
    # assignment when there is nothing to encode.
    if num_samples:
        encoding[np.arange(num_samples), labels] = 1

    return encoding
    
# One-hot encode the labels of each split, then show the training matrix
y_train, y_valid, y_test = (
    one_hot_encode(split) for split in (data_train, data_valid, data_test)
)
print(y_train)


[[0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 ...
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1.]]


In [32]:
# Persist the token ids of every split as .npy files
for out_path, encoded in (
    ('./data/preprocessed/features/X_train_ids.npy', train_tokens),
    ('./data/preprocessed/features/X_valid_ids.npy', valid_tokens),
    ('./data/preprocessed/features/X_test_ids.npy', test_tokens),
):
    with open(out_path, 'wb') as fh:
        np.save(fh, encoded['input_ids'])

In [33]:
# Persist the attention masks of every split as .npy files
for out_path, encoded in (
    ('./data/preprocessed/features/X_train_mask.npy', train_tokens),
    ('./data/preprocessed/features/X_valid_mask.npy', valid_tokens),
    ('./data/preprocessed/features/X_test_mask.npy', test_tokens),
):
    with open(out_path, 'wb') as fh:
        np.save(fh, encoded['attention_mask'])

In [29]:
# Persist the one-hot label matrices of every split as .npy files
for out_path, label_matrix in (
    ('./data/preprocessed/labels/y_train.npy', y_train),
    ('./data/preprocessed/labels/y_valid.npy', y_valid),
    ('./data/preprocessed/labels/y_test.npy', y_test),
):
    with open(out_path, 'wb') as fh:
        np.save(fh, label_matrix)