In [2]:
import warnings

import numpy as np
import pandas as pd
import torch
import transformers as ppb
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# Suppress every warning for the rest of the session. This is a blanket filter:
# it also hides deprecation notices and any warnings the libraries used below
# may emit while fitting.
# NOTE(review): consider narrowing this to specific warning categories.
warnings.filterwarnings('ignore')

In [3]:
# Load the SST-2 training file: tab-separated with no header row, so pandas
# numbers the columns 0 and 1.
df = pd.read_csv(
    'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv',
    sep='\t',
    header=None,
)

In [4]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [5]:
# (rows, columns) of the full dataset.
df.shape

(6920, 2)

In [6]:
# Keep only the first 2000 rows so that the later steps run faster.
df_batch1 = df.iloc[:2000]

In [7]:
# Confirm the size of the working subset.
df_batch1.shape

(2000, 2)

In [8]:
# Count how many rows carry each value of column 1 (the column used as the
# labels further down) to see how balanced the subset is.
df_batch1[1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

In [9]:
# Select DistilBERT: the model class, its matching tokenizer class, and the
# name of the pre-trained checkpoint to load.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

In [10]:
# Load the pre-trained tokenizer and model for the checkpoint selected above.
# Both calls receive only the checkpoint name; the "Downloading" progress bars
# printed under this cell appear while these two calls run.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=546.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




### Preparing the dataset

In [11]:
# Encode every sentence in column 0 as a list of token ids (words and
# sub-words), asking the tokenizer to add its special tokens as well.
tokenized = df_batch1[0].apply(
    lambda sentence: tokenizer.encode(sentence, add_special_tokens=True)
)

In [12]:
# Peek at the first few encoded sentences.
tokenized.head()

0    [101, 1037, 18385, 1010, 6057, 1998, 2633, 182...
1    [101, 4593, 2128, 27241, 23931, 2013, 1996, 62...
2    [101, 2027, 3653, 23545, 2037, 4378, 24185, 10...
3    [101, 2023, 2003, 1037, 17453, 14726, 19379, 1...
4    [101, 5655, 6262, 1005, 1055, 12075, 2571, 376...
Name: 0, dtype: object

In [13]:
# One entry per row of df_batch1 is expected here.
tokenized.shape

(2000,)

In [14]:
# Padding: right-pad every token-id list with 0 up to the length of the longest
# one, so the lists stack into a rectangular (sentences, max_len) array.
max_len = max((len(ids) for ids in tokenized.values), default=0)

padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])
    

In [15]:
# padded is already a NumPy array (built with np.array in the padding cell), so
# read its shape directly instead of copying it through np.array() first.
padded.shape

(2000, 59)

In [16]:
# Masking: 1 wherever padded holds a non-zero id, 0 at the zero-filled padding
# positions. Same shape as padded.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(2000, 59)

### Model #1

In [17]:
# Run the padded sentences through the model. The model's output is stored in
# last_hidden_states.
# Note: attention_mask is rebound here from the NumPy array to a torch tensor.
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

# Inference only: no gradients are needed, so gradient tracking is turned off.
with torch.no_grad():
    last_hidden_states = model(input_ids=input_ids, attention_mask=attention_mask)

In [19]:
# Take the first element of the model output and keep, for every sentence, only
# the vector at token position 0; that slice becomes the classifier's features.
first_output = last_hidden_states[0]
features = first_output[:, 0, :].numpy()

# Column 1 of the subset holds the target values.
labels = df_batch1[1]

In [20]:
# Train/test split. random_state is fixed so the split, and every score computed
# from it, is the same on each Restart & Run All; the split proportions are
# left at train_test_split's defaults.
# The dummy-classifier baseline is not computed in this cell: it has its own
# cell at the end of the notebook, next to the score it is compared against.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)



In [21]:
# Cross-validated grid search over the logistic-regression regularisation
# parameter C: 20 evenly spaced candidates between 0.0001 and 100.
parameters = dict(C=np.linspace(0.0001, 100, num=20))
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=parameters)
grid_search.fit(train_features, train_labels)

# Report the winning C and its mean cross-validated score.
print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 5.263252631578947}
best scrores:  0.818


In [22]:
# Fit the final logistic regression on the whole training split with the C
# chosen by the grid search. The value is read from grid_search.best_params_
# instead of being pasted in by hand: a copied number is truncated and goes
# stale as soon as the split or the grid changes.
lr_clf = LogisticRegression(C=grid_search.best_params_['C'])
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=5.26325, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
# Mean accuracy of the fitted classifier on the held-out test split.
lr_clf.score(test_features, test_labels)

0.824

In [24]:
# Chance-level baseline: a classifier that ignores the features, scored with the
# same cross-validation helper on the training split, to put the logistic
# regression accuracy in context.
# The strategy is set explicitly because DummyClassifier's default differs
# between scikit-learn releases; 'stratified' draws random predictions following
# the training class distribution, so random_state is fixed to make the
# reported number reproducible.
clf = DummyClassifier(strategy='stratified', random_state=42)

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.511 (+/- 0.06)
