In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 2.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 14.3MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 17.8MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K 

In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

### IMPORTANTE: correr el archivo Data Processing.ipynb que se encuentra en la carpeta Data/ antes de leer los .csv

In [2]:
# Load the processed train/test sets and the sample submission, then report their shapes.
csv_paths = ("Data/train_processed.csv", "Data/test_processed.csv", "Data/sample_submission.csv")
train, test, sub_sample = (pd.read_csv(csv_path) for csv_path in csv_paths)

print(train.shape, test.shape, sub_sample.shape)

(7613, 22) (3263, 21) (3263, 2)


## BERT

BERT (Bidirectional Encoder Representations from Transformers) es una técnica de Natural Language Processing que consiste en preentrenar un transformador para que entienda el contexto de las palabras y luego entrenar un modelo para realizar una predicción de clasificación.
Para este análisis vamos a usar DistilBERT, una versión preentrenada y de código abierto desarrollada por el equipo de HuggingFace.

In [3]:
# Target column of the training set.
y_train = train['target']

# Checkpoint name plus the matching DistilBERT model/tokenizer classes.
pretrained_weights = 'distilbert-base-uncased'
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer

# Instantiate both from the pretrained checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [9]:
# Peek at the raw contents of the 'text' column (the column tokenized further down).
train['text'].values

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       ...,
       'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ',
       'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.',
       'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d'],
      dtype=object)

In [5]:
# Encode each text as a list of token ids (special tokens included).
# Only the first 2000 rows are processed in this pass.
batch_1 = train['text'].iloc[:2000]
tokenized = batch_1.apply(lambda tweet: tokenizer.encode(tweet, add_special_tokens=True))


In [7]:
# Every entry of `tokenized` is a list of token ids, and the lists differ in length.
# Right-pad each list with zeros up to the longest one so that all rows have the
# same number of columns and stack into a rectangular array.
max_len = max((len(ids) for ids in tokenized.values), default=0)

padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

# Mask
Recién tomamos una parte del dataset, obtuvimos los tokens a partir del texto y agregamos padding para que cada fila tenga la misma cantidad de columnas.
Ahora tenemos que decirle a BERT qué celdas del dataset son padding agregado por nosotros para que las omita en la predicción.



In [8]:
# 1 where the cell holds a non-zero token id, 0 where it holds the zero padding.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(2000, 81)

In [9]:
# Convert the padded ids and their mask to tensors, then run the model
# with gradient tracking disabled (inference only).
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [11]:
# Inspect the raw model output; element 0 is the tensor that the next cell
# slices as [:, 0, :] to build the feature matrix.
last_hidden_states

(tensor([[[-0.0078,  0.2434, -0.1924,  ..., -0.0423,  0.5865,  0.0078],
          [ 0.3320,  0.3346,  0.0207,  ..., -0.1569,  0.8014, -0.4104],
          [ 0.5480,  0.2635,  0.1367,  ...,  0.0184,  0.0848, -0.2652],
          ...,
          [-0.0550,  0.2640,  0.1535,  ...,  0.1169,  0.0369, -0.1629],
          [ 0.0152,  0.2166,  0.1225,  ...,  0.1965,  0.0519, -0.1487],
          [-0.0369,  0.2852,  0.1945,  ...,  0.0811,  0.0711, -0.1032]],
 
         [[-0.5075,  0.1034, -0.6453,  ..., -0.2546,  0.2126,  0.3391],
          [ 0.4142,  0.5571, -0.3625,  ..., -0.0183,  0.3380, -0.0851],
          [ 0.3048,  0.2969, -0.3315,  ..., -0.3804, -0.0696, -0.3818],
          ...,
          [ 0.0199,  0.1257, -0.2037,  ...,  0.1662, -0.1118,  0.4595],
          [-0.1041,  0.1329, -0.2229,  ..., -0.0524, -0.0426,  0.2601],
          [-0.0437,  0.0847, -0.2386,  ...,  0.0112, -0.1759,  0.4035]],
 
         [[-0.1455, -0.1657,  0.2994,  ..., -0.1828,  0.0257,  0.4130],
          [-0.1536,  0.0443,

In [13]:
# Keep the hidden state at position 0 of every sequence as that row's feature vector.
first_token_states = last_hidden_states[0][:, 0, :]
features = first_token_states.numpy()
features

array([[-0.00779025,  0.24342766, -0.19241537, ..., -0.04230952,
         0.58648366,  0.00777953],
       [-0.507526  ,  0.10338616, -0.6452901 , ..., -0.2545608 ,
         0.21263906,  0.33910495],
       [-0.14549513, -0.16572241,  0.29940492, ..., -0.18275894,
         0.02567799,  0.41298452],
       ...,
       [-0.25057685, -0.11841887,  0.0774207 , ..., -0.05480941,
         0.15326285,  0.28441513],
       [ 0.00270303, -0.03092067, -0.00259606, ..., -0.02553035,
         0.22686933,  0.24538912],
       [-0.04196359,  0.12659155,  0.09899823, ..., -0.16992418,
         0.24939045,  0.476121  ]], dtype=float32)

In [16]:
# Targets for the same first 2000 rows that were used to build `features`.
labels = train['target'].iloc[:2000]
labels

0       1
1       1
2       1
3       1
4       1
       ..
1995    1
1996    1
1997    1
1998    0
1999    1
Name: target, Length: 2000, dtype: int64

In [18]:
# Fit a logistic regression on the extracted features and display the fitted estimator.
lr_clf = LogisticRegression(max_iter=1500).fit(features, labels)
lr_clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
# Check the size of the test set before computing its features.
test.shape

(3263, 21)

In [None]:
def get_features_from_texts(text, tokenizer, max_len, encoder=None, batch_size=None):
    """Turn raw texts into one fixed-size feature vector per text.

    Each text is tokenized, truncated or zero-padded to exactly ``max_len``
    token ids, and passed through the encoder with gradient tracking disabled.
    The hidden state at position 0 of every sequence is returned as the
    feature vector of that text.

    Args:
        text: pandas Series of strings.
        tokenizer: object exposing ``encode(text, add_special_tokens=True)``
            that returns a list of token ids.
        max_len: number of token ids per row fed to the encoder. Shorter
            sequences are right-padded with 0. Longer ones are truncated;
            previously they produced a ragged array and the tensor conversion
            failed.
        encoder: callable invoked as ``encoder(input_ids, attention_mask=...)``
            whose result, indexed with ``[0]``, is a 3-D tensor
            (rows, positions, hidden). Defaults to the notebook-level ``model``.
        batch_size: rows per forward pass. ``None`` (default) runs everything
            in a single pass, as before; a smaller value bounds peak memory.

    Returns:
        numpy array with one row per input text. An empty input yields an
        empty array of shape (0, 0).

    Raises:
        ValueError: if ``max_len`` or ``batch_size`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be >= 1")
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    if encoder is None:
        encoder = model  # fall back to the model loaded at notebook level

    tokenized = text.apply(lambda x: tokenizer.encode(x, add_special_tokens=True))  # tokenize

    n_rows = len(tokenized)
    padded = np.zeros((n_rows, max_len), dtype=np.int64)
    for row, ids in enumerate(tokenized.values):
        if len(ids) > max_len:
            # Truncate but keep the last id, i.e. the closing special token
            # appended by add_special_tokens=True.
            ids = ids[:max_len - 1] + ids[-1:]
        padded[row, :len(ids)] = ids  # remaining cells stay 0 (padding)

    # Mask out the padding cells so the encoder ignores them.
    attention_mask = torch.tensor(np.where(padded != 0, 1, 0))
    input_ids = torch.tensor(padded)

    step = n_rows if batch_size is None else batch_size
    chunks = []
    with torch.no_grad():
        for start in range(0, n_rows, max(step, 1)):
            stop = start + step
            hidden_states = encoder(input_ids[start:stop],
                                    attention_mask=attention_mask[start:stop])
            # Position 0 of each sequence is used as the sentence-level feature.
            chunks.append(hidden_states[0][:, 0, :].numpy())

    if not chunks:
        return np.empty((0, 0), dtype=np.float32)
    return np.concatenate(chunks, axis=0)

# Embed every test text with the same max_len used for the training batch,
# then classify the resulting feature vectors.
test_features = get_features_from_texts(text=test['text'], tokenizer=tokenizer, max_len=max_len)
test_labels = lr_clf.predict(X=test_features)

In [24]:
# Count how many test rows were assigned to each predicted class.
pd.Series(test_labels).value_counts()

0    2023
1    1240
dtype: int64

In [32]:
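# lr_clf.score(test_features, test_labels)
# NOTE: left disabled on purpose. test_labels are lr_clf's own predictions for test_features,
# so this score would always be 1.0 and says nothing about real accuracy.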

In [25]:
# Fill a copy of the sample submission with the predicted labels and write it to disk.
submit = sub_sample.assign(target=test_labels)
submit.to_csv('submit_bert.csv', index=False)

In [31]:
# F1 score of the classifier on the first 2000 training rows.
predictions = lr_clf.predict(features)
train_f1 = f1_score(train['target'][0:2000], predictions)
print("Training set f1_score :", np.round(train_f1, 5))

Training set f1_score : 0.82976


In [28]:
# Confusion matrix for the first 2000 training rows.
conf_matrix = confusion_matrix(train['target'][0:2000], predictions)
pd.DataFrame(conf_matrix)

Unnamed: 0,0,1
0,1182,64
1,174,580


# Entreno usando los datos que dejé afuera por RAM

In [33]:
# Extract features for rows 2000-3999, which were left out of the first pass.
features_batch2 = get_features_from_texts(train['text'][2000:4000], tokenizer, max_len)
labels_bacth2 = train['target'][2000:4000]  # name kept as-is (sic)

# LogisticRegression.fit() starts from scratch on every call, so fitting on this
# batch alone would throw away what was learned from the first 2000 rows.
# Refit on both batches stacked in row order; the labels are rows 0-3999.
lr_clf.fit(np.vstack([features, features_batch2]), train['target'][:4000])

NameError: ignored

# Veo si mejoró


In [None]:
# Re-evaluate the refitted classifier on the first 2000 training rows.
predictions = lr_clf.predict(features)
train_f1 = f1_score(train['target'][0:2000], predictions)
print("Training set f1_score :", np.round(train_f1, 5))

# Entreno usando los datos que faltaban


In [None]:
# Extract features for the remaining rows (4000 onwards).
features_batch3 = get_features_from_texts(train['text'][4000:], tokenizer, max_len)
labels_bacth3 = train['target'][4000:]  # name kept as-is (sic)

# fit() does not train incrementally: it replaces the previous model. Fit on all
# three feature batches stacked in row order so the classifier sees the whole
# training set; the matching labels are the full target column.
lr_clf.fit(np.vstack([features, features_batch2, features_batch3]), train['target'])

# Veo si mejoró


In [None]:
# Re-evaluate the refitted classifier on the first 2000 training rows.
predictions = lr_clf.predict(features)
train_f1 = f1_score(train['target'][0:2000], predictions)
print("Training set f1_score :", np.round(train_f1, 5))

# Exporto resultado final


In [None]:
# Predict the test set with the final classifier and export the submission file.
test_labels = lr_clf.predict(test_features)
submit = sub_sample.assign(target=test_labels)
submit.to_csv('submit_bert.csv', index=False)