<a href="https://colab.research.google.com/github/Gonzalo933/newtral-interview/blob/feature%2Fmodel/notebooks/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
!pip install dvc
!pip install transformers
!pip install mlflow==1.15.0



In [28]:
import numpy as np
import pandas as pd
import dvc.api
import bson
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import (
    StratifiedKFold
)
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
import transformers as ppb
import sklearn.metrics as sk_metrics
import mlflow

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [29]:
TRACKING_SERVER_IP = "79.116.1.145"
DATA_VERSION="data/v2"
METRICS_TO_REPORT = ["precision_score", "recall_score", "f1_score", "roc_auc_score", "average_precision_score"]
MODELS_TO_EXECUTE = [
                     DummyClassifier(),
                     LogisticRegression(),
                     RandomForestClassifier(),
]
MAX_ALLOWED_TOKENIZED_LEN = 10
BERT_NAME = "bert-base-uncased" # "bert-base-multilingual-cased"
LANGS = ["text_en"]
try:
    # With dvc
    with dvc.api.open(
        "data/processed/claims.csv",
        repo="https://github.com/Gonzalo933/newtral-interview",
        rev=DATA_VERSION,
        mode="rb",
    ) as fd:
        df_raw = pd.read_csv(fd)
except:
    # No dvc
    with open("/content/claims.csv", "rb") as fd:
        df_raw = pd.read_csv(fd)



In [30]:
df = pd.DataFrame(columns=["text", "claim"])
df["text"] = pd.concat([df_raw[col] for col in LANGS])
df["claim"] = pd.concat([df_raw["claim"] for _ in LANGS])

In [31]:
df

Unnamed: 0,text,claim
0,Thank you.,0
1,"For example, when we are talking about a coun...",0
2,So as I just think it's like the politics of ...,0
3,And I think the People's Party has not risen ...,0
4,The next thing We fly and I leave you with th...,0
...,...,...
14347,That agriculture can feed the citizens.,0
14348,We have to homologate with the other European...,0
14349,Do you say of the coalition government's init...,0
14350,We need to sit down the political forces that...,0


In [32]:
tokenizer = AutoTokenizer.from_pretrained(BERT_NAME)
model = AutoModel.from_pretrained(BERT_NAME,     num_labels = 2, # The number of output labels--2 for binary classification.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
    )
model = model.to(device)

In [33]:
tokenized = df["text"].apply(
    (lambda x: tokenizer.encode(x, add_special_tokens=True))
)
labels = df["claim"]

In [34]:
tokenized

0                             [101, 4067, 2017, 1012, 102]
1        [101, 2005, 2742, 1010, 2043, 2057, 2024, 3331...
2        [101, 2061, 2004, 1045, 2074, 2228, 2009, 1005...
3        [101, 1998, 1045, 2228, 1996, 2111, 1005, 1055...
4        [101, 1996, 2279, 2518, 2057, 4875, 1998, 1045...
                               ...                        
14347    [101, 2008, 5237, 2064, 5438, 1996, 4480, 1012...
14348    [101, 2057, 2031, 2000, 24004, 21197, 3686, 20...
14349    [101, 2079, 2017, 2360, 1997, 1996, 6056, 2231...
14350    [101, 2057, 2342, 2000, 4133, 2091, 1996, 2576...
14351    [101, 1996, 7233, 4636, 1010, 1999, 2026, 3193...
Name: text, Length: 14352, dtype: object

In [35]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized.values])
print(padded.shape)
print(max_len)

(14352, 150)
150


In [36]:
padded = padded[:, :MAX_ALLOWED_TOKENIZED_LEN]
padded.shape

(14352, 10)

In [37]:
padded

array([[ 101, 4067, 2017, ...,    0,    0,    0],
       [ 101, 2005, 2742, ..., 3331, 2055, 1037],
       [ 101, 2061, 2004, ..., 1005, 1055, 2066],
       ...,
       [ 101, 2079, 2017, ..., 2231, 1005, 1055],
       [ 101, 2057, 2342, ..., 2576, 2749, 2008],
       [ 101, 1996, 7233, ..., 3193, 2012, 2647]])

In [38]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(14352, 10)

In [39]:
attention_mask

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

In [40]:
padded.shape

(14352, 10)

In [41]:
attention_mask.shape

(14352, 10)

In [42]:
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)
labels = torch.tensor(labels)

In [45]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
batch_size = 32
# wrap tensors
train_data = TensorDataset(input_ids, attention_mask, labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

epochs = 4

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)


for epoch_i in range(0, epochs):
    print(f"Epoch {epoch_i}")
    model.train()
    for step, batch in enumerate(train_dataloader):
        batch_input_ids = batch[0].to(device)
        batch_input_mask = batch[1].to(device)
        batch_labels = batch[2].to(device)
        model.zero_grad() 
        
        result = model(batch_input_ids, 
                            token_type_ids=None, 
                            attention_mask=batch_input_mask, 
                            return_dict=True)
        result.loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

model.eval()
last_hidden_states = model(input_ids.to(device), attention_mask=attention_mask.to(device))

Epoch 0
> <ipython-input-45-57c5166e95f4>(35)<module>()
-> result = model(batch_input_ids,
(Pdb) c


AttributeError: ignored

In [None]:
X = last_hidden_states[0][:, 0, :].numpy()

In [None]:
def calculate_metrics_both_classes(metric_name, sk_metrics, y_true, y_pred):
    negative_class_metric_value = getattr(sk_metrics, metric_name)(
        y_true, y_pred, pos_label=0
    )
    positive_class_metric_value = getattr(sk_metrics, metric_name)(
        y_true, y_pred, pos_label=1
    )
    return {
        f"{metric_name}_0": negative_class_metric_value,
        f"{metric_name}_1": positive_class_metric_value,
    }


def calculate_metric_one_class(metric_name, sk_metrics, y_true, y_pred):
    return getattr(sk_metrics, metric_name)(y_true, y_pred)


def calculate_metrics(df_metrics, metrics_to_report, sk_metrics, y_true, y_pred, y_pred_proba):
    metrics = {}
    for metric_name in metrics_to_report:
        if "The class to report" in getattr(sk_metrics, metric_name).__doc__:
            # The metrics can be calculated for both classes
            metrics.update(calculate_metrics_both_classes(metric_name, sk_metrics, y_true, y_pred))
        else:
            metrics.update({metric_name: calculate_metric_one_class(metric_name, sk_metrics, y_true, y_pred)})
    return df_metrics.append(pd.Series(metrics), ignore_index=True)

def run_experiment(skf, clf):
    df_metrics = pd.DataFrame()
    for train_index, test_index in skf.split(X, labels):
        clf.fit(X[train_index], labels[train_index])

        y_pred = clf.predict(X[test_index])
        y_true = labels[test_index]
        y_pred_proba = clf.predict_proba(X[test_index])
    
        df_metrics = calculate_metrics(df_metrics, METRICS_TO_REPORT, sk_metrics, y_true, y_pred, y_pred_proba)
    return df_metrics

In [None]:
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_IP}:5000")
mlflow.set_experiment(DATA_VERSION)

skf = StratifiedKFold(n_splits=3)
clf = LogisticRegression()
for clf in MODELS_TO_EXECUTE:
    with mlflow.start_run(run_name=clf.__class__.__name__):
        mlflow.log_param("model", f"{clf.__module__}.{clf.__class__.__name__}")
        mlflow.log_param("model_params", clf.get_params())
        mlflow.log_param("MAX_ALLOWED_TOKENIZED_LEN", MAX_ALLOWED_TOKENIZED_LEN)
        mlflow.log_param("BERT_NAME", BERT_NAME)
        mlflow.log_param("LANGS", LANGS)
        
        df_metrics = run_experiment(skf, clf)
        # Report averages to mlflow
        print(df_metrics)
        for metric, value in df_metrics.mean().to_dict().items():
            mlflow.log_metric(metric, value)