<a href="https://colab.research.google.com/github/Gonzalo933/newtral-interview/blob/feature%2Fmodel/notebooks/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install dvc
!pip install transformers
!pip install mlflow==1.15.0

In [2]:
import numpy as np
import pandas as pd
import dvc.api
import bson
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import (
    StratifiedKFold
)
import torch
import transformers as ppb
import sklearn.metrics as sk_metrics
import mlflow

In [3]:
TRACKING_SERVER_IP = "86.127.251.86"
DATA_VERSION="data/v2"
METRICS_TO_REPORT = ["precision_score", "recall_score", "f1_score", "roc_auc_score", "average_precision_score"]
MODELS_TO_EXECUTE = [
                     DummyClassifier(),
                     LogisticRegression(),
                     RandomForestClassifier(),
]
MAX_ALLOWED_TOKENIZED_LEN = 10
try:
    with dvc.api.open(
        "data/processed/claims.csv",
        repo="https://github.com/Gonzalo933/newtral-interview",
        rev=DATA_VERSION,
        mode="rb",
    ) as fd:
        #data = bson.decode_all(fd.read())
        #df = pd.DataFrame(data)
        df = pd.read_csv(fd)
except:
    with open("/content/claims.csv", "rb") as fd:
        #data = bson.decode_all(fd.read())
        #df = pd.DataFrame(data)
        df = pd.read_csv(fd)



In [4]:
df

Unnamed: 0,_id,text_es,text_en,text_fr,claim
0,5f80940cf95f926ca81a3751,Gracias.,Thank you.,Merci.,0
1,5f80940cf95f926ca81a3752,"Por ejemplo, cuando estamos hablando de un pa...","For example, when we are talking about a coun...","Par exemple, quand on parle d’un pays qui doi...",0
2,5f80940cf95f926ca81a3753,Entonces como solo creo que es como la políti...,So as I just think it's like the politics of ...,"Donc, comme je pense que c’est comme la polit...",0
3,5f80940cf95f926ca81a3754,Y pienso que el Partido Popular no ha estado ...,And I think the People's Party has not risen ...,Et je pense que le Parti populaire n’a pas ét...,0
4,5f80940cf95f926ca81a3755,Lo siguiente Nos vamos ya volando y les dejo ...,The next thing We fly and I leave you with th...,La prochaine chose que nous volons et je vous...,0
...,...,...,...,...,...
14347,5f80940df95f926ca81a6f5d,Que la agricultura pueda alimentar a los ciud...,That agriculture can feed the citizens.,Que l’agriculture peut nourrir les citoyens.,0
14348,5f80940df95f926ca81a6f5e,Hay que homologar con el resto de países euro...,We have to homologate with the other European...,Nous devons nous homologuer avec les autres p...,0
14349,5f80940df95f926ca81a6f5f,Dicen ustedes del programa inicial del gobier...,Do you say of the coalition government's init...,Dites-vous du programme initial du gouverneme...,0
14350,5f80940df95f926ca81a6f60,Hace falta sentarse las fuerzas políticas que...,We need to sit down the political forces that...,Nous devons asseoir les forces politiques que...,0


In [5]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (
    ppb.DistilBertModel,
    ppb.DistilBertTokenizer,
    "distilbert-base-uncased",
)

## Want BERT instead of distilBERT? Uncomment the following line:
# model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [6]:
tokenized = df["text_en"].apply(
    (lambda x: tokenizer.encode(x, add_special_tokens=True))
)
labels = df["claim"]

In [7]:
tokenized

0                             [101, 4067, 2017, 1012, 102]
1        [101, 2005, 2742, 1010, 2043, 2057, 2024, 3331...
2        [101, 2061, 2004, 1045, 2074, 2228, 2009, 1005...
3        [101, 1998, 1045, 2228, 1996, 2111, 1005, 1055...
4        [101, 1996, 2279, 2518, 2057, 4875, 1998, 1045...
                               ...                        
14347    [101, 2008, 5237, 2064, 5438, 1996, 4480, 1012...
14348    [101, 2057, 2031, 2000, 24004, 21197, 3686, 20...
14349    [101, 2079, 2017, 2360, 1997, 1996, 6056, 2231...
14350    [101, 2057, 2342, 2000, 4133, 2091, 1996, 2576...
14351    [101, 1996, 7233, 4636, 1010, 1999, 2026, 3193...
Name: text_en, Length: 14352, dtype: object

In [8]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized.values])
padded.shape

(14352, 150)

In [9]:
padded = padded[:, :MAX_ALLOWED_TOKENIZED_LEN]
padded.shape

(14352, 10)

In [10]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(14352, 10)

In [11]:
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [12]:
X = last_hidden_states[0][:, 0, :].numpy()

In [13]:
def calculate_metrics_both_classes(metric_name, sk_metrics, y_true, y_pred):
    negative_class_metric_value = getattr(sk_metrics, metric_name)(
        y_true, y_pred, pos_label=0
    )
    positive_class_metric_value = getattr(sk_metrics, metric_name)(
        y_true, y_pred, pos_label=1
    )
    return {
        f"{metric_name}_0": negative_class_metric_value,
        f"{metric_name}_1": positive_class_metric_value,
    }


def calculate_metric_one_class(metric_name, sk_metrics, y_true, y_pred):
    return getattr(sk_metrics, metric_name)(y_true, y_pred)


def calculate_metrics(df_metrics, metrics_to_report, sk_metrics, y_true, y_pred, y_pred_proba):
    metrics = {}
    for metric_name in metrics_to_report:
        if "The class to report" in getattr(sk_metrics, metric_name).__doc__:
            # The metrics can be calculated for both classes
            metrics.update(calculate_metrics_both_classes(metric_name, sk_metrics, y_true, y_pred))
        else:
            metrics.update({metric_name: calculate_metric_one_class(metric_name, sk_metrics, y_true, y_pred)})
    return df_metrics.append(pd.Series(metrics), ignore_index=True)

def run_experiment(skf, clf):
    df_metrics = pd.DataFrame()
    for train_index, test_index in skf.split(X, labels):
        clf.fit(X[train_index], labels[train_index])

        y_pred = clf.predict(X[test_index])
        y_true = labels[test_index]
        y_pred_proba = clf.predict_proba(X[test_index])
    
        df_metrics = calculate_metrics(df_metrics, METRICS_TO_REPORT, sk_metrics, y_true, y_pred, y_pred_proba)
    return df_metrics

In [15]:
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_IP}:5000")
mlflow.set_experiment(DATA_VERSION)

skf = StratifiedKFold(n_splits=3)
clf = LogisticRegression()
for clf in MODELS_TO_EXECUTE:
    with mlflow.start_run(run_name=clf.__class__.__name__):
        mlflow.log_param("model", f"{clf.__module__}.{clf.__class__.__name__}")
        mlflow.log_param("model_params", clf.get_params())
        mlflow.log_param("MAX_ALLOWED_TOKENIZED_LEN", MAX_ALLOWED_TOKENIZED_LEN)
        
        df_metrics = run_experiment(skf, clf)
        # Report averages to mlflow
        print(df_metrics)
        for metric, value in df_metrics.mean().to_dict().items():
            mlflow.log_metric(metric, value)

INFO: 'data/v2' does not exist. Creating a new experiment




   average_precision_score  f1_score_0  ...  recall_score_1  roc_auc_score
0                 0.073378    0.923564  ...        0.067797       0.494846
1                 0.073696    0.924658  ...        0.067606       0.495871
2                 0.073843    0.924641  ...        0.070423       0.497167

[3 rows x 8 columns]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

   average_precision_score  f1_score_0  ...  recall_score_1  roc_auc_score
0                 0.184591    0.962120  ...        0.240113       0.611704
1                 0.159867    0.961339  ...        0.194366       0.589845
2                 0.169141    0.960195  ...        0.230986       0.605671

[3 rows x 8 columns]
   average_precision_score  f1_score_0  ...  recall_score_1  roc_auc_score
0                 0.170951    0.965427  ...        0.115819       0.557458
1                 0.160137    0.964916  ...        0.095775       0.547774
2                 0.152355    0.964593  ...        0.090141       0.544845

[3 rows x 8 columns]
