In [None]:
# Colab only: mount Google Drive at /content/drive so that files stored on the
# Drive can be reached from this notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Colab only: switch the working directory to the project folder.
# NOTE(review): the path is relative, so this presumably relies on the kernel
# starting in the directory that contains the `drive` mount point — confirm.
%cd drive/MyDrive/twitter_sentiment_with_mlflow

In [None]:
# install requirements
!pip install -r requirements.txt

In [1]:
# Imports: data handling, modelling, evaluation metrics and experiment tracking
import pandas as pd
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import os
import sys
import numpy as np
from datetime import datetime
import mlflow

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Make the project's `src` folder importable. It is resolved as
# <parent of the current working directory>/src.
parent_dir = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), 'src'))

# Register the folder only once, so re-running this cell does not pile up
# duplicate entries in sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [3]:
from settings import (
    MLFLOW_TRACKING_URI,
    MLFLOW_TRACKING_USERNAME,
    MLFLOW_TRACKING_PASSWORD,
)

In [4]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
# Project root = parent of the current working directory.
base_dir = os.path.dirname(os.getcwd())

# Build the parquet paths component by component so the separator matches the
# host OS (a literal backslash is not a path separator on Linux/macOS/Colab).
train_file = os.path.join(base_dir, 'data', 'train.parquet')
test_file = os.path.join(base_dir, 'data', 'test.parquet')

In [6]:
# Each file is read once with both text columns, and rows missing EITHER column
# are dropped together. Dropping NaNs per column independently could remove
# different rows from each frame and silently shift claim i away from passage i.
#
# NOTE(review): the *_test frames come from train_file and the *_train frames
# from test_file. The mapping is kept unchanged because the pooled lists built
# from these frames depend on this order — confirm whether the names are
# simply swapped.
_pair_columns = ['text', 'text_b']

_pairs_from_train_file = pd.read_parquet(train_file, columns=_pair_columns).dropna(subset=_pair_columns)
claims_test = _pairs_from_train_file[['text']]
rel_passage_test = _pairs_from_train_file[['text_b']]

_pairs_from_test_file = pd.read_parquet(test_file, columns=_pair_columns).dropna(subset=_pair_columns)
claims_train = _pairs_from_test_file[['text']]
rel_passage_train = _pairs_from_test_file[['text_b']]

In [7]:
# Pool the claim texts of both frames into one flat list (the *_test frame first)
claims_text = claims_test['text'].tolist()
claims_text.extend(claims_train['text'].tolist())

# Same pooling, in the same order, for the passage texts
rel_passage_text = rel_passage_test['text_b'].tolist()
rel_passage_text.extend(rel_passage_train['text_b'].tolist())

# Report the size of each pooled list
for pooled in (claims_text, rel_passage_text):
    print(len(pooled))

3800
3800


In [8]:
# How many rows to take from the head of the pooled lists
n_samples = 50

# Collects one {"claim", "rel_passage", "label"} record per example
pair_data = []

# Restrict both lists to the same leading n_samples rows
claims, rel_passages = claims_text[:n_samples], rel_passage_text[:n_samples]

def create_non_relevants(index, n=2, total=None):
    """Draw ``n`` distinct passage indices, none of which equals ``index``.

    Args:
        index: Position of the claim's own passage; it is never returned.
        n: Number of non-relevant indices to draw.
        total: Size of the index pool ``range(total)``. When omitted, the
            length of the notebook-level ``rel_passages`` list is used.

    Returns:
        list[int]: ``n`` distinct indices drawn with NumPy's global RNG.

    Raises:
        ValueError: If fewer than ``n`` indices other than ``index`` exist.
    """
    if total is None:
        total = len(rel_passages)

    candidates = [i for i in range(total) if i != index]
    if n > len(candidates):
        raise ValueError(
            f"Cannot draw {n} distinct non-relevant indices from {len(candidates)} candidates"
        )

    # replace=False: np.random.choice samples WITH replacement by default, which
    # could hand the same passage to one claim twice as a negative example.
    return np.random.choice(candidates, size=n, replace=False).tolist()

# Seed NumPy's global RNG so that the negative sampling and the shuffle below
# give the same train/test files on every run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


def _write_jsonl(path, records):
    """Write ``records`` to ``path`` as JSON lines (one JSON object per line)."""
    with open(path, "w") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)


# For each claim: one positive pair with its own passage (label 1), plus one
# negative pair (label 0) per index returned by create_non_relevants
# (2 per claim with its default arguments).
for index, claim in enumerate(claims):
    pair_data.append({"claim": claim, "rel_passage": rel_passages[index], "label": 1})
    for non_relevant_index in create_non_relevants(index):
        pair_data.append({"claim": claim, "rel_passage": rel_passages[non_relevant_index], "label": 0})

# Shuffle in place, then split: the first (1 - test_size) share is train, the rest is test
test_size = 0.2
np.random.shuffle(pair_data)
train_size = int(len(pair_data) * (1 - test_size))
train_data = pair_data[:train_size]
test_data = pair_data[train_size:]

# Save both splits as JSON lines
_write_jsonl("../data/train.json", train_data)
_write_jsonl("../data/test.json", test_data)

print('Train size: ', len(train_data))
print('Test size: ', len(test_data))

Train size:  120
Test size:  30


In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer of the roberta-base checkpoint
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Encode two sentences together as a single paired input, returned as PyTorch tensors
encoded = tokenizer(
    text="Hello, my dog is cute",
    text_pair="Hello, my cat is amazing",
    return_tensors="pt",
)

# Turn the ids of the first (only) row back into text to show how the pair is laid out
decoded = tokenizer.decode(encoded["input_ids"][0])
print(decoded)


<s>Hello, my dog is cute</s></s>Hello, my cat is amazing</s>


In [10]:
# MLflow user name, taken from the project settings module.
# NOTE(review): `user` is not referenced again in the visible part of this notebook.
user = MLFLOW_TRACKING_USERNAME
# Name of the MLflow experiment that the training run is logged under.
experiment_name = 'Herman_PatentMatchBaseline'

def timestamp():
    """Return the current local time as a ``YYYY_MM_DD_HHMM_SS`` string."""
    now = datetime.now()
    return f"{now:%Y_%m_%d_%H%M_%S}"

In [11]:
# Resolve the experiment to log under and make sure it is active:
#   * it never existed  -> create it;
#   * it was deleted    -> restore it. A deleted experiment still holds its
#     name, so creating a new experiment with the same name is rejected.
# NOTE(review): no mlflow.set_tracking_uri(...) call is visible in this
# notebook; presumably the settings module or environment variables point
# MLflow at the tracking server — confirm.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    mlflow.create_experiment(experiment_name)
    experiment = mlflow.get_experiment_by_name(experiment_name)
elif experiment.lifecycle_stage == 'deleted':
    mlflow.tracking.MlflowClient().restore_experiment(experiment.experiment_id)
    experiment = mlflow.get_experiment_by_name(experiment_name)

In [12]:
import datasets
def preprocess_function(batch):
    """Tokenize a batch of (claim, passage) pairs with the notebook-level tokenizer.

    Args:
        batch: Mapping with a "claim" entry and a "rel_passage" entry; the two
            are passed to the tokenizer as first and second sequence.

    Returns:
        The tokenizer output for the pairs, with truncation enabled and
        padding set to "max_length".
    """
    first_sequences = batch["claim"]
    second_sequences = batch["rel_passage"]
    return tokenizer(
        first_sequences,
        second_sequences,
        truncation=True,
        padding="max_length",
    )

# Load the two JSON-lines files as the "train" and "test" splits of one dataset
dataset = datasets.load_dataset(
    "json",
    data_files=dict(train="../data/train.json", test="../data/test.json"),
)

# Tokenize every split batch-wise; the tokenizer's outputs become extra columns
tokenized_data = dataset.map(preprocess_function, batched=True)
print(tokenized_data)

Generating train split: 120 examples [00:00, 8572.05 examples/s]
Generating test split: 30 examples [00:00, 3337.91 examples/s]
Map: 100%|██████████| 120/120 [00:00<00:00, 1677.39 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 868.02 examples/s]

DatasetDict({
    train: Dataset({
        features: ['claim', 'rel_passage', 'label', 'input_ids', 'attention_mask'],
        num_rows: 120
    })
    test: Dataset({
        features: ['claim', 'rel_passage', 'label', 'input_ids', 'attention_mask'],
        num_rows: 30
    })
})





In [13]:
import mlflow
import mlflow.sklearn
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from datasets import load_metric
import os

def compute_metrics(eval_pred):
    """Compute the binary F1 score (positive class = label 1) of an evaluation pass.

    The score is computed locally with NumPy rather than through
    ``datasets.load_metric("f1")``: that loader is deprecated in favour of the
    separate ``evaluate`` library, and calling it inside this function
    re-loaded the metric on every evaluation.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            example is the argmax of ``logits`` over the last axis.

    Returns:
        dict: ``{"f1": score}`` with ``score = 2*TP / (2*TP + FP + FN)``, or
        ``0.0`` when that denominator is zero.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    predicted_pos = predictions == 1
    actual_pos = labels == 1

    true_pos = int(np.sum(predicted_pos & actual_pos))
    false_pos = int(np.sum(predicted_pos & ~actual_pos))
    false_neg = int(np.sum(~predicted_pos & actual_pos))

    denominator = 2 * true_pos + false_pos + false_neg
    f1 = (2 * true_pos / denominator) if denominator else 0.0
    return {"f1": float(f1)}

# Single source of truth for the run configuration: the same values are logged
# to MLflow and handed to TrainingArguments, so the two can never drift apart.
# NOTE(review): warmup_steps=500 is far more than the 24 optimizer steps shown
# in this notebook's training output, where the logged learning rate is still
# 1e-06 — the schedule never leaves warm-up. Confirm this is intended.
model_name = "roberta-base"
training_hyperparams = {
    "num_train_epochs": 3,               # total # of training epochs
    "per_device_train_batch_size": 16,   # batch size per device during training
    "per_device_eval_batch_size": 64,    # batch size for evaluation
    "warmup_steps": 500,                 # number of warmup steps for learning rate scheduler
    "weight_decay": 0.01,                # strength of weight decay
    "logging_steps": 10,
}

# Track the session in MLflow; the `with` block ends the run when it exits,
# including when training is interrupted or raises.
with mlflow.start_run(experiment_id=experiment.experiment_id):

    # Log configuration parameters
    mlflow.log_params({"model_name": model_name, **training_hyperparams})

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    training_args = TrainingArguments(
        output_dir="./results",          # output directory
        logging_dir='./logs',            # directory for storing logs
        **training_hyperparams,
    )

    trainer = Trainer(
        model=model,                             # the instantiated Transformers model to be trained
        args=training_args,                      # training arguments, defined above
        train_dataset=tokenized_data["train"],   # training dataset
        eval_dataset=tokenized_data["test"],     # evaluation dataset
        compute_metrics=compute_metrics,         # the callback that computes metrics of interest
    )

    # Train the model
    trainer.train()

    # Evaluate the model and log every returned metric
    eval_results = trainer.evaluate()
    mlflow.log_metrics(eval_results)

    # Generate the confusion matrix on the test split
    predictions, labels, _ = trainer.predict(tokenized_data["test"])
    preds = np.argmax(predictions, axis=-1)
    # Fix the label set so the matrix is always 2x2, even when one class is
    # absent from both the labels and the predictions.
    cm = confusion_matrix(labels, preds, labels=[0, 1])

    # Save the confusion matrix as CSV and log it as an artifact
    cm_filename = "./confusion_matrix.csv"
    cm_df = pd.DataFrame(cm)
    cm_df.to_csv(cm_filename, index=False)
    mlflow.log_artifact(cm_filename)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 42%|████▏     | 10/24 [10:52<14:40, 62.92s/it]

{'loss': 0.6996, 'grad_norm': 4.645198822021484, 'learning_rate': 1.0000000000000002e-06, 'epoch': 1.25}


 54%|█████▍    | 13/24 [14:26<12:36, 68.80s/it]