In [1]:
from typing import List, Tuple, Dict

import os
import sys
sys.path.append("..")

import numpy as np
import pandas as pd

from datasets import Dataset

from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer

from src.misc import paths


# Identifier of the pretrained sentence-transformer checkpoint that
# `SetFitModel.from_pretrained` loads as the starting point.
source_model_name = "sentence-transformers/all-mpnet-base-v2"

# Name of the fine-tuned "denoise" model; later cells use it as the
# sub-path under `paths.model` where the trained model is saved.
target_model_name = source_model_name + "-denoise"


def model_init(params: Dict = None) -> SetFitModel:
    """Build a fresh SetFit model from the pretrained source checkpoint.

    Passed to the trainer as its ``model_init`` factory, so it is expected
    to return a new model on every call.

    Args:
        params: Optional mapping of hyperparameters. Only the ``"max_iter"``
            and ``"solver"`` keys are read; every other key is ignored here.
            ``None`` or an empty mapping selects the defaults.

    Returns:
        The model returned by ``SetFitModel.from_pretrained`` for
        ``source_model_name``.
    """
    overrides = params if params else {}

    # Settings handed over as ``head_params``; presumably consumed by the
    # classification head's constructor -- confirm against the installed
    # setfit version.
    head_params = {
        "max_iter": overrides.get("max_iter", 256),
        "solver": overrides.get("solver", "liblinear"),
    }
    return SetFitModel.from_pretrained(source_model_name, head_params=head_params)


def hp_space(trial) -> Dict:
    """Sample one hyperparameter configuration from the search space.

    Args:
        trial: Object exposing ``suggest_float`` and ``suggest_categorical``
            (an Optuna trial, judging by the search log).

    Returns:
        Dict with the keys ``learning_rate``, ``num_epochs``, ``batch_size``
        and ``num_iterations``, in that order.
    """
    # Log-scale sampling between 1e-6 and 1e-4.
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    # Single-choice categorical: the epoch count is effectively fixed at 1.
    num_epochs = trial.suggest_categorical("num_epochs", [1])
    batch_size = trial.suggest_categorical("batch_size", [16, 32])
    num_iterations = trial.suggest_categorical("num_iterations", [16, 32])

    return dict(
        learning_rate=learning_rate,
        num_epochs=num_epochs,
        batch_size=batch_size,
        num_iterations=num_iterations,
    )

In [2]:
# Location of the validated denoise annotations under the project data dir.
denoise_csv = paths.data / "denoise" / "denoise-validated.csv"

# Load the annotations and preview the first rows.
dataset_pd = pd.read_csv(denoise_csv)
dataset_pd.head()

Unnamed: 0,tony_not_noise,anne_not_noise,text,label,label_name
0,,,So the overall investment amounted to some RUB...,0,keep
1,,,We remain well positioned at the end of second...,0,keep
2,,,I hope this slide makes it clear that Coeur's ...,0,keep
3,,,"And then, obviously, on the Outotec businesses...",0,keep
4,,,The budget also includes $18.3 million for San...,0,keep


In [3]:
# Wrap the DataFrame in a `datasets.Dataset`, then carve out a shuffled
# 80/20 train/test split with a fixed seed so the split is repeatable.
full_dataset = Dataset.from_pandas(dataset_pd)
dataset_ds = full_dataset.train_test_split(test_size=0.2, seed=42, shuffle=True)

In [4]:
# Splits produced by the train/test split cell.
train_split = dataset_ds["train"]
eval_split = dataset_ds["test"]

# A model factory (`model_init`) is passed instead of a model instance, so
# the trainer can build a fresh model for each search trial.
trainer = SetFitTrainer(
    train_dataset=train_split,
    eval_dataset=eval_split,
    model_init=model_init,
    loss_class=CosineSimilarityLoss,
    metric="f1",
    seed=42,
)

# Run 10 trials over `hp_space`, maximising the evaluation metric (F1).
best_run = trainer.hyperparameter_search(hp_space, n_trials=10, direction="maximize")

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
[32m[I 2023-04-04 01:59:21,558][0m A new study created in memory with name: no-name-379d06b4-18b0-45e4-a29d-3aae79f2e8ef[0m
Trial: {'learning_rate': 8.500393185376232e-06, 'num_epochs': 1, 'batch_size': 32, 'num_iterations': 32}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 13056
  Num epochs = 1
  Total optimization steps = 408
  Total train batch size = 32


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/408 [00:00<?, ?it/s]

***** Running evaluation *****
[32m[I 2023-04-04 02:07:17,835][0m Trial 0 finished with value: 0.7647058823529411 and parameters: {'learning_rate': 8.500393185376232e-06, 'num_epochs': 1, 'batch_size': 32, 'num_iterations': 32}. Best is trial 0 with value: 0.7647058823529411.[0m
Trial: {'learning_rate': 1.8736779609851885e-06, 'num_epochs': 1, 'batch_size': 16, 'num_iterations': 32}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 13056
  Num epochs = 1
  Total optimization steps = 816
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/816 [00:00<?, ?it/s]

***** Running evaluation *****
[32m[I 2023-04-04 02:15:43,906][0m Trial 1 finished with value: 0.6875000000000001 and parameters: {'learning_rate': 1.8736779609851885e-06, 'num_epochs': 1, 'batch_size': 16, 'num_iterations': 32}. Best is trial 0 with value: 0.7647058823529411.[0m
Trial: {'learning_rate': 1.077715639077575e-05, 'num_epochs': 1, 'batch_size': 16, 'num_iterations': 16}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 6528
  Num epochs = 1
  Total optimization steps = 408
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/408 [00:00<?, ?it/s]

***** Running evaluation *****
[32m[I 2023-04-04 02:19:54,542][0m Trial 2 finished with value: 0.7272727272727272 and parameters: {'learning_rate': 1.077715639077575e-05, 'num_epochs': 1, 'batch_size': 16, 'num_iterations': 16}. Best is trial 0 with value: 0.7647058823529411.[0m
Trial: {'learning_rate': 3.338073147619401e-05, 'num_epochs': 1, 'batch_size': 32, 'num_iterations': 32}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 13056
  Num epochs = 1
  Total optimization steps = 408
  Total train batch size = 32


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/408 [00:00<?, ?it/s]

***** Running evaluation *****
[32m[I 2023-04-04 02:27:35,462][0m Trial 3 finished with value: 0.7272727272727272 and parameters: {'learning_rate': 3.338073147619401e-05, 'num_epochs': 1, 'batch_size': 32, 'num_iterations': 32}. Best is trial 0 with value: 0.7647058823529411.[0m
Trial: {'learning_rate': 1.664855845283107e-05, 'num_epochs': 1, 'batch_size': 16, 'num_iterations': 16}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 6528
  Num epochs = 1
  Total optimization steps = 408
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/408 [00:00<?, ?it/s]

***** Running evaluation *****
[32m[I 2023-04-04 02:31:44,590][0m Trial 4 finished with value: 0.7272727272727272 and parameters: {'learning_rate': 1.664855845283107e-05, 'num_epochs': 1, 'batch_size': 16, 'num_iterations': 16}. Best is trial 0 with value: 0.7647058823529411.[0m
Trial: {'learning_rate': 1.0984717119181637e-05, 'num_epochs': 1, 'batch_size': 16, 'num_iterations': 16}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 6528
  Num epochs = 1
  Total optimization steps = 408
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/408 [00:00<?, ?it/s]

***** Running evaluation *****
[32m[I 2023-04-04 02:34:46,509][0m Trial 5 finished with value: 0.7272727272727272 and parameters: {'learning_rate': 1.0984717119181637e-05, 'num_epochs': 1, 'batch_size': 16, 'num_iterations': 16}. Best is trial 0 with value: 0.7647058823529411.[0m
Trial: {'learning_rate': 1.0658404027496414e-05, 'num_epochs': 1, 'batch_size': 16, 'num_iterations': 16}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 6528
  Num epochs = 1
  Total optimization steps = 408
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/408 [00:00<?, ?it/s]

***** Running evaluation *****
[32m[I 2023-04-04 02:37:40,302][0m Trial 6 finished with value: 0.7272727272727272 and parameters: {'learning_rate': 1.0658404027496414e-05, 'num_epochs': 1, 'batch_size': 16, 'num_iterations': 16}. Best is trial 0 with value: 0.7647058823529411.[0m
Trial: {'learning_rate': 9.783528150044016e-05, 'num_epochs': 1, 'batch_size': 32, 'num_iterations': 32}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 13056
  Num epochs = 1
  Total optimization steps = 408
  Total train batch size = 32


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/408 [00:00<?, ?it/s]

***** Running evaluation *****
[32m[I 2023-04-04 02:43:05,450][0m Trial 7 finished with value: 0.7878787878787877 and parameters: {'learning_rate': 9.783528150044016e-05, 'num_epochs': 1, 'batch_size': 32, 'num_iterations': 32}. Best is trial 7 with value: 0.7878787878787877.[0m
Trial: {'learning_rate': 1.543319881556778e-05, 'num_epochs': 1, 'batch_size': 32, 'num_iterations': 16}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 6528
  Num epochs = 1
  Total optimization steps = 204
  Total train batch size = 32


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

***** Running evaluation *****
[32m[I 2023-04-04 02:45:50,073][0m Trial 8 finished with value: 0.7272727272727272 and parameters: {'learning_rate': 1.543319881556778e-05, 'num_epochs': 1, 'batch_size': 32, 'num_iterations': 16}. Best is trial 7 with value: 0.7878787878787877.[0m
Trial: {'learning_rate': 2.8639426399654712e-05, 'num_epochs': 1, 'batch_size': 32, 'num_iterations': 32}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 13056
  Num epochs = 1
  Total optimization steps = 408
  Total train batch size = 32


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/408 [00:00<?, ?it/s]

***** Running evaluation *****
[32m[I 2023-04-04 02:51:15,251][0m Trial 9 finished with value: 0.7647058823529411 and parameters: {'learning_rate': 2.8639426399654712e-05, 'num_epochs': 1, 'batch_size': 32, 'num_iterations': 32}. Best is trial 7 with value: 0.7878787878787877.[0m


In [5]:
# Re-configure the trainer with the winning hyperparameters from the search
# and train the final model with them.
best_hyperparameters = best_run.hyperparameters
trainer.apply_hyperparameters(best_hyperparameters, final_model=True)
trainer.train()

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 13056
  Num epochs = 1
  Total optimization steps = 408
  Total train batch size = 32


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/408 [00:00<?, ?it/s]

In [6]:
# Plain-string location of the fine-tuned model under the project model dir
# (assumes `paths.model` is path-like -- it is combined with `/`).
target_model_dir = os.fspath(paths.model / target_model_name)

# Persist the trained model, then load it back from disk so the following
# cells run against the saved artefact.
trainer.model.save_pretrained(target_model_dir)
model = SetFitModel.from_pretrained(target_model_dir)

In [7]:
# Hand-picked sample sentences (they read like earnings-call transcript
# lines) used to eyeball the classifier's predictions. A few sentences
# appear twice; the repeats are kept so the printed output stays the same.
texts = [
    "Thank you, Andrew, and good afternoon to everyone.",
    "Following the remarks made by management, we'll open the call for Q&A.",
    "Welcome to our third quarter 2015 earnings call.",
    "It is available during this call on our website under the SEC Filings tab.",
    "We will now begin the question-and-answer session.",
    "With us today are our Chairman and CEO, Greg Boyce; Executive Vice President and Chief Financial Officer, Mike Crews; and President and Chief Commercial Officer, Rick Navarre.We do have some forward-looking statements, and they should be considered along with the risk factors that we note at the end of our release, as well as the MD&A sections of our filed documents.",
    "And that's what's really happening.",
    "And as a Phil mentioned, I started my career with Hecla Mining Company over 40 years ago, and I'm pleased to be ending it here with the company reporting record financial results.Slide 6 sets forth a few key production and financial metrics.",
    "So it is an efficient use of capital.Moving on to San Sebastian.",
    "John, let me interrupt you.",
    "Look forward to hearing from you.",
    "Okay, well, I think that -- I'm sorry, go ahead operator.",
    "We're available if you have any further questions, call Mike or call me.",
    "Thanks very much for joining us today.",
    "And they should be considered along with the risk factors that we note at the end of our release, as well as the MD&A sections of our filed documents.",
    "Following the remarks made by management, we'll open the call for Q&A.",
    "Welcome to our third quarter 2015 earnings call.",
    "It is available during this call on our website under the SEC Filings tab.",
    "We will now begin the question-and-answer session.",
    "And that has been a great thing for us to see.",
    "I hope you and your families are doing well, both physically and mentally.",
    "So while there's a lot of discussion about price sensitivity, they need the coal.",
    "With the silver price up about $3 from the first quarter to the second, second quarter is on a path of significant growth in our cash flow, but I'm getting ahead of myself.To have these results, we've maintained investment in all of our properties.",
    "At this point, the focus is on extending what we're doing now, which is using the leased mill and contract miner.",
    "We've got the excavation done.",
    "We still have another 6, 8 months of work that we have to do to equip it.",
    "We're now over 7 months into this COVID-19 pandemic, and I'm happy to say the company is in good shape, and we're excited about the future of our industry.",
    "In fact, I would say that over the course of this year, our belief in a bright future for our industry has strengthened.",
    "That's why we remain a pure-play supplier of the uranium fuel needed to produce clean, carbon-free baseload electricity.",
    "We also remain very bullish on the uranium market.",
    "Well, first, around the globe, we're seeing an increasing focus on electrification for various reasons.",
    "There are those that are installing baseload power.",
    "Then there are those who are looking for a reliable replacement to fossil fuel sources.",
    "And finally, there's new demand for things like the electrification of transportation.",
    "This is occurring precisely at the same time countries around the world are focused on decarbonization.",
    "And that has led to the recognition from a policy point of view that nuclear will be needed in the toolbox to sustainably achieve both electrification and decarbonization at the same time.",
    "China, for example, who has a goal to have 25 million electric vehicles on the road by 2030, recently stated that its objective is to become carbon neutral before 2060.",
    "The follow-on study from a climate scientist in that country predicted that to achieve this goal will require an estimated quadrupling of nuclear power capacity in that country.",
    "That would be about 200 reactors for China alone, double that of the U.S. fleet, which is currently the largest in the world.",
    "So demand for nuclear is increasing.",
]

# Classify every sample sentence in one call, then print one
# "<tag>: <sentence>" line per input. A truthy prediction is reported as
# "noise", anything else as "keep" (the data preview shows label 0 named
# "keep").
predictions = model(texts)
for sentence, prediction in zip(texts, predictions):
    tag = "noise" if prediction else "keep"
    print(f"{tag}: {sentence}")

noise: Thank you, Andrew, and good afternoon to everyone.
noise: Following the remarks made by management, we'll open the call for Q&A.
noise: Welcome to our third quarter 2015 earnings call.
noise: It is available during this call on our website under the SEC Filings tab.
noise: We will now begin the question-and-answer session.
keep: With us today are our Chairman and CEO, Greg Boyce; Executive Vice President and Chief Financial Officer, Mike Crews; and President and Chief Commercial Officer, Rick Navarre.We do have some forward-looking statements, and they should be considered along with the risk factors that we note at the end of our release, as well as the MD&A sections of our filed documents.
noise: And that's what's really happening.
keep: And as a Phil mentioned, I started my career with Hecla Mining Company over 40 years ago, and I'm pleased to be ending it here with the company reporting record financial results.Slide 6 sets forth a few key production and financial metrics.
k