In [1]:
from typing import Tuple, Optional, List

from functools import reduce
from operator import add

from pprint import pprint

import os
from pathlib import Path
# Project directory layout: every folder is resolved from the parent of the
# current working directory.
PROJ_DIR = Path.cwd().parent
DATA_DIR = PROJ_DIR.joinpath('data')
CURR_DIR = PROJ_DIR.joinpath('notebooks')
LOGS_DIR = PROJ_DIR.joinpath('logs')
MDLS_DIR = PROJ_DIR.joinpath('models')

import json
import pandas as pd
from datasets import Dataset

from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer
# Identifier of the pretrained checkpoint passed to SetFitModel.from_pretrained;
# it is also reused to build the directory name the fine-tuned model is saved under.
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"


def load_ext_data():
    """Load the scored Item 1A sentence extracts from disk.

    Reads ``item1a-full-scored-sents.json`` from the ``extracts`` folder under
    ``DATA_DIR`` and returns its parsed content unchanged.

    Returns:
        The deserialized JSON object, with whatever top-level structure the
        file holds.

    Raises:
        FileNotFoundError: If the JSON file does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(DATA_DIR / "extracts" / "item1a-full-scored-sents.json", encoding="utf-8") as f:
        return json.load(f)


def flatten_nested_list(nested_list: List[List]) -> List:
    """Concatenate the inner lists of ``nested_list`` into a single flat list.

    Only one level of nesting is removed; the items keep their original order.
    """
    flat = []
    for inner in nested_list:
        flat.extend(inner)
    return flat

In [2]:
# Read the annotated sentences from CSV and hold out 10% of the rows as the
# test split; the fixed seed keeps the shuffled split reproducible.
ds = (
    Dataset
    .from_csv(str(DATA_DIR / "actions" / 'all-annotated.csv'))
    .train_test_split(test_size=0.1, seed=42, shuffle=True)
)

Using custom data configuration default-5b7331c94183457a
Reusing dataset csv (/home/dogdog/.cache/huggingface/datasets/csv/default-5b7331c94183457a/0.0.0)


In [3]:
# Load the pretrained sentence-transformer body as a SetFit model.
model = SetFitModel.from_pretrained(MODEL_NAME)

# Configure contrastive fine-tuning on the train split, evaluated on the test split.
trainer = SetFitTrainer(
    model=model,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    loss_class=CosineSimilarityLoss,
    metric="accuracy",
    batch_size=16,
    num_iterations=32, # The number of text pairs to generate for contrastive learning
    num_epochs=1,      # The number of epochs to use for contrastive learning
)

# Fine-tune, then score the held-out split; `metrics` is displayed in the next cell.
trainer.train()
metrics = trainer.evaluate()

# Save to disk through the public `save_pretrained` API instead of the private
# `_save_pretrained` hook: it creates the target directory before delegating
# to the hook. MODEL_NAME contains a "/", so the model ends up in a nested
# folder under MDLS_DIR.
trainer.model.save_pretrained(str(MDLS_DIR / f"{MODEL_NAME}-actions"))

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 30144
  Num epochs = 1
  Total optimization steps = 1884
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1884 [00:00<?, ?it/s]

***** Running evaluation *****


In [4]:
# Display the evaluation metrics returned by trainer.evaluate().
metrics

{'accuracy': 0.9245283018867925}

In [5]:
# Reload the fine-tuned model from the directory it was saved to; this rebinds
# `model`, so every later cell predicts with the on-disk copy.
model = SetFitModel.from_pretrained(str(MDLS_DIR / f"{MODEL_NAME}-actions"))

In [6]:
# Parsed JSON extracts; a later cell iterates over them document by document.
ext_data = load_ext_data()
# Example sentences used to spot-check the model's predictions in the next cell.
# The strings are kept verbatim, including their original wording.
test_docs = ["Potential investors should be aware of the difficulties normally encountered by new mineral exploration companies such Oroplata and the high rate of failure of companies such as ours.", 
             "We may also become subject to significant liability for pollution, cave-ins or hazards, which we cannot insure or which we may elect not to insure.",
             "In such a case, we would be unable to complete our business plan and our future shareholders may lose their entire investment.",
             "We believe our operations can provide valuable benefits to surrounding communities, in terms of direct employment, training and skills development and other benefits associated with ongoing payment of taxes.",
             "During the past several years Solitario has conducted an active social engagement program with the community located near the La Promesa project area with the objective of obtaining a community agreement to support exploration activities, including drilling.",
             "From time to time Nexa may enter into surface rights agreements with individual landowners or communities to provide access for exploration work at the Florida Canyon project. Generally, these are short-term agreements.",
             "During 2020 our objectives are to complete an agreement with the local community, to conduct surface exploration, and if warranted, conduct a drilling program.",
             "In addition, we seek to maintain our partnerships and relationships with local communities, including indigenous peoples, and stakeholders in a variety of ways, including in-kind contributions, volunteer time, sponsorships and donations.",
             "We have adopted certain policies and programs, including with respect to responsible production frameworks, climate change, water stewardship, biodiversity, tailings management and stewardship, waste management, safety and health, human capital management, human rights, social performance and community and Indigenous Peoples relations, and supply chain/responsible sourcing.",
             "We have received, and may continue to receive, a high degree of media coverage that is published or otherwise disseminated by third parties, including blogs, articles, online forums, message boards and social and other media.",
             "Newmont has implemented a management system designed to promote continuous improvement in health and safety, environmental performance and community relations.",
             "Our ability to obtain the required permits and approvals to explore for, develop and operate mines and to successfully operate near communities in the jurisdictions in which we operate depends in part on our ability to develop, operate and close mines in a manner that is consistent with the creation of social and economic benefits in the surrounding communities, which may or may not be required by law.",
             "We participated also in a working group with the local and regional authorities and communities to define Company support for their social and community programs.",
             "During 2012, we started negotiations with all the eight unions."]

In [7]:
# Run the model once over all example sentences and print each predicted
# label next to the sentence it belongs to.
for p, t in zip(model(test_docs), test_docs):
    print(f"{p}: {t}")

0: Potential investors should be aware of the difficulties normally encountered by new mineral exploration companies such Oroplata and the high rate of failure of companies such as ours.
0: We may also become subject to significant liability for pollution, cave-ins or hazards, which we cannot insure or which we may elect not to insure.
0: In such a case, we would be unable to complete our business plan and our future shareholders may lose their entire investment.
0: We believe our operations can provide valuable benefits to surrounding communities, in terms of direct employment, training and skills development and other benefits associated with ongoing payment of taxes.
1: During the past several years Solitario has conducted an active social engagement program with the community located near the La Promesa project area with the objective of obtaining a community agreement to support exploration activities, including drilling.
0: From time to time Nexa may enter into surface rights agr

In [8]:
# Inspect the first document only: the loop is left after a single pass.
for document in ext_data:
    sentences = flatten_nested_list(document["item1a_sentences"])
    for p, t in zip(model(sentences), sentences):
        # Skip every sentence that was not predicted as label 1.
        if p != 1:
            continue
        pprint(t)
        print("\n")
    break

('We were incorporated on October 6, 2011, and, to date, we have accumulated a '
 'net loss of $89,334 against no revenue.')


('Thus far, our activities have been primarily limited to organizational '
 'matters, acquiring our mineral claim, obtaining a geology report, '
 'undertaking preliminary exploration work on the Leomary and the preparation '
 'and filing of this registration statement of which this prospectus is a '
 'part.')


'Our Company is in the initial phase of our exploration program on the Leomary.'


('Prior to completion of our exploration activities, we anticipate we will '
 'increases operating expenses on the exploration of the Leomary without '
 'realizing any revenues from the minerals thereon.')


('We have not attempted to locate or negotiate with any suppliers of products, '
 'equipment or materials.')


('We will attempt to locate products, equipment and materials as and when we '
 'are able to raise the requisite capital.')


('We have taken all reasonable s

In [25]:
# Load the sampled sentences and add the SetFit prediction as a boolean column.
df = pd.read_csv(DATA_DIR / "statistics" / 'action-sentences-topic-filtered-sample.csv')
# Hand the model a plain list instead of the Series, so the prediction does not
# depend on the frame having a default 0..n-1 index. A prediction greater than 0
# is stored as True.
df["is_action_setfit"] = model(df["sentence"].tolist()) > 0
df.head()

Unnamed: 0,symbol,filing_time,sentence,is_action,topic_ids,is_active,is_action_setfit
0,cde,2006-03,"On September 12, 2005, SEACC, the Sierra Club ...",False,"[2, 11, 15]",True,True
1,rgld,2012-08,"For example, Argentina recently passed a feder...",False,"[2, 5]",True,False
2,amr,2022-03,Significant economic disruptions can result fr...,False,[13],True,False
3,arch,2014-02,The treating of AMD can be costly.,False,"[2, 15]",True,False
4,bmix,2022-03,Our ability to recruit and assimilate new pers...,True,"[8, 14]",True,False


In [28]:
# Persist the frame, including the added `is_action_setfit` column, without the index.
# NOTE(review): this is the same path the frame was read from, so the source
# CSV is overwritten in place — confirm that is intended.
df.to_csv(DATA_DIR / "statistics" / 'action-sentences-topic-filtered-sample.csv', index=False)

In [26]:
# Keep only the rows whose `is_action_setfit` flag is True.
df_action = df[df["is_action_setfit"]]

In [27]:
# Pretty-print every sentence in `df_action`, followed by blank lines
# as a visual separator.
for s in df_action["sentence"]:
    pprint(s)
    print("\n")

('On September 12, 2005, SEACC, the Sierra Club and Lynn Canal Conservation '
 'filed a lawsuit in Federal District Court in Alaska challenging the permits '
 'issued by the Corps of Engineers and the US Forest Service and on November '
 '8, 2005, the Corps of Engineers filed a Motion for Voluntary Remand with the '
 'court to review the permit issued to the Company under the Clean Water Act '
 '(CWA) Section 404 and requested that the court stay the legal proceeding '
 'filed by SEACC and the other environmental groups pending the outcome of '
 'review.')


('Yanacocha is currently assessing treatment options in connection with the '
 'new water quality standards.')


('Under this consent decree, we have agreed to suspend certain activities '
 'until the permitting process is complete and the State of Utah has agreed to '
 'expedite that process.')


('While we believe our contractors employ safety standards and other '
 'procedures to ensure these projects are completed with proper g