In [1]:
# load the sentence-bert model from the HuggingFace model hub
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

tokenizer = AutoTokenizer.from_pretrained('deepset/sentence_bert')
model = AutoModel.from_pretrained('deepset/sentence_bert')

sentence = 'Who are you voting for in 2020?'
labels = ['business', 'art & culture', 'politics']

# run inputs through model and mean-pool over the sequence
# dimension to get sequence-level representations
inputs = tokenizer.batch_encode_plus([sentence] + labels,
                                     return_tensors='pt',
                                     pad_to_max_length=True)
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
output = model(input_ids, attention_mask=attention_mask)[0]
sentence_rep = output[:1].mean(dim=1)
label_reps = output[1:].mean(dim=1)

# now find the labels with the highest cosine similarities to
# the sentence
similarities = F.cosine_similarity(sentence_rep, label_reps)
closest = similarities.argsort(descending=True)




In [9]:
print(similarities)
# print(sentence_rep, label_reps)
for i,j in zip(labels, similarities):
    print(i,float(j))

tensor([ 0.0045, -0.0274,  0.2156], grad_fn=<SumBackward1>)
business 0.004524152725934982
art & culture -0.02739686146378517
politics 0.2156151533126831


Bad pipe message: %s [b"\xbc\xc8q'\xa9\xa1.\x89\xd14\x9dg\x8f\xa7\xd1hi\x91\x00\x00\xa6\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0s\xc0w\x00\xc4\x00"]
Bad pipe message: %s [b'Z\xf6l#\x0eA\xcd)\xfc\xd1\xe4\xa6\xd8a\xb9[\x0b\x80 \xfb\xbd\x7f\x0b\x8dPN\xbe\xb6\xb7Z[#\x84\xdb\xda(\x8d\xcf\xb3\xc8,\xa2\xc3n\x80!\x91\xa0\x10\xd3\x84\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 C[\xbe\x07E\x84\xfa\xa9f\x96L\x0f\xd2']
Bad pipe message: %s [b":; ;//

In [6]:
print(similarities)
for i in 

tensor([ 0.0045, -0.0274,  0.2156], grad_fn=<SumBackward1>)


In [None]:
for ind in closest:
    print(f'label: {labels[ind]} \t similarity: {similarities[ind]}')

print('Script 1 Ran')

label: politics 	 similarity: 0.2156151533126831
label: business 	 similarity: 0.004524152725934982
label: art & culture 	 similarity: -0.02739686146378517
Script 1 Ran


In [9]:
res

NameError: name 'res' is not defined

In [None]:
# load model pretrained on MNLI
from transformers import BartForSequenceClassification, BartTokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-mnli')
model = BartForSequenceClassification.from_pretrained('facebook/bart-large-mnli')

# pose sequence as a NLI premise and label (politics) as a hypothesis
premise = 'Who are you voting for in 2020?'
hypothesis = 'This text is about politics.'

# run through model pre-trained on MNLI
input_ids = tokenizer.encode(premise, hypothesis, return_tensors='pt')
logits = model(input_ids)[0]

# we throw away "neutral" (dim 1) and take the probability of
# "entailment" (2) as the probability of the label being true 
entail_contradiction_logits = logits[:,[0,2]]
probs = entail_contradiction_logits.softmax(dim=1)
true_prob = probs[:,1].item() * 100
print(f'Probability that the label is true: {true_prob:0.2f}%')
print('Script 2 Ran!')

In [None]:
# Testing
test = model(input_ids)

In [None]:
test_metric = logits[:,[0,2]]
test_metric

tensor([[-2.5443,  1.3904]], grad_fn=<IndexBackward0>)

In [None]:
# print the resulting probabilities (only comparing positive and negative)
print(logits[:,[0,2]].softmax(dim=1))

# print the resulting probabilities (including the neutral option)
print(logits.softmax(dim=1))

tensor([[0.0192, 0.9808]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0121, 0.3699, 0.6181]], grad_fn=<SoftmaxBackward0>)


In [None]:
# USING MODEL RAVI FOUND- Worked!
from transformers import BartForSequenceClassification, BartTokenizer
tokenizer = BartTokenizer.from_pretrained('joeddav/bart-large-mnli-yahoo-answers')
model = BartForSequenceClassification.from_pretrained('joeddav/bart-large-mnli-yahoo-answers')

# pose sequence as a NLI premise and label (politics) as a hypothesis
premise = 'Who are you voting for in 2020?'
hypothesis = 'This text is about politics.'

# run through model pre-trained on MNLI
input_ids = tokenizer.encode(premise, hypothesis, return_tensors='pt')
logits = model(input_ids)[0]

# we throw away "neutral" (dim 1) and take the probability of
# "entailment" (2) as the probability of the label being true 
entail_contradiction_logits = logits[:,[0,2]]
probs = entail_contradiction_logits.softmax(dim=1)
true_prob = probs[:,1].item() * 100
print(f'Probability that the label is true: {true_prob:0.2f}%')
print('Script 3 Ran!')

Probability that the label is true: 98.46%
Script 3 Ran!


# Bring it all together

In [1]:
from src.make_models.topic_tagger import Inference
import logging

  from .autonotebook import tqdm as notebook_tqdm
2024-06-04 12:28:33.618307: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
logger = logging.getLogger(__name__)
logging.basicConfig(filename="/home/azureuser/cloudfiles/code/Users/Michael.Sowter/Deep_Learning_Training/inference.log", level=logging.INFO, filemode="w")
tci = Inference(topic="", logger=logger, pdf_filepath="/home/azureuser/cloudfiles/code/Users/Michael.Sowter/Deep_Learning_Training/Text Classifier/Input_Data/Overview.pdf", out_filepath="/home/azureuser/cloudfiles/code/Users/Michael.Sowter/Deep_Learning_Training/Text Classifier/Output_Data/test.json", embeddings_model_name="avsolatorio/GIST-small-Embedding-v0", tuned_model_name="")
parsed_file = tci.parse_file()
chunks = tci.pdf_splitter(parsed_file)

2024-06-04 12:28:43,990 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar to /tmp/tika-server.jar.
2024-06-04 12:28:44,782 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar.md5 to /tmp/tika-server.jar.md5.
2024-06-04 12:28:45,188 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


In [3]:
def get_probabs(premise, hypothesis, tokenizer, model):

    # run through model pre-trained on MNLI
    input_ids = tokenizer.encode(premise, hypothesis, return_tensors='pt', truncation=True)
    logits = model(input_ids)[0]
    entail_contradiction_logits = logits[:,[0,2]]
    probs = entail_contradiction_logits.softmax(dim=1)
    true_prob = probs[:,1].item()
    return true_prob


In [6]:
from transformers import BartForSequenceClassification, BartTokenizer
tokenizer = BartTokenizer.from_pretrained('joeddav/bart-large-mnli-yahoo-answers')  # USING MODEL RAVI FOUND- Worked!
model = BartForSequenceClassification.from_pretrained('joeddav/bart-large-mnli-yahoo-answers')  #  USING MODEL RAVI FOUND- Worked!


tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-mnli')
model = BartForSequenceClassification.from_pretrained('facebook/bart-large-mnli')

In [8]:
Topics = ["approach to the codes", "register of risks", "automated content moderation (user to user)", "governance and accountability", "icjg", "user reporting and complaints (u2u and search)", """service’s risk assessment""", "content moderation (user to user)", "user access to services (u2u)", "enhanced user control (u2u)"]# ["approach to the codes", "automated content moderation (user to user)", "governance and accountability"]


for index, chunk in enumerate(chunks):
    print("\n\n\n")
    premise = chunk.page_content.replace('\n\n', '')
    # premise = chunks[0].page_content.replace('\n\n', '')
    for topic in Topics:
        hypothesis = f'This text is about {topic}.'

        true_prob = get_probabs(premise, hypothesis, tokenizer, model)

        print(f'Probability that topic "{topic}" is true in datapoint "{index}": {true_prob:0.2f}')

print('Script 3 Ran!')

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.








Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Probability that topic "approach to the codes" is true in datapoint "0": 0.80


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Probability that topic "register of risks" is true in datapoint "0": 0.69


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Probability that topic "automated content moderation (user to user)" is true in datapoint "0": 0.58


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Probability that topic "governance and accountability" is true in datapoint "0": 0.94


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Probability that topic "icjg" is true in datapoint "0": 0.81


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Probability that topic "user reporting and complaints (u2u and search)" is true in datapoint "0": 0.74


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Probability that topic "service’s risk assessment" is true in datapoint "0": 0.91


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Probability that topic "content moderation (user to user)" is true in datapoint "0": 0.54


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Probability that topic "user access to services (u2u)" is true in datapoint "0": 0.65
Probability that topic "enhanced user control (u2u)" is true in datapoint "0": 0.64




Probability that topic "approach to the codes" is true in datapoint "1": 0.89
Probability that topic "register of risks" is true in datapoint "1": 0.78
Probability that topic "automated content moderation (user to user)" is true in datapoint "1": 0.03
Probability that topic "governance and accountability" is true in datapoint "1": 0.80
Probability that topic "icjg" is true in datapoint "1": 0.44
Probability that topic "user reporting and complaints (u2u and search)" is true in datapoint "1": 0.04
Probability that topic "service’s risk assessment" is true in datapoint "1": 0.71
Probability that topic "content moderation (user to user)" is true in datapoint "1": 0.08
Probability that topic "user access to services (u2u)" is true in datapoint "1": 0.28
Probability that topic "enhanced user control (u2u)" is true in da

In [10]:
# Save to JSON
res = tci.create_or_load_json(chunks)
# res[i][self.topic] = infer_res['score']
res

{'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nOverview\n\n\n \n\n \n\n \n \n\nOverview \nThis document is the first of four major consulta�ons that Ofcom, as appointed regulator of the \nnew Online Safety Act (‘the Act’), will publish as part of our work to establish the new regula�ons \nover the next 18 months. It focuses on our proposals for how internet services which enable the sharing of user generated \ncontent (‘user-to-user’ or ‘U2U’ services) and search services should approach their new du�es \nrela�ng to illegal content. It covers the following areas: the causes and impacts of illegal harms; how \nservices should assess and mi�gate the risks of illegal harms; how services can iden�fy illegal \ncontent; and our approach to enforcement. The proposals in this document reflect research we have conducted over the past three years as well \nas informa�on and evidence gathered through extensive engagement with industry and other \nexperts. Causes and