In [1]:
import pandas as pd
from datasets import load_dataset
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_path = '/home/philko/Downloads/news.csv'

In [3]:
dataset = load_dataset('csv', data_files=data_path)
dataset = dataset['train'].map(lambda e: {'text': str(e['Description']) + '\n\n' + str(e['Keywords']) + '\n\n' + str(e['Body'])}, remove_columns=['ID', 'Title', 'Description', 'Keywords', 'Body', 'Theme', 'Link'])

In [4]:
dataset['text']

['An eastern Pennsylvania candy factory didn’t evacuate its employees – even as some said they smelled gas – before an explosion in March that left seven people dead and nearly a dozen others injured, the US Department of Labor announced.\n\naccident investigations, accidents, accidents, disasters and safety, brand safety-nsf accidents and disasters, brand safety-nsf sensitive, continents and regions, explosions, gas leaks, government organizations - us, iab-disasters, investigations, labor and employment, national transportation safety board, north america, northeastern united states, occupational safety and health administration, pennsylvania, safety issues and practices, the americas, united states, us federal departments and agencies, us government independent agencies, workplace accidents, workplace health and safety\n\nAn eastern Pennsylvania candy factory didn’t evacuate its employees – even as some said they smelled gas – before an explosion in March that left seven people dead

In [5]:
embeddings = HuggingFaceEmbeddings(model_name="WhereIsAI/UAE-Large-V1")

No sentence-transformers model found with name /home/philko/.cache/torch/sentence_transformers/WhereIsAI_UAE-Large-V1. Creating a new one with MEAN pooling.


In [6]:
vectorstore = FAISS.from_texts(dataset['text'][0:5], embeddings)
vectorstore.save_local("faiss_doc_idx")

In [7]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

In [8]:
retrieved_docs = retriever.get_relevant_documents(
    "What happend in the Bronx?"
)

In [9]:
dataset["text"][0]

'An eastern Pennsylvania candy factory didn’t evacuate its employees – even as some said they smelled gas – before an explosion in March that left seven people dead and nearly a dozen others injured, the US Department of Labor announced.\n\naccident investigations, accidents, accidents, disasters and safety, brand safety-nsf accidents and disasters, brand safety-nsf sensitive, continents and regions, explosions, gas leaks, government organizations - us, iab-disasters, investigations, labor and employment, national transportation safety board, north america, northeastern united states, occupational safety and health administration, pennsylvania, safety issues and practices, the americas, united states, us federal departments and agencies, us government independent agencies, workplace accidents, workplace health and safety\n\nAn eastern Pennsylvania candy factory didn’t evacuate its employees – even as some said they smelled gas – before an explosion in March that left seven people dead 

In [10]:
retrieved_docs[0].page_content 

'A Bronx day care provider, her husband and his cousin have been indicted on murder and other charges on suspicion of exposing four children – including a 1-year-old who died – to fentanyl at the day care center last month, the Bronx district attorney’s office said Thursday.\n\nbrand safety-nsf crime, brand safety-nsf death, brand safety-nsf sensitive, brand safety-nsf violence, bronx, business and industry sectors, business, economy and trade, child care, children, continents and regions, crime, law enforcement and corrections, crimes against persons, criminal law, criminal offenses, death and dying, demographic groups, domestic alerts, domestic-health and science, domestic-us news, families and children, family members and relatives, homicide, iab-bereavement, iab-business and finance, iab-crime, iab-daycare and pre-school, iab-family and relationships, iab-industries, iab-law, iab-medical health, iab-parenting, iab-pharmaceutical drugs, iab-pharmaceutical industry, indictments, inte

In [11]:
# from transformers import T5Tokenizer, T5ForConditionalGeneration

# # tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
# model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

tokenizer_config.json: 100%|██████████| 2.54k/2.54k [00:00<00:00, 2.60MB/s]


spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 1.53MB/s]
special_tokens_map.json: 100%|██████████| 2.20k/2.20k [00:00<00:00, 2.52MB/s]
tokenizer.json: 100%|██████████| 2.42M/2.42M [00:00<00:00, 4.79MB/s]
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
config.json: 100%|██████████| 1.40k/1.40k [00:00<00:00, 131kB/s]
model.safetensors:  25%|██▌       | 252M/990M [01:43<05:04, 2.42MB/s] 


KeyboardInterrupt: 

In [None]:
input_text = "translate English to German: How old are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))



Wie old sind Sie?


In [None]:
question = "What happend in the Bronx?"
context = retrieved_docs[0].page_content 
query = f"Question: {question}. Context {context}. Answer of the question, given the context: "

In [None]:
query

'Question: What happend in the Bronx?. Context A Bronx day care provider, her husband and his cousin have been indicted on murder and other charges on suspicion of exposing four children – including a 1-year-old who died – to fentanyl at the day care center last month, the Bronx district attorney’s office said Thursday.\n\nbrand safety-nsf crime, brand safety-nsf death, brand safety-nsf sensitive, brand safety-nsf violence, bronx, business and industry sectors, business, economy and trade, child care, children, continents and regions, crime, law enforcement and corrections, crimes against persons, criminal law, criminal offenses, death and dying, demographic groups, domestic alerts, domestic-health and science, domestic-us news, families and children, family members and relatives, homicide, iab-bereavement, iab-business and finance, iab-crime, iab-daycare and pre-school, iab-family and relationships, iab-industries, iab-law, iab-medical health, iab-parenting, iab-pharmaceutical drugs, 

In [None]:
outputs = model.generate(tokenizer(query, return_tensors="pt").input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Token indices sequence length is longer than the specified maximum sequence length for this model (1334 > 512). Running this sequence through the model will result in indexing errors


A Bronx day care provider, her husband and his cousin have been indicted on


In [1]:
model_name = "michaelfeil/ct2fast-Llama-2-7b-hf"


from hf_hub_ctranslate2 import GeneratorCT2fromHfHub
model = GeneratorCT2fromHfHub(
        # load in int8 on CUDA
        model_name_or_path=model_name,
        device="cpu",
        compute_type="int8_float16",
        # tokenizer=AutoTokenizer.from_pretrained("{ORG}/{NAME}")
)
outputs = model.generate(
    text=["def fibonnaci(", "User: How are you doing? Bot:"],
    max_length=64,
    include_prompt_in_result=False
)
print(outputs)

  from .autonotebook import tqdm as notebook_tqdm
model.bin: 100%|██████████| 6.74G/6.74G [45:21<00:00, 2.48MB/s]
