# Demo to play around with langchain summarization


In [24]:
import os
os.environ['TRANSFORMERS_CACHE'] = '/mnt/huggingface/hub'
# os.environ['HUGGINGFACE_HUB_CACHE'] = '/mnt/huggingface/hub/'

import torch

import transformers
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM

# For model
from langchain import LLMChain
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# For summarization
from langchain.schema.document import Document
from langchain.document_loaders import TextLoader
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain, StuffDocumentsChain
from langchain.text_splitter import CharacterTextSplitter

Load model config

In [3]:
model_name='mistralai/Mistral-7B-Instruct-v0.1'

model_config = transformers.AutoConfig.from_pretrained(
    model_name
)

Tokenizer

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Bitsandbytes for quantization config

In [5]:
# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

Load model

In [8]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

Loading checkpoint shards: 100%|██████████| 2/2 [01:57<00:00, 58.93s/it]


In [None]:
inputs_not_chat = tokenizer.encode_plus("""[INST] Generate a linux log file. log file:[/INST]""", return_tensors="pt")['input_ids'].to('cuda')

generated_ids = model.generate(inputs_not_chat, 
                               max_new_tokens=1000, 
                               do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [11]:
print(decoded)

["<s> [INST] Generate a linux log file.                                       \n                                        log file:[/INST]    \n                                         * Thu Feb 19 10:45:22 2022 UTC +0:00\n         sudo reboot\n\n        * Thu Feb 19 10:45:25 2022 UTC +0:00\n         shutdown: Sending signal 'REBOOT' to processes\n         \n        * Thu Feb 19 10:45:25 2022 UTC +0:00\n         shutdown: Pid 753679299 tty ACM0,   user 'root',   state 'S'\n         shutdown: Pid 483571669 tty ACM0,   user 'root',   state 'T'\n         shutdown: Pid 288549923 tty ACM0,   user 'root',   state 'T'\n         shutdown: Pid 193839613 tty ACM0,   user 'root',   state 'T'\n         shutdown: Pid 764834642 tty ACM0,   user 'root',   state 'T'\n         shutdown: Pid 917844023 tty S1,    user 'root',   state 'T'\n         shutdown: Pid 288530150 tty ACM0,   user 'root',   state 'T'\n         shutdown: Pid 329099156 tty S0,    user 'root',   state 'T'\n         shutdown: Pid 495856

Using Langchain

In [12]:
text_generation_pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=300,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [13]:
llm("Hi!")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


' I\'m a newbie to the game and I\'ve been trying to figure out how to get my character to look like this:\n\nhttps://www.youtube.com/watch?v=1qjKdQJz75o&t=20s\n\nI\'ve tried using the "Customize" option but it doesn\'t seem to work. Any help would be appreciated!\nUser 3: You can use the mod called "Skin Editor" to change your skin. It\'s available on Nexus Mods.\nUser 4: I have that mod installed, but it doesn\'t seem to work for me. When I go to customize, there are no options to change the skin.\nUser 3: Try restarting the game and see if it works then. If not, you might need to update the mod.'

Summarization test

In [31]:
# Load documents
loader = TextLoader("./logs/test_log1.out")
docs = loader.load()

# Define prompt template and chains for mapping and reducing
map_template = """The following is a set of documents
{docs}
Based on this list of docs, please identify the main themes 
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

reduce_template = """The following is set of summaries:
{doc_summaries}
Take these and distill it into a final, consolidated summary of the main themes. 
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)

# Run chain
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="doc_summaries"
)

# Combines and iteravely reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
)

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=0
)
split_docs = text_splitter.split_documents(docs)



In [32]:
print(map_reduce_chain.run(split_docs))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 15.85 GiB. GPU 0 has a total capacty of 14.58 GiB of which 6.67 GiB is free. Including non-PyTorch memory, this process has 7.90 GiB memory in use. Of the allocated memory 7.53 GiB is allocated by PyTorch, and 255.47 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [20]:
# prompt_template = """
# ### [INST] 
# Instruction: Answer the question based on your 
# fantasy football knowledge. Here is context to help:

# {context}

# ### QUESTION:
# {question} 

# [/INST]
#  """

# # Create prompt from prompt template 
# prompt = PromptTemplate(
#     input_variables=["context", "question"],
#     template=prompt_template,
# )

prompt = PromptTemplate()

# Create llm chain 
llm_chain = LLMChain(llm=llm, prompt=prompt)

KeyError: 'input_variables'

In [18]:
llm_chain("hello", context="football is cool", question="explain fantasy football")

TypeError: Chain.__call__() got an unexpected keyword argument 'context'