In [4]:
import torch
from auto_gptq import AutoGPTQForCausalLM
from huggingface_hub import hf_hub_download
from langchain.llms import HuggingFacePipeline, LlamaCpp
from langchain.prompts import PromptTemplate
from langchain import HuggingFaceHub, LLMChain
# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    LlamaForCausalLM,
    LlamaTokenizer,
    pipeline,
)

In [5]:
def load_model(device_type, model_id, model_basename=None):
    """
    Select a model for text generation using the HuggingFace library.
    If you are running this for the first time, it will download a model for you.
    subsequent runs will use the model from the disk.

    Args:
        device_type (str): Type of device to use, e.g., "cuda" for GPU or "cpu" for CPU.
        model_id (str): Identifier of the model to load from HuggingFace's model hub.
        model_basename (str, optional): Basename of the model if using quantized models.
            Defaults to None.

    Returns:
        HuggingFacePipeline: A pipeline object for text generation using the loaded model.

    Raises:
        ValueError: If an unsupported model or device type is provided.
    """
  

    if model_basename is not None:
        if ".ggml" in model_basename:
            model_path = hf_hub_download(repo_id=model_id, filename=model_basename)
            max_ctx_size = 2048
            kwargs = {
                "model_path": model_path,
                "n_ctx": max_ctx_size,
                "max_tokens": max_ctx_size,
            }
            if device_type.lower() == "mps":
                kwargs["n_gpu_layers"] = 1000
            if device_type.lower() == "cuda":
                kwargs["n_gpu_layers"] = 1000
                kwargs["n_batch"] = max_ctx_size
            return LlamaCpp(**kwargs)

        else:
            # The code supports all huggingface models that ends with GPTQ and have some variation
            # of .no-act.order or .safetensors in their HF repo.

            if ".safetensors" in model_basename:
                # Remove the ".safetensors" ending if present
                model_basename = model_basename.replace(".safetensors", "")

            tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

            model = AutoGPTQForCausalLM.from_quantized(
                model_id,
                model_basename=model_basename,
                use_safetensors=True,
                trust_remote_code=True,
                device="cuda:0",
                use_triton=False,
                quantize_config=None,
            )
    elif (
        device_type.lower() == "cuda"
    ):  # The code supports all huggingface models that ends with -HF or which have a .bin
        # file in their HF repo.
        tokenizer = AutoTokenizer.from_pretrained(model_id)

        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            # max_memory={0: "15GB"} # Uncomment this line with you encounter CUDA out of memory errors
        )
        model.tie_weights()
    else:
        tokenizer = LlamaTokenizer.from_pretrained(model_id)
        model = LlamaForCausalLM.from_pretrained(model_id)

    # Load configuration from the model to avoid warnings
    generation_config = GenerationConfig.from_pretrained(model_id)
    # see here for details:
    # https://huggingface.co/docs/transformers/
    # main_classes/text_generation#transformers.GenerationConfig.from_pretrained.returns

    # Create a pipeline for text generation
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=2048,
        temperature=0,
        top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
    )

    local_llm = HuggingFacePipeline(pipeline=pipe)

    return local_llm

In [None]:
input_string =  """ Sure, here is a horror story in the format of a list:["The old mansion loomed before us", "Crickets chirped in the darkness", "Rachel just returned home", "Her eyes scanned the living room", "But something felt off", "She noticed the flickering candles", "And the eerie silence"]
I hope this helps! Let me know if you have any questions."
"""
print (type(input_string))

<class 'str'>


In [28]:
import re
import ast

input = '''
Sure! Here is a horror story in the format of a python list:

["The old mansion loomed before us", "Creepy noises echoed through the halls", "We had to investigate", "As we ventured deeper into the house", "Ominous shadows danced on the walls", "A faint scent of decay wafted through the air", "Rachel stumbled upon a hidden room", "Inside, we found unspeakable horrors", "Our screams were drowned out by the howling wind", "We fled in terror, but knew we'd never escape"]
'''
def string_to_list(input):

    # Find the list using regular expression
    match = re.search(r'\[.*?\]', input_string, re.DOTALL)

    if match:
        extracted_list_str = match.group(0)
    
        # Convert the extracted string to a list using ast.literal_eval
        extracted_list = ast.literal_eval(extracted_list_str)
    
        return extracted_list
    else:
        print("No list found in the input string.")

string_to_list(input)

['The old mansion loomed before us',
 'Crickets chirped in the darkness',
 'Rachel just returned home',
 'Her eyes scanned the living room',
 'But something felt off',
 'She noticed the flickering candles',
 'And the eerie silence']

In [29]:
MODEL_ID = "TheBloke/Llama-2-7B-Chat-GGML"
MODEL_BASENAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
template = """Create a story using this genre suggestion: {suggestion}
Return the story in the format of a python list data type in chronological order of the story
For example ["It was dark outside", "Rachel just returned homm",".."]
Answer: """

prompt = PromptTemplate(input_variables=['suggestion'], template=template)

llm = load_model("cuda", model_id=MODEL_ID, model_basename=MODEL_BASENAME)

suggestion = 'Horror'
llm_chain = LLMChain(
    prompt=prompt,
    llm=llm
)
output = llm_chain.run(suggestion)
converted_list = string_to_list(output)
print(output)
print(type(output))

print(converted_list)
print(type(converted_list))

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 


 Sure! Here is a horror story in the format of a list:

["It was a dark and stormy night", "The old mansion loomed before us", "As we stepped inside", "The creaking of the floorboards beneath our feet", "Suddenly", "A loud crash echoed through the hallway", "We froze", "And then we saw it..."]

I hope you enjoy reading this horror story! Let me know if you have any questions.
<class 'str'>
['The old mansion loomed before us', 'Crickets chirped in the darkness', 'Rachel just returned home', 'Her eyes scanned the living room', 'But something felt off', 'She noticed the flickering candles', 'And the eerie silence']
<class 'list'>
