In [1]:
from llama_cpp import Llama
import re
import string

#model obtained from https://gpt4all.io/index.html?s=09
#NOTE(review): this is an absolute, machine-specific path - anyone else running the notebook has to edit it
path = "C:/Users/georg/Downloads/orca-mini-3b.ggmlv3.q4_0.bin"
#llm is the single shared model instance that generateText calls for every prediction
#logits_all = True is presumably required so that per-token logprobs can be requested later - TODO confirm against the llama-cpp-python docs
#n_gpu_layers=100000 looks like a deliberately oversized value meant to offload every layer to the GPU - TODO confirm
llm = Llama(path, logits_all = True, n_gpu_layers=100000)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 


In [2]:
#privateChars holds the 10 code points U+E000..U+E009 of the Unicode Private Use Area. They have no universal
#interpretation, so ordinary text is not expected to contain them and they can act as markers inside a normal string.
privateChars = [chr(codePoint) for codePoint in range(0xE000, 0xE00A)]

def generateText(prompt, tokens = 1, temperature = 0, topP = 0.5, echo = False,stop = ["#"]):
    """Run the module-level ``llm`` on ``prompt`` and hand back its raw response.

    Parameters:
        prompt: text the model should continue.
        tokens: forwarded as ``max_tokens``.
        temperature: forwarded as ``temperature``.
        topP: forwarded as ``top_p``.
        echo: forwarded as ``echo``.
        stop: forwarded as ``stop``; the default list is only read here, never mutated.

    Returns:
        Whatever ``llm(...)`` returns. encode and decode read
        ``output["choices"][0]["logprobs"]["top_logprobs"][0]`` from it.
    """
    completionOptions = {
        "max_tokens": tokens,
        "temperature": temperature,
        "top_p": topP,
        "echo": echo,
        "stop": stop,
        #10 candidates are requested, the same number as there are entries in privateChars
        "logprobs": 10,
    }
    return llm(prompt, **completionOptions)

def removeSpaces(inputString):
    """Return ``inputString`` with every space character (U+0020) deleted.

    Only the plain space is removed; tabs, newlines and other whitespace are kept.
    """
    return inputString.replace(" ", "")

def encode(plain, searchLength = 10):
    """Replace every word the llm predicts with a single private-use marker character.

    For each word, the up to ``searchLength`` preceding words are given to generateText. If the
    word (compared with spaces removed) is among the returned candidates, it is replaced by
    ``privateChars[n]`` where ``n`` is the position of the word in the candidate list.

    Parameters:
        plain: text to encode. It is split on whitespace and re-joined with single spaces, so
            the original spacing and line breaks are not preserved.
        searchLength: maximum number of preceding words used as context for each prediction.

    Returns:
        The encoded text, words separated by single spaces.

    NOTE(review): words of ``plain`` that already are one of privateChars are not escaped, so
    decode would treat them as markers - confirm inputs never contain those characters.
    """
    #split the plain text string into a list of individual words
    words = plain.split()

    #iterate backwards so the context of each word (the words before it) is still plain text when it is
    #used; decode walks forwards and therefore rebuilds exactly the same context
    for i in range(len(words) - 1, -1, -1):
        #str.split() already removed all whitespace, so the word can be compared as it is
        word = words[i]

        #get the searchLength words before the target string and create a string of them
        startInd = max(0, i - searchLength)
        plainSection = " ".join(words[startInd: i])

        #use generateText to get a dictionary of the predictions of the next word from the llm
        output = generateText(plainSection)

        try:
            #reformat the dictionary of the predictions to have all spaces removed; candidates that become
            #identical collapse into one key, exactly as decode does, so both sides agree on the positions
            choiceDict = output["choices"][0]["logprobs"]["top_logprobs"][0]
            choiceDict = {removeSpaces(key) : value for key, value in choiceDict.items()}
        except (KeyError, IndexError, TypeError, AttributeError):
            #a response without usable predictions just means this word stays plain; anything else
            #(for example KeyboardInterrupt) is no longer swallowed
            choiceDict = {}

        #check to see if the target string is one of the predicted strings and if so replace the string with the
        #corresponding unicode private character, where the n th candidate is replaced with the n th private character
        if word in choiceDict:
            rank = list(choiceDict).index(word)
            #only encode when a marker exists for this position; otherwise keep the word plain
            if rank < len(privateChars):
                words[i] = privateChars[rank]

    #rejoin the words into text
    return " ".join(words)

def decode(encoded, searchLength = 10):
    """Replace every private-use marker in ``encoded`` with the word it stands for.

    Words are processed from left to right. For each word that is exactly one of privateChars,
    the up to ``searchLength`` preceding (already decoded) words are given to generateText and
    the marker is replaced by the candidate at the marker's position in privateChars.

    Parameters:
        encoded: text produced by encode, or any text whose words are markers.
        searchLength: context length; must be the value that was used for encoding.

    Returns:
        The decoded text, words separated by single spaces.

    Raises:
        IndexError: if the llm returns fewer candidates than the marker's position requires.
        KeyError / IndexError / TypeError: if the llm response carries no predictions.
    """
    #split the encoded string into a list of individual words
    words = encoded.split()

    #iterate over all of the words; replacing words[pred] in place does not change the list length
    for pred, token in enumerate(words):

        #only a word that IS a private character is a marker. A word that merely contains one cannot be
        #looked up in privateChars (this used to raise ValueError), so it is left untouched
        if token not in privateChars:
            continue
        wordChoice = privateChars.index(token)

        #get the searchLength words before the encoded word and create a string of them
        startInd = max(0, pred - searchLength)
        plainSection = " ".join(words[startInd: pred])

        #use generateText to get a dictionary of the predictions of the unencoded word from the llm
        output = generateText(plainSection)

        #reformat the dictionary of the predictions to have all spaces removed
        choiceDict = output["choices"][0]["logprobs"]["top_logprobs"][0]
        choiceDict = {removeSpaces(key) : value for key, value in choiceDict.items()}

        #get a list of the candidates for the encoded word, in the order the llm returned them
        keys = list(choiceDict)
        if wordChoice >= len(keys):
            raise IndexError(
                "marker at word %d selects candidate %d but the llm returned only %d candidates"
                % (pred, wordChoice, len(keys))
            )

        #replace the private character for the corresponding prediction
        words[pred] = keys[wordChoice].strip()

    #rejoin the words into text
    return " ".join(words)
    

In [6]:
#This is a unit test to check that encoding followed by decoding is lossless. It is not put in a standard unit test
#file because the llm is very computationally expensive (encode and decode call it once per word / per marker).
#The words of testPlain are separated by single spaces only: encode and decode split on whitespace and re-join
#with one space, so any other spacing would not survive the round trip.
testPlain = "A large language model (LLM) is a language model characterized by its large size. Their size, which could be as large as 180 billion parameters, is enabled by AI accelerators, which are able to process vast amounts of text data, mostly scraped from the Internet."
#the same searchLength (5) has to be used on both sides so decode sees the same context as encode did
testEncoded = encode(testPlain, 5)
testDecoded = decode(testEncoded, 5)

assert testPlain == testDecoded
print("Decoded correctly")

Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.gene

Decoded correctly


In [3]:
#This is an example of loading a text file and then encoding and decoding it
#I have found this process to be very time consuming; it leads to a 20 to 30 percent reduction in string length
#the context manager closes the file even if read() raises
with open("toEncode.txt") as f:
    plain = f.read()
#NOTE(review): newlines are deleted, not replaced by a space, so the last word of a line is fused with the
#first word of the next line - confirm this is intended (encode splits on any whitespace anyway)
plain = plain.replace("\n", "")

#the same searchLength (10) has to be used for encode and decode
encoded = encode(plain, 10)
print(encoded)
decoded = decode(encoded, 10)
print(decoded)

Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.gene

Time to encode: 92.63611102104187
A large language  (LLM)   language  characterized    size. Their size,    as   180  parameters,  enabled  AI accelerators,   able      text data, mostly scraped   Internet.


Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit


Time to decode: 51.388925075531006
A large language model (LLM) is a language model characterized by its large size. Their size, which could be as large as 180 billion parameters, is enabled by AI accelerators, which are able to process vast amounts of text data, mostly scraped from the Internet.
Decoded correctly
0.7480916030534351


In [34]:
#this is another use case I thought of after starting the project: hiding a number inside generated text
#every decimal digit of the code number selects the private character with that index; the markers are
#joined with spaces so that each one is a separate word
codeNumber = 35312902593339444703742
codeStr = " ".join(privateChars[int(digit)] for digit in str(codeNumber))

#put the context string in front of the markers and decode: each marker is replaced by the llm candidate
#at that digit's position, which gives the cipher text
contextStr = "The best reality "
contextLen = len(contextStr.split())
cipherStr = decode(contextStr + codeStr, 20)

#encoding the cipher text with the same settings turns the words back into markers; the context words are
#dropped and every marker's index in privateChars is one digit of the original number
decodedChar = encode(cipherStr, 20).split()[contextLen: ]
decodedNum = [privateChars.index(marker) for marker in decodedChar]
decodedNumber = int("".join(str(digit) for digit in decodedNum))

print(codeNumber)
print(cipherStr)
print(decodedNumber)

Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.generate: prefix-match hit
Llama.gene

35312902593339444703742
The best reality television of recent times is , which has made me appreciate and appreciate real stories more and to be a real life hero
35312902593339444703742
