In [43]:
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE StandaloneDeriving #-}

In [44]:
import Data.Maybe (fromMaybe)
import Data.Word
import Data.Word (Word8)
import GHC.Generics (Generic)
import System.IO

import Data.Aeson (decode)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.ByteString.UTF8 as BSU
import Data.List.Split (splitOn)
import qualified Data.Map as Map
import qualified Data.Text as T
import Distribution.Simple

-- | Vocabulary lookup table: maps a token string to its integer id.
type CharMap = Map.Map String Int


In [None]:
-- | Tokenizer state: a vocabulary plus an ordered list of merge rules.
--
-- The 'Generic' class in the deriving clause is exported by "GHC.Generics",
-- so that module has to be imported for this declaration to compile.
--
-- NOTE(review): later notebook cells bind top-level names @fullVocab@ and
-- @pairs@, which shadow these field selectors once those cells have run.
data Tokenizer = Tokenizer
    { fullVocab :: CharMap
      -- ^ Token string to integer id.
    , pairs :: [(String, String)]
      -- ^ Merge rules as (left, right) token pairs.
    } deriving (Generic)

In [29]:
-- | Read the file at @filePath@ as a lazy ByteString and run aeson's
-- 'decode' on it to obtain a 'CharMap'. Yields 'Nothing' when the
-- contents cannot be decoded.
loadJSON :: FilePath -> IO (Maybe CharMap)
loadJSON filePath = decode <$> B.readFile filePath

In [30]:
-- | Read a text file and turn each line into a pair of strings.
--
-- Every line is split on @delimiter@. A line that yields exactly two parts
-- becomes @(first, second)@. Any other line (blank, or with fewer or more
-- than two parts) is skipped, instead of aborting the whole read with an
-- incomplete-pattern error.
readFileToPairs :: FilePath -> String -> IO [(String, String)]
readFileToPairs filePath delimiter = do
    content <- readFile filePath
    -- A failed pattern match in a list-comprehension generator simply drops
    -- that element, which is what filters out the malformed lines.
    return
        [ (first, second)
        | [first, second] <- map (splitOn delimiter) (lines content)
        ]


In [31]:
-- | Load a JSON array of strings from @path@ and return each word as a lazy
-- ByteString.
--
-- Words are UTF-8 encoded. The Char8 @pack@ used previously keeps only the
-- low 8 bits of every 'Char', which silently corrupts any character above
-- U+00FF.
--
-- If the file cannot be decoded, an error message is printed and the empty
-- list is returned.
loadWordsJson :: String -> IO [B.ByteString]
loadWordsJson path = do
  jsonData <- BL.readFile path
  case decode jsonData of
    -- BSU.fromString produces strict UTF-8 bytes; B.fromStrict lifts them
    -- to the lazy ByteString type the signature promises.
    Just wordList -> return $ map (B.fromStrict . BSU.fromString) wordList
    Nothing -> do
      putStrLn $ "Error: Could not decode JSON from " ++ path
      return []

In [32]:
-- Load the vocabulary from vocab.json; the result is Nothing if decoding fails.
maybeVocab <- loadJSON "vocab.json"

In [33]:
-- Load the merge rules from merges.txt, splitting each line on a single space.
pairs <- readFileToPairs "merges.txt" " "

In [34]:
-- Evaluate the loader on words.json so the notebook displays the result.
loadWordsJson "words.json"

["Hello","World","!","How","are","you"]

In [35]:
-- | Unwrap an optional vocabulary.
--
-- A present vocabulary is returned as is. For 'Nothing', an error message
-- is printed and an empty map is returned.
getVocab :: Maybe CharMap -> IO CharMap
getVocab maybeVocab = maybe reportMissing return maybeVocab
  where
    -- Fallback used when no vocabulary was loaded.
    reportMissing = do
        putStrLn "Erreur: vocab non chargé"
        return Map.empty
    
-- Unwrap the loaded vocabulary (an empty map if loading failed).
fullVocab <- getVocab maybeVocab



In [None]:
-- | Build a 'Tokenizer' from a vocabulary JSON file and a merges file.
--
-- The result lives in 'IO' because both inputs are read from disk. If the
-- vocabulary cannot be decoded, the tokenizer gets whatever 'getVocab'
-- returns for 'Nothing'. The merges file is read once and split on a
-- single space.
initializeTokenizer :: FilePath -> FilePath -> IO Tokenizer
initializeTokenizer vocabPath mergePath = do
    vocab <- loadJSON vocabPath >>= getVocab
    mergePairs <- readFileToPairs mergePath " "
    -- Positional construction (vocabulary first, merge pairs second) keeps
    -- this working even when the record field names are shadowed by
    -- same-named top-level bindings.
    return (Tokenizer vocab mergePairs)
    


In [36]:
-- | Read the file at @path@ as strict bytes and decode them to a 'String'
-- with 'BSU.toString' (UTF-8 decoding).
--
-- The bytes are passed to the decoder directly. Unpacking them to a
-- @[Word8]@ list and packing them back yields the same ByteString, so that
-- round trip was dropped.
toByte :: FilePath -> IO String
toByte path = BSU.toString <$> BS.readFile path

In [37]:
-- | Replace every space character in the input with the marker @Ġ@.
--
-- The substitution is done on 'T.Text' and converted back to 'String'.
replaceSpace :: String -> String
replaceSpace = T.unpack . swapSpaces . T.pack
  where
    swapSpaces = T.replace (T.pack " ") (T.pack "Ġ")
    

-- Read the sample text, swap its spaces for the "Ġ" marker, and print the result.
input <- toByte "testTexte.txt"
inputNoSpace = replaceSpace input
putStrLn inputNoSpace


Salut,ĠjeĠm'appelĠévan,Ġj'aiĠ20Ġans,ĠetĠmeĠvoilaĠenĠtrainĠdeĠprogrammerĠunĠtokenizerĠhahahahĠ😀

In [38]:
-- | Split a string into a list of one-character strings, in order.
makeStrArray :: String -> [String]
makeStrArray text = [[ch] | ch <- text]
-- Initial token list: one single-character token per character of the input.
aStrNoSpace = makeStrArray inputNoSpace


In [39]:
-- | Apply every merge rule to the token list, one rule after another in
-- list order. Each rule is applied to the output of the previous one.
merges :: [(String, String)] -> [String] -> [String]
merges rules tokens = foldl (flip merge) tokens rules
    
-- | Apply a single merge rule across a token list, left to right.
--
-- Wherever two adjacent tokens equal the rule's left and right parts, they
-- are replaced by their concatenation, and scanning resumes after the
-- merged pair. All other tokens are kept unchanged.
merge :: (String, String) -> [String] -> [String]
merge rule = go
  where
    -- Lazy binding: the rule is only inspected when two tokens are compared.
    (left, right) = rule

    go (t1 : t2 : rest)
      | t1 == left && t2 == right = (left ++ right) : go rest
      | otherwise                 = t1 : go (t2 : rest)
    -- Fewer than two tokens left: nothing more can be merged.
    go remaining = remaining
        
        
-- Run all merge rules over the character tokens and print them space-separated.
tokenize = merges pairs aStrNoSpace
putStrLn (unwords tokenize)

Sal ut , Ġje Ġm ' app el Ġé van , Ġj ' ai Ġ20 Ġans , Ġet Ġme Ġvo ila Ġen Ġtrain Ġde Ġprogrammer Ġun Ġtoken izer Ġha h ahah Ġ 😀

In [40]:
-- | Look up each token in the top-level @fullVocab@ map and return its id.
-- A token that is absent from the map gets the fallback id 50257.
changeToIndex :: [String] -> [Int]
changeToIndex tokens =
    [Map.findWithDefault 50257 tok fullVocab | tok <- tokens]
    
-- Map the tokens to ids, print one "token -> id" line per token, then show the id list.
indexed = changeToIndex tokenize
putStrLn $ unlines $ map (\(t, i) -> t ++ " -> " ++ show i) (zip tokenize indexed)
indexed

Sal -> 19221
ut -> 315
, -> 11
Ġje -> 11223
Ġm -> 285
' -> 6
app -> 1324
el -> 417
Ġé -> 16268
van -> 10438
, -> 11
Ġj -> 474
' -> 6
ai -> 1872
Ġ20 -> 1160
Ġans -> 9093
, -> 11
Ġet -> 2123
Ġme -> 502
Ġvo -> 7608
ila -> 10102
Ġen -> 551
Ġtrain -> 4512
Ġde -> 390
Ġprogrammer -> 24292
Ġun -> 555
Ġtoken -> 11241
izer -> 7509
Ġha -> 387
h -> 71
ahah -> 36225
Ġ -> 220
😀 -> 50257

[19221,315,11,11223,285,6,1324,417,16268,10438,11,474,6,1872,1160,9093,11,2123,502,7608,10102,551,4512,390,24292,555,11241,7509,387,71,36225,220,50257]

In [41]:
-- | Turn a list of token ids back into text.
--
-- The vocabulary is inverted with 'reverseMap'. Each id is replaced by its
-- token string (or @"?"@ when the id is unknown), the strings are
-- concatenated, and every @Ġ@ marker is turned back into a space.
untokenizer :: [Int] -> CharMap -> String
untokenizer tokensId strKey = T.unpack spaced
  where
    byId   = reverseMap strKey
    joined = concatMap (\tokId -> Map.findWithDefault "?" tokId byId) tokensId
    spaced = T.replace (T.pack "Ġ") (T.pack " ") (T.pack joined)

-- | Invert a vocabulary so that ids map back to token strings.
--
-- Entries are taken in ascending key order. If several tokens share an id,
-- the last one in that order is the one kept by 'Map.fromList'.
reverseMap :: CharMap -> Map.Map Int String
reverseMap vocabMap =
    Map.fromList [(tokId, tok) | (tok, tokId) <- Map.toList vocabMap]


In [42]:
-- Decode the ids back to text with the same vocabulary and print the result.
putStrLn (untokenizer indexed fullVocab)

Salut, je m'appel évan, j'ai 20 ans, et me voila en train de programmer un tokenizer hahahah smiley

In [None]:
tokenize :: BS.ByteString -> [Int]
