In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained "gpt2" tokenizer; later cells use its pre-tokenizer to split text into words.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [3]:
def load_tex_file(filepath):
    """Read `filepath` as UTF-8 text and return its lines.

    Each returned line has leading and trailing whitespace (including the
    newline) removed; blank lines are kept as empty strings.
    """
    stripped_lines = []
    with open(filepath, 'r', encoding='utf-8') as tex_file:
        for raw_line in tex_file.readlines():
            stripped_lines.append(raw_line.strip())
    return stripped_lines

# Usage example: read the LaTeX source file and print every stripped line.
filepath = '2301.01754.tex'
corpus = load_tex_file(filepath)
for line in corpus:
    print(line)


\documentclass[12pt]{revtex4-2}
%
\usepackage[utf8]{inputenc}







\numberwithin{equation}{section}



\newcommand{\ytt}{y_{22}}
\newcommand{\yth}{y_{32}}
\DeclareUnicodeCharacter{2212}{-}
\begin{document}
\title{Status of leptoquark models after LHC Run-2 and discovery prospects at future colliders}
\author{Nishita Desai} \email{nishita.desai@tifr.res.in}
\affiliation{Department of Theoretical Physics, \\ Tata Institute of Fundamental Research, \\  Mumbai, India 400005 }

\author{Amartya Sengupta}
\affiliation{Meghnad Saha Pally, Burdwan, India, 713104 }

\email{amartya.sengupta@studenti.unipd.it}

\begin{abstract}
We study limits from dilepton searches on leptoquark completions to the Standard Model in the parameter space motivated by anomalies in the $b \rightarrow s$ sector.  After a full Run-2 analysis by LHCb, the disparity in lepton flavour violation has disappeared. However, the mismatch in angular distributions as well as in $B_s \rightarrow \mu^+ \mu^-$ partial width is sti

In [4]:
import re

def load_tex_file(filepath):
    """Return the whitespace-stripped lines of the UTF-8 text file `filepath`.

    Note: this re-defines the `load_tex_file` from the earlier cell with the
    same behaviour.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()
    return list(map(str.strip, raw_lines))

def process_line(line):
    r"""Lower-case one line of LaTeX source while leaving math untouched.

    Math segments are ``$$...$$``, ``$...$``, ``\[...\]`` and ``\(...\)``
    spans that both open and close on this line; they are copied verbatim.
    Everything else (including LaTeX command names) is lower-cased. A
    delimiter without a partner on the same line is treated as plain text.

    Args:
        line: A single line of text.

    Returns:
        The line with all non-math text lower-cased.
    """
    # Display math ($$...$$) has to be tried before inline math ($...$):
    # otherwise "$$" is consumed as an empty inline span and the formula
    # between the two "$$" markers is lower-cased.
    pattern = re.compile(r'(\$\$.*?\$\$|\$.*?\$|\\\[.*?\\\]|\\\(.*?\\\))')
    parts = pattern.split(line)
    # With a single capturing group, re.split alternates plain text (even
    # index) and matched math (odd index). Using the index instead of
    # inspecting the first character keeps text with an unmatched leading
    # "$" (e.g. "$5 ...") from being mistaken for math.
    return ''.join(
        part if index % 2 else part.lower()
        for index, part in enumerate(parts)
    )

def process_corpus(corpus):
    """Run `process_line` over every line of `corpus`, returning a new list in the same order."""
    processed = []
    for raw_line in corpus:
        processed.append(process_line(raw_line))
    return processed

# Usage example: load the LaTeX source, lower-case it outside math, and print every processed line.
filepath = '2301.01754.tex'
corpus = load_tex_file(filepath)
processed_corpus = process_corpus(corpus)
for line in processed_corpus:
    print(line)


\documentclass[12pt]{revtex4-2}
%
\usepackage[utf8]{inputenc}







\numberwithin{equation}{section}



\newcommand{\ytt}{y_{22}}
\newcommand{\yth}{y_{32}}
\declareunicodecharacter{2212}{-}
\begin{document}
\title{status of leptoquark models after lhc run-2 and discovery prospects at future colliders}
\author{nishita desai} \email{nishita.desai@tifr.res.in}
\affiliation{department of theoretical physics, \\ tata institute of fundamental research, \\  mumbai, india 400005 }

\author{amartya sengupta}
\affiliation{meghnad saha pally, burdwan, india, 713104 }

\email{amartya.sengupta@studenti.unipd.it}

\begin{abstract}
we study limits from dilepton searches on leptoquark completions to the standard model in the parameter space motivated by anomalies in the $b \rightarrow s$ sector.  after a full run-2 analysis by lhcb, the disparity in lepton flavour violation has disappeared. however, the mismatch in angular distributions as well as in $B_s \rightarrow \mu^+ \mu^-$ partial width is sti

In [5]:
# From here on `corpus` refers to the processed lines rather than the raw lines loaded from disk.
corpus = processed_corpus

In [6]:
from collections import defaultdict

word_freqs = defaultdict(int)

# Count how often each pre-token occurs across the corpus; the character
# offsets returned alongside each pre-token are not needed here.
for text in corpus:
    pre_tokens = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    for word, _offset in pre_tokens:
        word_freqs[word] += 1

In [7]:
# Sorted list of every distinct character that occurs in the counted words.
alphabet = sorted({letter for word in word_freqs for letter in word})


In [8]:
# Initial vocabulary: the end-of-text marker followed by the characters of the alphabet (a new list).
vocab = ["<|endoftext|>", *alphabet]

In [9]:
# Display the vocabulary as the cell's output.
vocab

['<|endoftext|>',
 '!',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'G',
 'K',
 'L',
 'M',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'Y',
 'Z',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '|',
 '}',
 '~',
 'Ġ']

In [10]:
# Map every counted word to the list of its individual characters.
splits = {word: list(word) for word in word_freqs}

In [11]:
# Print the complete word -> character-list mapping.
print(splits)

{'\\': ['\\'], 'documentclass': ['d', 'o', 'c', 'u', 'm', 'e', 'n', 't', 'c', 'l', 'a', 's', 's'], '[': ['['], '12': ['1', '2'], 'pt': ['p', 't'], ']{': [']', '{'], 'revtex': ['r', 'e', 'v', 't', 'e', 'x'], '4': ['4'], '-': ['-'], '2': ['2'], '}': ['}'], '%': ['%'], 'usepackage': ['u', 's', 'e', 'p', 'a', 'c', 'k', 'a', 'g', 'e'], 'utf': ['u', 't', 'f'], '8': ['8'], 'inputenc': ['i', 'n', 'p', 'u', 't', 'e', 'n', 'c'], 'numberwithin': ['n', 'u', 'm', 'b', 'e', 'r', 'w', 'i', 't', 'h', 'i', 'n'], '{': ['{'], 'equation': ['e', 'q', 'u', 'a', 't', 'i', 'o', 'n'], '}{': ['}', '{'], 'section': ['s', 'e', 'c', 't', 'i', 'o', 'n'], 'newcommand': ['n', 'e', 'w', 'c', 'o', 'm', 'm', 'a', 'n', 'd'], '{\\': ['{', '\\'], 'ytt': ['y', 't', 't'], 'y': ['y'], '_{': ['_', '{'], '22': ['2', '2'], '}}': ['}', '}'], 'yth': ['y', 't', 'h'], '32': ['3', '2'], 'declareunicodecharacter': ['d', 'e', 'c', 'l', 'a', 'r', 'e', 'u', 'n', 'i', 'c', 'o', 'd', 'e', 'c', 'h', 'a', 'r', 'a', 'c', 't', 'e', 'r'], '2212

In [12]:
def compute_pair_freqs(splits):
  """Count adjacent symbol pairs, weighted by word frequency.

  Iterates over the module-level `word_freqs`; for each word, every pair of
  neighbouring symbols in `splits[word]` is credited with that word's count.

  Returns:
      A defaultdict(int) mapping (left, right) tuples to their total count.
  """
  counts = defaultdict(int)
  for word, count in word_freqs.items():
    symbols = splits[word]
    # zip over consecutive symbols yields nothing for one-symbol words.
    for adjacent in zip(symbols, symbols[1:]):
      counts[adjacent] += count
  return counts


pair_freqs = compute_pair_freqs(splits)

# Preview the first five pairs (in insertion order) with their counts.
for pair, count in list(pair_freqs.items())[:5]:
  print(pair, count)


('d', 'o') 25
('o', 'c') 13
('c', 'u') 47
('u', 'm') 42
('m', 'e') 88


In [13]:
# Most frequent pair; on ties max() keeps the pair encountered first.
# With no pairs at all the placeholders '' and None are used.
best_pair = max(pair_freqs, key=pair_freqs.get, default='')
max_freq = pair_freqs[best_pair] if best_pair else None

In [14]:
# Show the most frequent pair and its count.
print(best_pair, max_freq)

('Ġ', 't') 684


In [15]:
def merge_pair(a, b, splits):
  """Fuse every adjacent (a, b) pair into the single symbol a + b.

  Walks the words listed in the module-level `word_freqs`, rewrites
  `splits[word]` for each word that has more than one symbol, and returns
  the same `splits` dict that was passed in (it is updated in place).
  """
  for word in word_freqs:
    symbols = splits[word]
    if len(symbols) == 1:
      continue

    fused = []
    pos = 0
    while pos < len(symbols):
      is_pair = pos + 1 < len(symbols) and symbols[pos] == a and symbols[pos + 1] == b
      if is_pair:
        fused.append(a + b)
        pos += 2
      else:
        fused.append(symbols[pos])
        pos += 1
    splits[word] = fused
  return splits

# Demonstration of a single merge step. Note that this rewrites `splits` in
# place but does not add the pair to `vocab` or to any merges table.
splits = merge_pair('Ġ', 't', splits)

In [16]:
vocab_size = 1000

# Train from the untouched character-level state. The demonstration merge of
# ('Ġ', 't') in the previous cell already rewrote `splits` without being
# recorded, so continuing from it would leave that merge out of `merges` (and
# its token out of `vocab`); tokenize() could then never rebuild 'Ġt' or any
# longer token learned on top of it. Resetting here also makes the cell safe
# to re-run.
vocab = ["<|endoftext|>"] + alphabet
splits = {word: list(word) for word in word_freqs}
merges = {}

while len(vocab) < vocab_size:
  pair_freqs = compute_pair_freqs(splits)
  if not pair_freqs:
    # Every word is already a single token: nothing is left to merge, so stop
    # before the target vocabulary size is reached.
    break
  # Most frequent pair; ties resolve to the pair seen first.
  best_pair = max(pair_freqs, key=pair_freqs.get)
  max_freq = pair_freqs[best_pair]

  splits = merge_pair(*best_pair, splits)
  # Record the merge rule (insertion order = learning order) and its new token.
  merges[best_pair] = best_pair[0] + best_pair[1]
  vocab.append(merges[best_pair])

In [17]:
def tokenize(text):
  """Tokenize `text` with the merge rules learned above.

  The text is normalised with `process_line` (lower-cased outside math, the
  same preprocessing the training corpus received), split into words by the
  tokenizer's pre-tokenizer, broken into characters, and then every rule in
  the module-level `merges` is applied in the order it was learned.

  Args:
      text: The string to tokenize.

  Returns:
      A flat list of token strings.
  """
  # Use the public `backend_tokenizer` accessor, as the word-counting cell does.
  pre_tokenize_result = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(process_line(text))
  splits = [list(word) for word, _offset in pre_tokenize_result]
  for pair, merge in merges.items():
    for idx, split in enumerate(splits):
      i = 0
      while i < len(split) - 1:
        if split[i] == pair[0] and split[i+1] == pair[1]:
          # Replace the two symbols by the merged one and re-check position i.
          split = split[:i] + [merge] + split[i+2:]
        else:
          i += 1
      splits[idx] = split

  # Flatten in one pass instead of repeated list concatenation.
  return [token for split in splits for token in split]

In [18]:
# NOTE(review): the doubled braces are literal in this raw string (no str.format
# is applied), so they are tokenized as real brace characters — confirm intended.
print(tokenize(r"x = \frac{{-b \pm \sqrt{{b^2 - 4ac}}}}{{2a}}"))

['x', 'Ġ=', 'Ġ\\', 'frac', '{', '{', '-', 'b', 'Ġ\\', 'pm', 'Ġ\\', 'sqrt', '{', '{', 'b', '^', '2', 'Ġ-', 'Ġ', '4', 'a', 'c', '}}', '}}{', '{', '2', 'a', '}}']


In [19]:
from transformers import GPT2Tokenizer

# Load the GPT-2 tokenizer under its own name so the global `tokenizer`
# that tokenize() reads is not replaced by this comparison cell.
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Define the equation
equation = r"x = \frac{{-b \pm \sqrt{{b^2 - 4ac}}}}{{2a}}"

# Tokenize the equation
tokens = gpt2_tokenizer.tokenize(equation)

# Print the token list
print(tokens)


['x', 'Ġ=', 'Ġ\\', 'frac', '{{', '-', 'b', 'Ġ\\', 'pm', 'Ġ\\', 'sq', 'rt', '{{', 'b', '^', '2', 'Ġ-', 'Ġ4', 'ac', '}}', '}}', '{{', '2', 'a', '}}']


In [20]:
from transformers import BertTokenizer

# Load the BERT tokenizer under its own name so the global `tokenizer`
# that tokenize() reads is not replaced by this comparison cell.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Define the equation
equation = r"x = \frac{{-b \pm \sqrt{{b^2 - 4ac}}}}{{2a}}"

# Tokenize the equation
tokens = bert_tokenizer.tokenize(equation)

# Print the token list
print(tokens)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


['x', '=', '\\', 'fra', '##c', '{', '{', '-', 'b', '\\', 'pm', '\\', 'sq', '##rt', '{', '{', 'b', '^', '2', '-', '4a', '##c', '}', '}', '}', '}', '{', '{', '2a', '}', '}']


In [21]:
from transformers import T5Tokenizer

# Load the T5 tokenizer under its own name so the global `tokenizer`
# that tokenize() reads is not replaced by this comparison cell.
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Define the equation
equation = r"x = \frac{{-b \pm \sqrt{{b^2 - 4ac}}}}{{2a}}"

# Tokenize the equation
tokens = t5_tokenizer.tokenize(equation)

# Print the token list
print(tokens)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


['▁', 'x', '▁=', '▁', '\\', 'frac', '{{', '-', 'b', '▁', '\\', 'pm', '▁', '\\', 's', 'q', 'r', 't', '{{', 'b', '^', '2', '▁', '-', '▁4', 'a', 'c', '}}}}{{', '2', 'a', '}}']
