In [3]:
import json
import torch
from transformers import GPT2TokenizerFast, GPT2Model

# GPU 사용 설정 (가능한 경우)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 커스텀 토크나이저 로드
tokenizer_path = '.'  # tokenizer.json 파일이 있는 디렉토리 경로
tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_path)

# GPT2 모델 로드 (임베딩 레이어만 사용)
gpt2_model = GPT2Model.from_pretrained('gpt2')
embedding_layer = gpt2_model.wte.to(device)

# 디버깅을 위한 정보 출력
print(f"Tokenizer vocabulary size: {len(tokenizer.get_vocab())}")
print(f"GPT2 embedding size: {gpt2_model.wte.num_embeddings}")

# 새로운 임베딩 로드
new_embeddings_path = 'new_embeddings.pth'  # new_embeddings.pth 파일 경로

# 새로운 임베딩의 크기 계산
new_vocab_size = len(tokenizer.get_vocab())
gpt2_vocab_size = gpt2_model.wte.num_embeddings
additional_tokens = max(0, new_vocab_size - gpt2_vocab_size)

print(f"Additional tokens: {additional_tokens}")

new_embeddings = torch.nn.Embedding(additional_tokens, gpt2_model.wte.embedding_dim)
new_embeddings.load_state_dict(torch.load(new_embeddings_path))
new_embeddings = new_embeddings.to(device)

# 문장 임베딩 함수
def get_sentence_embedding(sentence):
    tokens = tokenizer.encode(sentence, truncation=True, max_length=512)
    token_embeddings = torch.cat([embedding_layer.weight, new_embeddings.weight])[tokens]
    return token_embeddings.mean(dim=0)

# 코사인 유사도 계산 함수
def cosine_similarity(emb1, emb2):
    return torch.cosine_similarity(emb1, emb2, dim=0).item()


Using device: cuda
Tokenizer vocabulary size: 30001
GPT2 embedding size: 50257
Additional tokens: 0
Cosine similarity between the sentences: 0.7998372912406921
Similarity between 'The quick brown fox jumps over the lazy dog.' and 'A fast auburn canine leaps above the indolent hound.': 0.8345
Similarity between 'The quick brown fox jumps over the lazy dog.' and 'Python is a popular programming language.': 0.7662
Similarity between 'The quick brown fox jumps over the lazy dog.' and 'Java is widely used in enterprise software development.': 0.8026
Similarity between 'A fast auburn canine leaps above the indolent hound.' and 'Python is a popular programming language.': 0.7927
Similarity between 'A fast auburn canine leaps above the indolent hound.' and 'Java is widely used in enterprise software development.': 0.8257
Similarity between 'Python is a popular programming language.' and 'Java is widely used in enterprise software development.': 0.8039


In [9]:

# 사용 예시
sentence1 = "2/3"
sentence2 = "\frac{2}{3}"

emb1 = get_sentence_embedding(sentence1)
emb2 = get_sentence_embedding(sentence2)

similarity = cosine_similarity(emb1, emb2)
print(f"Cosine similarity between the sentences: {similarity}")


Cosine similarity between the sentences: 0.667129397392273


In [11]:
import json
import torch
from transformers import GPT2TokenizerFast, GPT2Model

# GPU 사용 설정 (가능한 경우)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 커스텀 토크나이저 로드
tokenizer_path = '.'  # tokenizer.json 파일이 있는 디렉토리 경로
tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_path)

# GPT2 모델 로드 (임베딩 레이어와 positional 임베딩 사용)
gpt2_model = GPT2Model.from_pretrained('gpt2')
embedding_layer = gpt2_model.wte.to(device)
positional_embedding = gpt2_model.wpe.to(device)

# 디버깅을 위한 정보 출력
print(f"Tokenizer vocabulary size: {len(tokenizer.get_vocab())}")
print(f"GPT2 embedding size: {gpt2_model.wte.num_embeddings}")

# 새로운 임베딩 로드
new_embeddings_path = 'new_embeddings.pth'  # new_embeddings.pth 파일 경로

# 새로운 임베딩의 크기 계산
new_vocab_size = len(tokenizer.get_vocab())
gpt2_vocab_size = gpt2_model.wte.num_embeddings
additional_tokens = max(0, new_vocab_size - gpt2_vocab_size)

print(f"Additional tokens: {additional_tokens}")

new_embeddings = torch.nn.Embedding(additional_tokens, gpt2_model.wte.embedding_dim)
new_embeddings.load_state_dict(torch.load(new_embeddings_path))
new_embeddings = new_embeddings.to(device)

# 문장 임베딩 함수 (positional embedding 포함)
def get_sentence_embedding(sentence):
    tokens = tokenizer.encode(sentence, truncation=True, max_length=512)
    token_ids = torch.tensor(tokens).unsqueeze(0).to(device)  # [1, seq_len]
    
    # 토큰 임베딩
    token_embeddings = torch.cat([embedding_layer.weight, new_embeddings.weight])[token_ids]  # [1, seq_len, embed_dim]
    
    # Positional 임베딩
    position_ids = torch.arange(0, token_ids.size(1)).unsqueeze(0).to(device)  # [1, seq_len]
    pos_embeddings = positional_embedding(position_ids)  # [1, seq_len, embed_dim]
    
    # 토큰 임베딩과 positional 임베딩 결합
    combined_embeddings = token_embeddings + pos_embeddings  # [1, seq_len, embed_dim]
    
    # 평균 계산
    sentence_embedding = combined_embeddings.mean(dim=1).squeeze(0)  # [embed_dim]
    
    return sentence_embedding

# 코사인 유사도 계산 함수
def cosine_similarity(emb1, emb2):
    return torch.cosine_similarity(emb1, emb2, dim=0).item()


Using device: cuda
Tokenizer vocabulary size: 30001
GPT2 embedding size: 50257
Additional tokens: 0


In [23]:

# 사용 예시
sentence1 = r"F(x)={\sqrt{\frac{10x^{-8}}{-8}}}+C_{1} =-{\frac{5}{4x^{8}}}+C_{1}\quad{\mathrm {if~}}x<0."
sentence2 = r"F(x)=\sqrt{10}x^{-8/-8}+C_{1}=-\frac{5} {4}x^{8}+C_{1}\quad\mathrm{if~}x<0."


emb1 = get_sentence_embedding(sentence1)
emb2 = get_sentence_embedding(sentence2)

similarity = cosine_similarity(emb1, emb2)
print(f"Cosine similarity between the sentences: {similarity}")


Cosine similarity between the sentences: 0.9905670285224915


In [28]:
emb2.shape

torch.Size([768])

In [66]:
import torch
import numpy as np
from transformers import GPT2TokenizerFast, GPT2Model
import re


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 토크나이저 및 모델 로드
tokenizer = GPT2TokenizerFast.from_pretrained('.')
gpt2_model = GPT2Model.from_pretrained('gpt2')
embedding_layer = gpt2_model.wte.to(device)
positional_embedding = gpt2_model.wpe.to(device)

# 새로운 임베딩 로드 (필요한 경우)

new_embeddings_state = torch.load('new_embeddings.pth')
new_vocab_size, embedding_dim = new_embeddings_state['weight'].shape
new_embeddings = torch.nn.Embedding(new_vocab_size, embedding_dim).to(device)
new_embeddings.load_state_dict(new_embeddings_state)
print(f"Loaded new embeddings with shape: {new_embeddings.weight.shape}")

def spacing(text):
    return re.sub(r'(?<!\s)\\', r' \\', text)

def get_token_embeddings(sentence):
    sentence = spacing(sentence)
    tokens = tokenizer.encode(sentence, truncation=True, max_length=512)
    print(f"Tokenized text: {tokens}")
    decoded_tokens = [tokenizer.decode([token]) for token in tokens]
    print(f"Decoded tokens: {decoded_tokens}")
    
    token_ids = torch.tensor(tokens).unsqueeze(0).to(device)
    positions = torch.arange(0, token_ids.size(1)).unsqueeze(0).to(device)
    
    if new_embeddings is not None:
        token_embeddings = torch.cat([embedding_layer.weight, new_embeddings.weight])[token_ids]
    else:
        token_embeddings = embedding_layer(token_ids)
    
    pos_embeddings = positional_embedding(positions)
    
    return list(zip(token_embeddings[0], pos_embeddings[0]))

def cosine_distance(emb1, emb2):
    return 1 - torch.cosine_similarity(emb1.unsqueeze(0), emb2.unsqueeze(0)).item()

def token_distance(token1, token2, w_emb=0.7, w_pos=0.3):
    emb1, pos1 = token1
    emb2, pos2 = token2
    emb_dist = cosine_distance(emb1, emb2)
    pos_dist = torch.abs(pos1 - pos2).float().mean().item()
    return w_emb * emb_dist + w_pos * pos_dist

def n_gram_similarity(ref_tokens, pred_tokens, n, max_d=2.0):
    ref_ngrams = [ref_tokens[i:i+n] for i in range(len(ref_tokens)-n+1)]
    pred_ngrams = [pred_tokens[i:i+n] for i in range(len(pred_tokens)-n+1)]
    
    L_n = min(len(ref_ngrams), len(pred_ngrams))
    if L_n == 0:
        return 0
    
    total_distance = sum(
        sum(token_distance(ref_token, pred_token) 
            for ref_token, pred_token in zip(ref_ngram, pred_ngram))
        for ref_ngram, pred_ngram in zip(ref_ngrams[:L_n], pred_ngrams[:L_n])
    )
    
    return 1 - (total_distance / (L_n * n * max_d))

def texbleu(reference, prediction, max_n=4, weights=None):
    if weights is None:
        weights = [1/max_n] * max_n
    
    ref_tokens = get_token_embeddings(reference)
    pred_tokens = get_token_embeddings(prediction)
    
    n_gram_scores = [n_gram_similarity(ref_tokens, pred_tokens, n) 
                     for n in range(1, max_n+1)]
    
    return np.exp(sum(w * np.log(max(s, 1e-10)) 
                      for w, s in zip(weights, n_gram_scores)))


Using device: cuda
Loaded new embeddings with shape: torch.Size([0, 768])


In [68]:
sentence1 = r"F(x)={\sqrt{\frac{10x^{-8}}{-8}}}+C_{1} =-{\frac{5}{4x^{8}}}+C_{1}\quad{\mathrm {if~}}x<0."

In [69]:
spacing(sentence1)

'F(x)={ \\sqrt{ \\frac{10x^{-8}}{-8}}}+C_{1} =-{ \\frac{5}{4x^{8}}}+C_{1} \\quad{ \\mathrm {if~}}x<0.'

In [70]:

score1 = bleu_like_score(sentence1, sentence2)

print(f"Score between reference and prediction1: {score1:.4f}")


Tokenized text: [65, 35, 115, 8680, 118, 87, 8364, 118, 87, 7911, 118, 8006, 115, 8193, 51, 7842, 8511, 51, 8461, 38, 8870, 118, 44, 120, 10796, 118, 87, 7911, 118, 48, 7869, 22121, 7862, 51, 8461, 38, 8870, 118, 44, 120, 87, 8318, 118, 87, 8195, 118, 8069, 121, 7842, 115, 55, 43, 41]
Decoded tokens: ['F', '(', 'x', ')=', '{', '\\', 'sqrt', '{', '\\', 'frac', '{', '10', 'x', '^{-', '8', '}}', '{-', '8', '}}}', '+', 'C_', '{', '1', '}', '=-', '{', '\\', 'frac', '{', '5', '}{', '4x', '^{', '8', '}}}', '+', 'C_', '{', '1', '}', '\\', 'quad', '{', '\\', 'mathrm', '{', 'if', '~', '}}', 'x', '<', '0', '.']
Tokenized text: [65, 35, 115, 8680, 87, 8364, 118, 8006, 120, 115, 8193, 51, 24915, 51, 9109, 8870, 118, 44, 16802, 87, 7911, 118, 48, 120, 118, 47, 120, 115, 7862, 51, 9109, 8870, 118, 44, 120, 87, 8318, 87, 8195, 118, 8069, 22545, 115, 55, 43, 41]
Decoded tokens: ['F', '(', 'x', ')=', '\\', 'sqrt', '{', '10', '}', 'x', '^{-', '8', '/-', '8', '}+', 'C_', '{', '1', '}=-', '\\', 'frac', '{'