In [None]:
from transformers import pipeline, BertModel, BertTokenizer
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from scipy.spatial.distance import cosine
import torch
import os


In [None]:
# Build a Hugging Face text-generation pipeline backed by the GPT-2 checkpoint.
generator = pipeline(task="text-generation", model="gpt2")

In [None]:
# Seed PyTorch's RNG before generating so that any sampling performed by the
# pipeline is repeatable across "Restart Kernel -> Run All" executions.
torch.manual_seed(42)

# Continue the prompt, asking for a single sequence capped at max_length=100.
result = generator("I have a cat", max_length=100, num_return_sequences=1)

In [None]:
# Display the output returned by the text-generation pipeline call.
result

In [None]:
##############################################################################################################

In [None]:
# Build a sentiment-analysis pipeline.
# NOTE: "roberta-base" is only a pretrained language model; it ships without a
# fine-tuned classification head, so the pipeline would attach randomly
# initialised weights and return meaningless LABEL_0 / LABEL_1 scores.
# Use a checkpoint that was actually fine-tuned for sentiment (SST-2) instead.
sentiment_analysis = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
# Run the sentiment pipeline on a single example sentence.
result = sentiment_analysis("I hate you")

In [None]:
# Display the output returned by the sentiment-analysis pipeline call.
result

In [None]:
##############################################################################################################

In [None]:
# Toy corpus used for the Word2Vec and BERT similarity demos below.
# A list (not a set) is used on purpose: later cells index `sentences[0]`,
# and a set has no defined order, so the selected sentence would otherwise
# vary from one kernel session to the next.
# The spelling mistakes in the first and last sentence were also corrected
# so the tokenizers see real words.
sentences = [
    "This quick brown fox jumps over the lazy dog",
    "I love playing with my pet dog",
    "The dog barks at the stranger",
    "The cat sleeps on the sofa",
]

In [None]:
# Tokenise every sentence with gensim's simple_preprocess; the resulting list
# of token lists is the training corpus handed to Word2Vec.
processed = list(map(simple_preprocess, sentences))

processed

In [None]:
# Train a tiny Word2Vec model on the tokenised corpus.
# Training is stochastic, so pin the seed and use a single worker thread;
# otherwise the dog/cat similarity below changes on every run.
# (Per gensim's documentation, fully bit-for-bit reproducibility on Python 3
# additionally requires a fixed PYTHONHASHSEED for the interpreter.)
model = Word2Vec(
    sentences=processed,
    vector_size=5,
    window=5,
    min_count=1,
    seed=42,
    workers=1,
)

In [None]:
# Look up the learned embedding vectors of the two words to compare.
dog, cat = (model.wv[word] for word in ("dog", "cat"))

In [None]:
# scipy's `cosine` is a distance, so subtract it from 1 to get a similarity.
sim = 1 - cosine(u=dog, v=cat)

In [None]:
# Display the dog/cat cosine similarity computed from the Word2Vec vectors.
sim

In [None]:
##############################################################################################################

In [None]:
# Checkpoint identifier shared by the BERT tokenizer and encoder loaded below.
model_name: str = "bert-base-uncased"

In [None]:
# Materialise the sentence collection as a list so it can be indexed by position.
sentences = [*sentences]

In [None]:
# Load the tokenizer and the encoder for the checkpoint named in `model_name`.
tokenizer, model = (
    BertTokenizer.from_pretrained(model_name),
    BertModel.from_pretrained(model_name),
)

In [None]:
# Second sentence collection: a paraphrase used for the similarity comparison
# and as the input of the translation demos. Built directly as a list instead
# of creating a set and converting it afterwards.
sentences2 = ["A fast brown fox leaps over a sleepy dog"]

In [None]:
# Encode the first sentence of each collection as PyTorch tensors.
input1, input2 = (
    tokenizer(text, return_tensors="pt")
    for text in (sentences[0], sentences2[0])
)

In [None]:
# Forward both encodings through the model with gradient tracking disabled.
with torch.no_grad():
    output1, output2 = [model(**encoded) for encoded in (input1, input2)]

In [None]:
# Mean-pool the last hidden state over dim=1, drop singleton dimensions and
# convert each result to a NumPy array.
embedding1, embedding2 = (
    out.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    for out in (output1, output2)
)

In [None]:
# Cosine similarity of the two sentence embeddings (1 minus scipy's distance).
similarity = 1 - cosine(u=embedding1, v=embedding2)

print(similarity)

In [None]:
##############################################################################################################

In [None]:
from transformers import M2M100ForConditionalGeneration,M2M100Tokenizer

In [None]:
# Load the M2M100 translation model and its tokenizer.
model_name = "facebook/m2m100_418M"
tokenizer, model = (
    M2M100Tokenizer.from_pretrained(model_name),
    M2M100ForConditionalGeneration.from_pretrained(model_name),
)

In [None]:
# Tokenize the sentence.
# The source language must be set on the tokenizer *before* encoding, because
# the tokenizer adds the source-language token while it encodes the text.
# Setting it only afterwards works merely by relying on the tokenizer default.
tokenizer.src_lang = "en"
encoded_sentence = tokenizer(sentences2[0], return_tensors="pt")

In [None]:
# Declare the source language on the tokenizer and the target language
# (Korean) as the forced first generated token on the model config.
tokenizer.src_lang = "en"
model.config.forced_bos_token_id = tokenizer.get_lang_id(lang="ko")

In [None]:
# Run the translation.
# The target language is passed to generate() explicitly: generation options
# written to `model.config` after loading are deprecated and may be ignored in
# favour of the model's generation config, which would silently produce output
# in the wrong language.
generated_tokens = model.generate(
    **encoded_sentence,
    forced_bos_token_id=tokenizer.get_lang_id("ko"),
)

In [None]:
# Decode the first generated sequence back to text, dropping special tokens.
translated_text = tokenizer.decode(
    token_ids=generated_tokens[0],
    skip_special_tokens=True,
)

In [None]:
# Print the decoded translation.
print(translated_text)

In [None]:
##############################################################################################################

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Reported to perform better than the model used above.
model_name = "facebook/nllb-200-distilled-600M"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSeq2SeqLM.from_pretrained(model_name),
)

In [None]:
# Tokenize the input sentence into PyTorch tensors.
inputs = tokenizer(text=sentences2[0], return_tensors="pt")

In [None]:
# Force Korean ("kor_Hang") as the target language and run the model.
# The whole encoding is unpacked so that the attention mask produced by the
# tokenizer is forwarded together with the input ids instead of being dropped.
generated_tokens = model.generate(
    **inputs,
    forced_bos_token_id=tokenizer.convert_tokens_to_ids("kor_Hang"),
)

In [None]:
# Decode the first generated sequence back to text, dropping special tokens.
translated_text = tokenizer.decode(
    token_ids=generated_tokens[0],
    skip_special_tokens=True,
)

In [None]:
# Display the decoded translation.
translated_text