In [38]:
from transformers import pipeline, BertModel, BertTokenizer
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from scipy.spatial.distance import cosine
import torch

In [2]:
generator = pipeline("text-generation", model ="gpt2")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [3]:
result = generator("I have a cat", max_length = 100, num_return_sequences = 1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [4]:
result

[{'generated_text': 'I have a cat named Kitten. I am a big fan of cat feline. I own three. We are having a cat named Kitten. I am a big fan of cat feline. I own three. We are having a cat named Kitten. I am a big fan of cat feline. I own three. "Hello Kitty" My cat Kitten was born to have her paw painted with my nameplate. Her kitten was just named "Kitten." "Hello Kitty'}]

In [None]:
##############################################################################################################

In [10]:
sentiment_analysis = pipeline("sentiment-analysis",model="roberta-base")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [11]:
result = sentiment_analysis("I hate you")

In [12]:
result

[{'label': 'LABEL_0', 'score': 0.5050202012062073}]

In [None]:
##############################################################################################################

In [13]:
sentences = {
    "This quickl brown fox jumps over the lazy dog",
    "I love playing with my pet dog",
    "The dog barks at the stranger",
    "The cat slpeeps on the sofa"
}

In [14]:
processed = [simple_preprocess(sentence)for sentence in sentences]

processed

In [17]:
model = Word2Vec(sentences = processed, vector_size=5, window = 5, min_count=1)

In [19]:
dog = model.wv["dog"]
cat = model.wv["cat"]

In [20]:
sim = 1-cosine(dog,cat)

In [21]:
sim

0.61785641367594

In [None]:
##############################################################################################################

In [40]:
model_name = "bert-base-uncased"

In [47]:
sentences = list(sentences)

In [48]:
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [1]:
sentences2 ={
    "A fast brown fox leaps over a sleepy dog"
}

sentences2 = list(sentences2)

In [56]:
input1 = tokenizer(sentences[0], return_tensors='pt')
input2 = tokenizer(sentences2[0], return_tensors='pt')

In [57]:
with torch.no_grad():
    output1 = model(**input1)
    output2 = model(**input2)

In [58]:
embedding1 = output1.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
embedding2 = output2.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

In [59]:
similarity = 1 - cosine(embedding1,embedding2)

print(similarity)

0.7246558113358225


In [None]:
##############################################################################################################

In [60]:
from transformers import M2M100ForConditionalGeneration,M2M100Tokenizer

In [61]:
# 모델 과 토크나이저 불러오기
model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
# 문장 토큰화하기
encoded_sentence = tokenizer(sentences2[0],return_tensors='pt')

In [None]:
# 현재 언어와 번역 언어 지정하기
tokenizer.src_lang = "en"
model.config.forced_bos_token_id = tokenizer.get_lang_id("ko")

In [64]:
# 번역 수행하기
generated_tokens = model.generate(**encoded_sentence)



In [None]:
#번역 결과 디코드 하기
translated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

In [66]:
print(translated_text)

빠른 갈색 엉덩이가 잠자는 개 위로 뛰어들고 있다.


In [None]:
##############################################################################################################

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# 위 모델보다 성능이 좋다고 알려짐
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



In [None]:
# 입력값 토큰화
inputs = tokenizer(sentences2[0],return_tensors='pt')

In [None]:
# 토큰의 언어를 한글로 설정하고 모델을 돌림.
generated_tokens = model.generate(inputs.input_ids, forced_bos_token_id=tokenizer.convert_tokens_to_ids("kor_Hang"))

In [None]:
# 디코딩
translated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

In [20]:
translated_text

'빠른 갈색 여우가 잠든 개를 뛰어넘는다'