## LLM(거대언어모델)

#### NLP(자연어처리)

In [None]:
from konlpy.tag import Okt
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

In [2]:
# 1. Text data (input sentences): three short Korean example sentences that
# the tokenizing, encoding, padding and embedding cells below operate on.
sentences = [
    "자연어 처리는 재미있는 분야입니다.",
    "딥러닝은 많은 데이터를 필요로 합니다.",
    "한국어 NLP는 정말 재미있어요!"
]

In [None]:
# 2. Tokenizing: run every input sentence through the Okt tagger's `morphs`
# method, producing one token list per sentence.
okt = Okt()
tokenized_sentences = list(map(okt.morphs, sentences))
print("토크나이징 결과:", tokenized_sentences)

In [None]:
# 3. Encoding: convert each token into an integer id.
# fit_on_texts is called first on the token lists so the tokenizer has seen
# them, then texts_to_sequences turns the same token lists into number lists.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_sentences)
encoded_sentences = tokenizer.texts_to_sequences(tokenized_sentences)
print("인코딩 결과:", encoded_sentences)

In [None]:
# 4. Padding: bring every encoded sentence to the same length by filling with 0.
# padding='post' asks for the fill values to go after the real tokens.
max_len = 10 # maximum length setting
padded_sentences = pad_sequences(encoded_sentences, maxlen=max_len, padding='post')
print("패딩 결과:", padded_sentences)

In [None]:
# 5. Embedding: hyperparameters for the embedding layer built below.
# NOTE(review): the +1 presumably reserves index 0, which the padding step
# uses as its fill value — confirm against the Keras Tokenizer docs.
vocab_size = len(tokenizer.word_index) + 1 # vocabulary size
embedding_dim = 8 # embedding dimension

In [None]:
# Build a minimal model whose only layer is an Embedding lookup, then compile
# it so it can be used for prediction.
# NOTE(review): recent Keras releases reportedly deprecate `input_length` —
# confirm against the installed version before touching it.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_len,
    ),
])
model.compile(optimizer='rmsprop', loss='mse')

In [None]:
# Pass the padded sentences through the embedding model and show the result
# for the first sentence.
embeddings = model.predict(padded_sentences)
# The array has to be an argument of print(). With the closing parenthesis
# placed before it, only the label was printed and the statement evaluated to
# a (None, array) tuple instead of printing the embedding.
print("임베딩 결과 (첫 번째 문장):\n", embeddings[0])

## 트랜스포머(Transformer)

#### Hugging Face를 사용한 BERT 테스트

In [None]:
from transformers import pipeline

In [None]:
# Create a sentiment-analysis pipeline and classify one Korean sentence.
# No model is named here, so the library picks its default for the task.
# NOTE(review): that default is presumably trained on English text, so the
# score for this Korean input may not be meaningful — confirm, or pass an
# explicit Korean/multilingual model.
classifier=pipeline("sentiment-analysis")
classifier("오늘 기분이 좋아요")

In [None]:
# Classify several sentences in a single call by passing a list of strings.
classifier(
    ["I've been waiting for a HuggingFace course my whole life.", "I hate this so much!"]
)

In [None]:
# Text generation: build a text-generation pipeline and feed it a prompt.
# No model is named here, so the library picks its default for the task.
generator = pipeline("text-generation")
generator("In this course, we will teach you how to")

In [None]:
# Question answering: ask `question` against the supplied `context` text.
# No model is named here, so the library picks its default for the task.
question_answerer = pipeline("question-answering")
question_answerer(
    question="Where do I work?",
    context="My name is Sylvain and I work at Hugging Face in Brooklyn",
)

In [15]:
# Summarization: build a summarization pipeline and pass it a two-paragraph
# English passage as a single string.
# No model is named here, so the library picks its default for the task.
summarizer = pipeline("summarization")
summarizer(
    """
    America has changed dramatically during recent years. Not only has the number of
    graduates in traditional engineering disciplines such as mechanical, civil,
    electrical, chemical, and aeronautical engineering declined, but in most of
    the premier American universities engineering curricula now concentrate on
    and encourage largely the study of engineering science. As a result, there
    are declining offerings in engineering subjects dealing with infrastructure,
    the environment, and related issues, and greater concentration on high
    technology subjects, largely supporting increasingly complex scientific
    developments. While the latter is important, it should not be at the expense
    of more traditional engineering.

    Rapidly developing economies such as China and India, as well as other
    industrial countries in Europe and Asia, continue to encourage and advance
    the teaching of engineering. Both China and India, respectively, graduate
    six and eight times as many traditional engineers as does the United States.
    Other industrial countries at minimum maintain their output, while America
    suffers an increasingly serious decline in the number of engineering graduates
    and a lack of well-educated engineers.
    """
)

: 

In [None]:
# Feature extraction: run one sentence through a feature-extraction pipeline
# and print part of the returned features.
# NOTE(review): features[0][0] is presumably the vector of the first token of
# the first (only) input — confirm the nesting against the pipeline docs.
feature_extractor = pipeline("feature-extraction")
text = "Transformers are amazing for natural language processing tasks!"
features = feature_extractor(text)
print(features[0][0])

In [None]:
from transformers import pipeline

# Fill-Mask Pipeline: uses the explicitly named bert-base-uncased checkpoint.
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

# Input sentence; [MASK] marks the position the model should fill in.
text = "Transformers are [MASK] for natural language processing tasks."

# Fill the mask.
predictions = fill_mask(text)

# Output: print each prediction's token string with its score (4 decimals).
for prediction in predictions:
    print(f"Option: {prediction['token_str']}, Score: {prediction['score']:.4f}")


In [None]:
from transformers import pipeline

# Named Entity Recognition Pipeline
# `aggregation_strategy="simple"` is the supported replacement for the
# deprecated `grouped_entities=True` flag: it merges the tokens of one entity
# into a single result carrying the 'entity_group', 'word' and 'score' keys
# read by the loop below.
ner = pipeline("ner", aggregation_strategy="simple")

# Input sentence
text = "Hugging Face is based in Brooklyn, New York, and was founded by Sylvain."

# Run entity recognition
entities = ner(text)

# Output: print each recognized entity with its type and score (4 decimals).
for entity in entities:
    print(f"Entity: {entity['word']}, Type: {entity['entity_group']}, Score: {entity['score']:.4f}")


#### langchain 예제

In [1]:
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_community.embeddings import HuggingFaceEmbeddings

In [2]:
# Build an in-memory FAISS vector store from four Korean facts, embedded with
# the jhgan/ko-sroberta-multitask sentence-embedding model.
# NOTE(review): the recorded cell output echoes the `embedding=` line, which
# looks like a deprecation warning for this HuggingFaceEmbeddings import —
# confirm and consider migrating to the currently recommended package.
vectorstore = FAISS.from_texts(
    [
        "영준은 랭체인 주식회사에서 근무를 하였습니다.",
        # The coworker is 영준, the person introduced in the other facts. The
        # previous text named "테디", who appears nowhere else in the corpus,
        # so 설현's employer could not be derived from the stored texts.
        "설현은 영준과 같은 회사에서 근무하였습니다.",
        "영준의 직업은 개발자입니다.",
        "설현의 직업은 디자이너입니다.",
    ],
    embedding=HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask'),
)

  embedding = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask'),





modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/744 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

: 

In [None]:
# Expose the FAISS vector store built above through its retriever interface.
retriever = vectorstore.as_retriever()

In [None]:
# Prompt template with two placeholders: {context} receives the supporting
# text and {question} receives the user's question.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)