In [30]:
# 사전 설치 : pip install konlpy
from konlpy.tag import Okt
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

In [31]:
# 1. 텍스트 데이터 (입력 문장)
sentences = [
    "자연어 처리는 재미있는 분야입니다.",
    "딥러닝은 많은 데이터를 필요로 합니다.",
    "한국어 NLP는 정말 재미있어요!"
]

In [33]:
# 2. 토크나이징 (Tokenizing)
okt = Okt()
tokenized_sentences = [okt.morphs(sentence) for sentence in sentences]
print("토크나이징 결과 : ", tokenized_sentences)

토크나이징 결과 :  [['자연어', '처리', '는', '재미있는', '분야', '입니다', '.'], ['딥', '러닝', '은', '많은', '데이터', '를', '필요', '로', '합니다', '.'], ['한국어', 'NLP', '는', '정말', '재미있어요', '!']]


In [4]:
# 3. 인코딩 (Encoding) : 단어를 숫자로 변환
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_sentences)
encoded_sentences = tokenizer.texts_to_sequences(tokenized_sentences)
print("인코딩 결과 : ", encoded_sentences)

인코딩 결과 :  [[3, 4, 1, 5, 6, 7, 2], [8, 9, 10, 11, 12, 13, 14, 15, 16, 2], [17, 18, 1, 19, 20, 21]]


In [5]:
# 4. 패딩 (Padding) : 길이를 맞추기 위해 0으로 채우기
max_len = 10    # 최대 길이 설정
padded_sentences = pad_sequences(encoded_sentences, maxlen = max_len, padding = 'post')
print("패딩 결과 : ", padded_sentences)

패딩 결과 :  [[ 3  4  1  5  6  7  2  0  0  0]
 [ 8  9 10 11 12 13 14 15 16  2]
 [17 18  1 19 20 21  0  0  0  0]]


In [6]:
# 5. 임베딩 (Embedding)
vocab_size = len(tokenizer.word_index) + 1  # 단어 사전 크기
Embedding_dim = 8   # 임베딩 차원 크기

In [7]:
# 간단한 임베딩 모델 생성
model = Sequential()
model.add(Embedding(input_dim = vocab_size, output_dim = Embedding_dim, input_length = max_len))
model.compile('rmsprop', 'mse')



In [39]:
# 패딩된 문장을 임베딩 층에 통과
embeddings = model.predict(padded_sentences)
print("임베딩 결과 (첫 번째 문장) : \n", embeddings[0][2], "임베딩 결과 (두 번째 문장) : \n", embeddings[1][2])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
임베딩 결과 (첫 번째 문장) : 
 [ 0.03999898 -0.04633672  0.00698692  0.0033647  -0.00718323  0.00613239
 -0.04878513 -0.03539188] 임베딩 결과 (두 번째 문장) : 
 [ 0.03949689 -0.01627075 -0.02212104 -0.0348159   0.03047461 -0.00944414
 -0.00941789 -0.0404264 ]


In [15]:
# 감정분석 (zero-shot classification)
# transformers 라이브러리 사전 설치
# tf-keras 라이브러리 사전 설치
from transformers import pipeline

In [14]:
classifier = pipeline("sentiment-analysis")
classifier("오늘 기분이 좋아요")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
Device set to use 0


[{'label': 'POSITIVE', 'score': 0.8848785758018494}]

In [13]:
classifier(
    ["I've been waiting for a HuggingFace course my whole life.", "I hate this so much!"]
)

[{'label': 'POSITIVE', 'score': 0.9598047137260437},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

In [17]:
# 텍스트 생성 (text generation)
generator = pipeline("text-generation")
generator("In this course, we will teach you how to")

No model was supplied, defaulted to openai-community/gpt2 and revision 607a30d (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
Device set to use 0


[{'generated_text': 'In this course, we will teach you how to perform the key performance tests that will provide you the best results. This will improve your confidence and focus in the future. The test is free.\n\nCourse notes\n\nPlease consider submitting our course'}]

In [19]:
# question-answering
question_answerer = pipeline("question-answering")
question_answerer(
    question = "Where do I work?",
    context = "My name is Sylvain and I work at Hugging Face in Brooklyn"
)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForQuestionAnswering.

All the weights of TFDistilBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.
Device set to use 0


{'score': 0.6949762105941772, 'start': 33, 'end': 45, 'answer': 'Hugging Face'}

In [23]:
# 요약(Summarization)
summarizer = pipeline("summarization")
summarizer(
    """
    America has changed dramatically during recent years. Not only has the number of
    graduates in traditional engineering disciplines such as mechanical, civil,
    electrical, chemical, and aeronautical engineering declined, but in most of
    the premier American universities engineering curricula now concentrate on
    and encourage largely the study of engineering science. As a result, there
    are declining offerings in engineering subjects dealing with infrastructure,
    the environment, and related issues, and greater concentration on high
    technology subjects, largely supporting increasingly complex scientific
    developments. While the latter is important, it should not be at the expense
    of more traditional engineering.

    Rapidly developing economies such as China and India, as well as other
    industrial countries in Europe and Asia, continue to encourage and advance
    the teaching of engineering. Both China and India, respectively, graduate
    six and eight times as many traditional engineers as does the United States.
    Other industrial countries at minimum maintain their output, while America
    suffers an increasingly serious decline in the number of engineering graduates
    and a lack of well-educated engineers.
"""
)

No model was supplied, defaulted to google-t5/t5-small and revision df1b051 (https://huggingface.co/google-t5/t5-small).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Device set to use 0


[{'summary_text': 'the number of graduates in traditional engineering disciplines has declined . in most of the premier american universities engineering curricula now concentrate on and encourage largely the study of engineering science . rapidly developing economies such as China and India continue to encourage and advance the teaching of engineering .'}]

In [40]:
def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

# 예시 벡터
vector1 = [ 0.03999898, -0.04633672,  0.00698692,  0.0033647,  -0.00718323,  0.00613239, -0.04878513, -0.03539188 ]
vector2 = [ 0.03949689, -0.01627075, -0.02212104, -0.0348159,   0.03047461, -0.00944414, -0.00941789, -0.0404264 ]

similarity = cosine_similarity(vector1, vector2)
print(f"유사도: {similarity}")

유사도: 0.5350714735262023


In [1]:
# 사전설치 : pip install faiss-cpu
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_community.embeddings import HuggingFaceEmbeddings

In [3]:
vectorstore = FAISS.from_texts(
    [
        "영준은 랭체인 주식회사에서 근무를 하였습니다.",
        "설현은 테디와 같은 회사에서 근무하였습니다.",
        "영준의 직업은 개발자입니다.",
        "설현의 직업은 디자이너입니다.",
    ],
    embedding = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask'),
)

In [5]:
retriever = vectorstore.as_retriever()  # vectorstore 에 저장한 retriever 가져오기

In [6]:
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

In [7]:
from langchain_community.llms import Ollama