### 08. 임베딩 : 고급편

### 학습 내용
 * 선호하는 커피 예측하기
 * 퍼지 검색하기
 * 임베딩으로 뉴스 카테고리 예측하기
 * 제로샷(Zero-Shot) 분류기의 정확도 평가하기

### 사전 준비
 * 구글 코랩 환경은 일정 시간 이후에 초기화되기 때문에 아래 작업을 매번 수행해야 함.
   * chatgpt.env 파일 생성이 필요.
     * 준비된 chatgpt.env의 내용을 변경하여 업로드하거나, API_KEY와 ORG_ID를 확인하여 새로 생성한다.
   * pip install openai 설치
   * 캐글 데이터 셋 다운로드 후, 업로드
     * https://www.kaggle.com/datasets/schmoyote/coffee-reviews-dataset?select=simplified_coffee.csv
   * data폴더에 데이터 셋 업로드 - data/News_Category_Dataset_v3.json
   * 자연어 처리 라이브러리 nltk 설치 및 다운로드
     * pip install nltk
     * nltk.download('stopwords')
     * nltk.download('punkt')

In [1]:
!pip install openai
!pip install nltk

Collecting openai
  Downloading openai-0.27.10-py3-none-any.whl (76 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m71.7/76.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.10


In [2]:
import nltk
# Fetch the 'stopwords' and 'punkt' NLTK packages; the review preprocessing
# below relies on nltk.corpus.stopwords and nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### 선호하는 커피 예측하기

#### 세개의 함수 만들기
 * init_api() : 인증 초기 설정
 * download_nltk_data() : 말뭉치 다운로드
 * preprocess_review() : 리뷰 텍스트 소문자, 토큰화, 불용어 제거, 어간 처리

In [3]:
# 필수 라이브러리 불러오기
import os
import pandas as pd
import numpy as np
import nltk
import sys
import openai
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import cosine_similarity

def init_api():
    """Load OpenAI credentials from ``chatgpt.env`` and configure the client.

    Each ``KEY=VALUE`` line of the file is exported into ``os.environ``; the
    ``API_KEY`` and ``ORG_ID`` entries are then assigned to ``openai.api_key``
    and ``openai.organization``.

    Blank lines and ``#`` comment lines are skipped, and only the first ``=``
    separates the key from the value, so values containing ``=`` are kept
    intact. Whitespace around keys and values is stripped.

    Raises:
        FileNotFoundError: if ``chatgpt.env`` is not in the working directory.
        ValueError: if a non-blank, non-comment line contains no ``=``.
    """
    with open("chatgpt.env") as env:
        for line in env:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            # Split on the first "=" only; the value itself may contain "=".
            key, value = line.split("=", 1)
            os.environ[key.strip()] = value.strip()
    openai.api_key = os.environ.get("API_KEY")
    openai.organization = os.environ.get("ORG_ID")

def download_nltk_data():
    """Make sure the NLTK resources needed for preprocessing are present.

    Looks up the ``punkt`` tokenizer and then the ``stopwords`` corpus; a
    package is downloaded only when ``nltk.data.find`` raises ``LookupError``.
    """
    required = (
        ('tokenizers/punkt', 'punkt'),
        ('corpora/stopwords', 'stopwords'),
    )
    for resource_path, package in required:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package)

def preprocess_review(review):
    """Normalize a review: lowercase, tokenize, drop stop words, stem.

    Args:
        review: the review text (a string).

    Returns:
        The remaining stemmed tokens joined by single spaces.
    """
    from nltk.corpus import stopwords as stopwords_corpus
    from nltk.stem import PorterStemmer

    english_stopwords = set(stopwords_corpus.words('english'))
    porter = PorterStemmer()

    # Filter out stop words first, then stem what is left.
    stemmed = [
        porter.stem(word)
        for word in nltk.word_tokenize(review.lower())
        if word not in english_stopwords
    ]
    return ' '.join(stemmed)

In [4]:
# Configure the OpenAI credentials, then make sure the NLTK data is available.
init_api()
download_nltk_data()

In [7]:
# Ask the user for a coffee name.
input_coffee_name = input("Enter a coffee name: ")

# Load the CSV file (only the first 50 rows, to keep the demo fast).
df = pd.read_csv('simplified_coffee.csv', nrows=50)

# Look up the row of the requested coffee before spending any API calls.
# Only IndexError (no matching row) means "coffee not found"; any other
# error is a real problem and is allowed to propagate.
try:
    input_coffee_index = df[df['name'] == input_coffee_name].index[0]
except IndexError:
    print("죄송합니다. 입력한 커피를 찾을 수가 없습니다.")
    sys.exit()

# Preprocess each review: lowercase, tokenize, remove stop words, stem.
df['preprocessed_review'] = df['review'].apply(preprocess_review)

# Fetch an embedding for every preprocessed review.
review_embeddings = [
    get_embedding(review, engine='text-embedding-ada-002')
    for review in df['preprocessed_review']
]

# Cosine similarity between the input coffee's review and every review.
input_review_embedding = review_embeddings[input_coffee_index]
similarities = [
    cosine_similarity(input_review_embedding, review_embedding)
    for review_embedding in review_embeddings
]

# Indices of the most similar reviews. argsort is ascending, so the last
# position is expected to be the input review itself (compared with itself)
# and is dropped; the five before it are kept, least similar first.
most_similar_indices = np.argsort(similarities)[-6:-1]

# Names of the most similar coffees.
similar_coffee_names = df.iloc[most_similar_indices]['name'].tolist()

# Print the result.
print("The most similar coffees to {} are:".format(input_coffee_name))
for coffee_name in similar_coffee_names:
    print(coffee_name)


Enter a coffee name: Ethiopia Yirgacheffe
The most similar coffees to Ethiopia Yirgacheffe are:
Kenya Gicherori
Estate Carbonic Maceration
“Naturals”
Finca Merced Guatemala
Civet Yirgacheffe Sisota


#### np.argsort(similarities) 이해

In [10]:
# Toy scores: np.argsort returns the indices that sort them in ascending order.
similarities = [0.8, 0.5, 0.9, 0.6, 0.7, 0.4, 0.3, 0.2, 0.1, 0.0]
np.argsort(similarities)

array([9, 8, 7, 6, 5, 1, 3, 4, 0, 2])

In [11]:
# Show the first three scores so they can be compared with the argsort result.
print(similarities[0], similarities[1], similarities[2])

0.8 0.5 0.9


In [12]:
# Indices of the 2nd- through 6th-highest scores, in ascending score order
# (the index of the single highest score is dropped by the -1 bound).
np.argsort(similarities)[-6:-1]

array([5, 1, 3, 4, 0])

#### 개선된 전체 코드

In [14]:
# 필수 라이브러리 불러오기
import os
import pandas as pd
import numpy as np
import nltk
import sys
import openai
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import cosine_similarity

def init_api():
    """Load OpenAI credentials from ``chatgpt.env`` and configure the client.

    Each ``KEY=VALUE`` line of the file is exported into ``os.environ``; the
    ``API_KEY`` and ``ORG_ID`` entries are then assigned to ``openai.api_key``
    and ``openai.organization``.

    Blank lines and ``#`` comment lines are skipped, and only the first ``=``
    separates the key from the value, so values containing ``=`` are kept
    intact. Whitespace around keys and values is stripped.

    Raises:
        FileNotFoundError: if ``chatgpt.env`` is not in the working directory.
        ValueError: if a non-blank, non-comment line contains no ``=``.
    """
    with open("chatgpt.env") as env:
        for line in env:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            # Split on the first "=" only; the value itself may contain "=".
            key, value = line.split("=", 1)
            os.environ[key.strip()] = value.strip()
    openai.api_key = os.environ.get("API_KEY")
    openai.organization = os.environ.get("ORG_ID")

def download_nltk_data():
    """Make sure the NLTK resources needed for preprocessing are present.

    Looks up the ``punkt`` tokenizer and then the ``stopwords`` corpus; a
    package is downloaded only when ``nltk.data.find`` raises ``LookupError``.
    """
    required = (
        ('tokenizers/punkt', 'punkt'),
        ('corpora/stopwords', 'stopwords'),
    )
    for resource_path, package in required:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package)

def preprocess_review(review):
    """Normalize a review: lowercase, tokenize, drop stop words, stem.

    Args:
        review: the review text (a string).

    Returns:
        The remaining stemmed tokens joined by single spaces.
    """
    from nltk.corpus import stopwords as stopwords_corpus
    from nltk.stem import PorterStemmer

    english_stopwords = set(stopwords_corpus.words('english'))
    porter = PorterStemmer()

    # Filter out stop words first, then stem what is left.
    stemmed = [
        porter.stem(word)
        for word in nltk.word_tokenize(review.lower())
        if word not in english_stopwords
    ]
    return ' '.join(stemmed)

# Configure the OpenAI credentials and make sure the NLTK data is available.
init_api()
download_nltk_data()

# Ask the user for a coffee name.
input_coffee_name = input("Enter a coffee name: ")

# Load the CSV file (only the first 50 rows, to keep the demo fast).
df = pd.read_csv('simplified_coffee.csv', nrows=50)

# Preprocess each review: lowercase, tokenize, remove stop words, stem.
df['preprocessed_review'] = df['review'].apply(preprocess_review)

# Fetch an embedding for every preprocessed review.
review_embeddings = [
    get_embedding(review, engine='text-embedding-ada-002')
    for review in df['preprocessed_review']
]

# Find the row of the requested coffee.
try:
    input_coffee_index = df[df['name'] == input_coffee_name].index[0]
except IndexError:
    # No exact match: fall back to the coffee whose *name* embedding is
    # closest to the embedding of what the user typed.
    print("Sorry, we don't have that coffee name in our database. We'll try to find the closest match.")
    name_embeddings = [
        get_embedding(name, engine='text-embedding-ada-002')
        for name in df['name']
    ]

    # Cosine similarity between the typed name and every known coffee name.
    input_coffee_embedding = get_embedding(input_coffee_name, engine='text-embedding-ada-002')
    _similarities = [
        cosine_similarity(input_coffee_embedding, name_embedding)
        for name_embedding in name_embeddings
    ]

    # Position of the closest coffee name.
    input_coffee_index = _similarities.index(max(_similarities))
except Exception:
    # Any other lookup failure: report and stop. `Exception` (not a bare
    # `except`) keeps KeyboardInterrupt/SystemExit from being swallowed, and
    # sys.exit() is used because the `exit` helper is not guaranteed to exist.
    print("Sorry, we don't have that coffee name in our database. Please try again.")
    sys.exit()

# Cosine similarity between the selected coffee's review and every review.
input_review_embedding = review_embeddings[input_coffee_index]
similarities = [
    cosine_similarity(input_review_embedding, review_embedding)
    for review_embedding in review_embeddings
]

# Indices of the most similar reviews. argsort is ascending, so the last
# position is expected to be the selected review itself and is dropped;
# the five before it are kept, least similar first.
most_similar_indices = np.argsort(similarities)[-6:-1]

# Names of the most similar coffees.
similar_coffee_names = df.iloc[most_similar_indices]['name'].tolist()

# Print the result.
print("The most similar coffees to {} are:".format(input_coffee_name))
for coffee_name in similar_coffee_names:
    print(coffee_name)


Enter a coffee name: Ethiopia Yirgacheffe
The most similar coffees to Ethiopia Yirgacheffe are:
Kenya Gicherori
Estate Carbonic Maceration
“Naturals”
Finca Merced Guatemala
Civet Yirgacheffe Sisota


### 임베딩으로 뉴스 카테고리 예측하기


In [None]:
import os
import openai
import pandas as pd
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import cosine_similarity

def init_api():
    """Load OpenAI credentials from ``chatgpt.env`` and configure the client.

    Each ``KEY=VALUE`` line of the file is exported into ``os.environ``; the
    ``API_KEY`` and ``ORG_ID`` entries are then assigned to ``openai.api_key``
    and ``openai.organization``.

    Blank lines and ``#`` comment lines are skipped, and only the first ``=``
    separates the key from the value, so values containing ``=`` are kept
    intact. Whitespace around keys and values is stripped.

    Raises:
        FileNotFoundError: if ``chatgpt.env`` is not in the working directory.
        ValueError: if a non-blank, non-comment line contains no ``=``.
    """
    with open("chatgpt.env") as env:
        for line in env:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            # Split on the first "=" only; the value itself may contain "=".
            key, value = line.split("=", 1)
            os.environ[key.strip()] = value.strip()

    openai.api_key = os.environ.get("API_KEY")
    openai.organization = os.environ.get("ORG_ID")

# Configure the OpenAI credentials before any embedding request is made.
init_api()


In [15]:
# Candidate category labels that classify_sentence picks from.
categories = [
    'U.S. NEWS',
    'COMEDY',
    'PARENTING',
    'WORLD NEWS',
    'CULTURE & ARTS',
    'TECH',
    'SPORTS'
    ]

# Cache of category label -> embedding, so each label is sent to the
# embedding API once instead of once per classified sentence.
_category_embedding_cache = {}


def _get_category_embedding(category):
    """Return the embedding of a category label, fetching it on first use."""
    if category not in _category_embedding_cache:
        _category_embedding_cache[category] = get_embedding(
            category, engine="text-embedding-ada-002"
        )
    return _category_embedding_cache[category]


# Define the function that classifies a sentence.
def classify_sentence(sentence):
    """Return the category whose embedding is most similar to the sentence.

    The sentence is embedded with ``text-embedding-ada-002`` and compared, by
    cosine similarity, with the embedding of every label in the module-level
    ``categories`` list.

    Args:
        sentence: the text to classify.

    Returns:
        The label from ``categories`` with the highest similarity score.

    Raises:
        ValueError: if ``categories`` is empty.
    """
    # Embed the sentence.
    sentence_embedding = get_embedding(sentence, engine="text-embedding-ada-002")
    # Score the sentence against every category.
    similarity_scores = {
        category: cosine_similarity(sentence_embedding, _get_category_embedding(category))
        for category in categories
    }
    # Return the category with the highest similarity score.
    return max(similarity_scores, key=similarity_scores.get)


#### 문장 분류하기

In [16]:
# Classify a handful of sample headlines.
sentences = [
    "1 dead and 3 injured in El Paso, Texas, mall shooting",
    "Director Owen Kline Calls Funny Pages His ‘Self-Critical’ Debut",
    "15 spring break ideas for families that want to get away",
    "The US is preparing to send more troops to the Middle East",
    "Bruce Willis' 'condition has progressed' to frontotemporal dementia, his family says",
    "Get an inside look at Universal’s new Super Nintendo World",
    "Barcelona 2-2 Manchester United: Marcus Rashford shines but Raphinha salvages draw for hosts",
    "Chicago bulls win the NBA championship",
    "The new iPhone 12 is now available",
    "Scientists discover a new dinosaur species",
    "The new coronavirus vaccine is now available",
    "The new Star Wars movie is now available",
    "Amazon stock hits a new record high",
]

# One result per headline (padded to 50 columns), each followed by a blank line.
for sentence in sentences:
    print(f"{sentence:50} category is {classify_sentence(sentence)}\n")


1 dead and 3 injured in El Paso, Texas, mall shooting category is WORLD NEWS

Director Owen Kline Calls Funny Pages His ‘Self-Critical’ Debut category is COMEDY

15 spring break ideas for families that want to get away category is PARENTING

The US is preparing to send more troops to the Middle East category is WORLD NEWS

Bruce Willis' 'condition has progressed' to frontotemporal dementia, his family says category is WORLD NEWS

Get an inside look at Universal’s new Super Nintendo World category is WORLD NEWS

Barcelona 2-2 Manchester United: Marcus Rashford shines but Raphinha salvages draw for hosts category is SPORTS

Chicago bulls win the NBA championship             category is SPORTS

The new iPhone 12 is now available                 category is TECH

Scientists discover a new dinosaur species         category is WORLD NEWS

The new coronavirus vaccine is now available       category is WORLD NEWS

The new Star Wars movie is now available           category is WORLD NEWS

Amazo

#### 카테고리 분류 최종 코드

In [17]:
# 필수 라이브러리 불러오기
import os
import openai
import pandas as pd
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import cosine_similarity

def init_api():
    """Load OpenAI credentials from ``chatgpt.env`` and configure the client.

    Each ``KEY=VALUE`` line of the file is exported into ``os.environ``; the
    ``API_KEY`` and ``ORG_ID`` entries are then assigned to ``openai.api_key``
    and ``openai.organization``.

    Blank lines and ``#`` comment lines are skipped, and only the first ``=``
    separates the key from the value, so values containing ``=`` are kept
    intact. Whitespace around keys and values is stripped.

    Raises:
        FileNotFoundError: if ``chatgpt.env`` is not in the working directory.
        ValueError: if a non-blank, non-comment line contains no ``=``.
    """
    with open("chatgpt.env") as env:
        for line in env:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            # Split on the first "=" only; the value itself may contain "=".
            key, value = line.split("=", 1)
            os.environ[key.strip()] = value.strip()
    openai.api_key = os.environ.get("API_KEY")
    openai.organization = os.environ.get("ORG_ID")

# Configure the OpenAI credentials before any embedding request is made.
init_api()

# Candidate category labels that classify_sentence picks from.
categories = ["POLITICS", "WELLNESS", "ENTERTAINMENT", "TRAVEL", "STYLE & BEAUTY",
              "PARENTING", "HEALTHY LIVING", "QUEER VOICES", "FOOD & DRINK", "BUSINESS",
              "COMEDY", "SPORTS", "BLACK VOICES", "HOME & LIVING", "PARENTS"]

# Cache of category label -> embedding, so each label is sent to the
# embedding API once instead of once per classified sentence.
_category_embedding_cache = {}


def _get_category_embedding(category):
    """Return the embedding of a category label, fetching it on first use."""
    if category not in _category_embedding_cache:
        _category_embedding_cache[category] = get_embedding(
            category, engine="text-embedding-ada-002"
        )
    return _category_embedding_cache[category]


# Define the function that classifies a sentence.
def classify_sentence(sentence):
    """Return the category whose embedding is most similar to the sentence.

    The sentence is embedded with ``text-embedding-ada-002`` and compared, by
    cosine similarity, with the embedding of every label in the module-level
    ``categories`` list.

    Args:
        sentence: the text to classify.

    Returns:
        The label from ``categories`` with the highest similarity score.

    Raises:
        ValueError: if ``categories`` is empty.
    """
    # Embed the sentence.
    sentence_embedding = get_embedding(sentence, engine="text-embedding-ada-002")

    # Score the sentence against every category.
    similarity_scores = {
        category: cosine_similarity(sentence_embedding, _get_category_embedding(category))
        for category in categories
    }

    # Return the category with the highest similarity score.
    return max(similarity_scores, key=similarity_scores.get)

# Classify a handful of sample headlines.
sentences = [
    "1 dead and 3 injured in El Paso, Texas, mall shooting",
    "Director Owen Kline Calls Funny Pages His 'Self-Critical' Debut",
    "15 spring break ideas for families that want to get away",
    "The US is preparing to send more troops to the Middle East",
    "Bruce Willis' 'condition has progressed' to frontotemporal dementia, his family says",
    "Get an inside look at Universal's new Super Nintendo World",
    "Barcelona 2-2 Manchester United: Marcus Rashford shines but Raphinha salvages draw for hosts",
    "Chicago bulls win the NBA championship",
    "The new iPhone 12 is now available",
    "Scientists discover a new dinosaur species",
    "The new coronavirus vaccine is now available",
    "The new Star Wars movie is now available",
    "Amazon stock hits a new record high",
]

# One result line per headline, padded to 50 columns.
for sentence in sentences:
    print(f"{sentence:50} category is {classify_sentence(sentence)}")

# Single trailing blank line after the whole listing.
print()


1 dead and 3 injured in El Paso, Texas, mall shooting category is ENTERTAINMENT
Director Owen Kline Calls Funny Pages His 'Self-Critical' Debut category is COMEDY
15 spring break ideas for families that want to get away category is TRAVEL
The US is preparing to send more troops to the Middle East category is TRAVEL
Bruce Willis' 'condition has progressed' to frontotemporal dementia, his family says category is WELLNESS
Get an inside look at Universal's new Super Nintendo World category is ENTERTAINMENT
Barcelona 2-2 Manchester United: Marcus Rashford shines but Raphinha salvages draw for hosts category is SPORTS
Chicago bulls win the NBA championship             category is SPORTS
The new iPhone 12 is now available                 category is BUSINESS
Scientists discover a new dinosaur species         category is TRAVEL
The new coronavirus vaccine is now available       category is WELLNESS
The new Star Wars movie is now available           category is ENTERTAINMENT
Amazon stock hits a

### 제로샷(Zero-Shot) 분류기의 정확도 평가하기

#### 캐글 데이터 셋 - 제로샷(Zero-Shot)

#### 데이터 읽기

In [19]:
# Read the line-delimited JSON dataset and keep its first 20 rows;
# the bare `df` expression displays the frame in the notebook.
df = pd.read_json("data/News_Category_Dataset_v3.json", lines = True ).head(20)
df

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
5,https://www.huffpost.com/entry/belk-worker-fou...,Cleaner Was Dead In Belk Bathroom For 4 Days B...,U.S. NEWS,The 63-year-old woman was seen working at the ...,,2022-09-22
6,https://www.huffpost.com/entry/reporter-gets-a...,Reporter Gets Adorable Surprise From Her Boyfr...,U.S. NEWS,"""Who's that behind you?"" an anchor for New Yor...",Elyse Wanshel,2022-09-22
7,https://www.huffpost.com/entry/puerto-rico-wat...,Puerto Ricans Desperate For Water After Hurric...,WORLD NEWS,More than half a million people remained witho...,"DÁNICA COTO, AP",2022-09-22
8,https://www.huffpost.com/entry/mija-documentar...,How A New Documentary Captures The Complexity ...,CULTURE & ARTS,"In ""Mija,"" director Isabel Castro combined mus...",Marina Fang,2022-09-22
9,https://www.huffpost.com/entry/biden-un-russia...,Biden At UN To Call Russian War An Affront To ...,WORLD NEWS,White House officials say the crux of the pres...,"Aamer Madhani, AP",2022-09-21


In [20]:
from sklearn.metrics import precision_score

def evaluate_precision(categories, *, verbose=False):
    """Measure the zero-shot classifier's precision on the first 20 headlines.

    Reads the first 20 records of ``data/News_Category_Dataset_v3.json``,
    classifies each headline with ``classify_sentence`` and compares the
    prediction with the record's ``category`` field.

    Args:
        categories: labels passed to ``precision_score`` as ``labels``; only
            these labels are taken into account by the score.
        verbose: when true, print one line per headline stating whether the
            prediction was correct, with the true and predicted categories.

    Returns:
        The micro-averaged precision as computed by
        ``sklearn.metrics.precision_score``.
    """
    # Load the dataset. nrows stops after 20 lines instead of parsing the
    # whole file and discarding everything but the head.
    df = pd.read_json("data/News_Category_Dataset_v3.json", lines=True, nrows=20)

    # Classify every headline.
    y_true = df['category'].tolist()
    y_pred = [classify_sentence(headline) for headline in df['headline']]

    if verbose:
        for headline, true_category, predicted_category in zip(df['headline'], y_true, y_pred):
            outcome = "True" if true_category == predicted_category else "False"
            print("{} prediction: {:50} True: {:20} Pred: {:20}".format(
                outcome, headline, true_category, predicted_category))

    # Compute the precision score.
    return precision_score(y_true, y_pred, average='micro', labels=categories)


#### 제로샷(Zero-Shot) 분류기 최종 코드

In [21]:
# 필수 라이브러리 불러오기
import os
import openai
import pandas as pd
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import cosine_similarity
from sklearn.metrics import precision_score

def init_api():
    """Load OpenAI credentials from ``chatgpt.env`` and configure the client.

    Each ``KEY=VALUE`` line of the file is exported into ``os.environ``; the
    ``API_KEY`` and ``ORG_ID`` entries are then assigned to ``openai.api_key``
    and ``openai.organization``.

    Blank lines and ``#`` comment lines are skipped, and only the first ``=``
    separates the key from the value, so values containing ``=`` are kept
    intact. Whitespace around keys and values is stripped.

    Raises:
        FileNotFoundError: if ``chatgpt.env`` is not in the working directory.
        ValueError: if a non-blank, non-comment line contains no ``=``.
    """
    with open("chatgpt.env") as env:
        for line in env:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            # Split on the first "=" only; the value itself may contain "=".
            key, value = line.split("=", 1)
            os.environ[key.strip()] = value.strip()
    openai.api_key = os.environ.get("API_KEY")
    openai.organization = os.environ.get("ORG_ID")

# Configure the OpenAI credentials before any embedding request is made.
init_api()

# Candidate category labels that classify_sentence picks from.
categories = [ "POLITICS", "WELLNESS" , "ENTERTAINMENT" , "TRAVEL" , "STYLE & BEAUTY" ,
              "PARENTING", "HEALTHY LIVING", "QUEER VOICES", "FOOD & DRINK", "BUSINESS",
               "COMEDY", "SPORTS", "BLACK VOICES", "HOME & LIVING", "PARENTS", ]

# Cache of category label -> embedding, so each label is sent to the
# embedding API once instead of once per classified sentence.
_category_embedding_cache = {}


def _get_category_embedding(category):
    """Return the embedding of a category label, fetching it on first use."""
    if category not in _category_embedding_cache:
        _category_embedding_cache[category] = get_embedding(
            category, engine="text-embedding-ada-002"
        )
    return _category_embedding_cache[category]


# Define the function that classifies a sentence.
def classify_sentence(sentence):
    """Return the category whose embedding is most similar to the sentence.

    The sentence is embedded with ``text-embedding-ada-002`` and compared, by
    cosine similarity, with the embedding of every label in the module-level
    ``categories`` list.

    Args:
        sentence: the text to classify.

    Returns:
        The label from ``categories`` with the highest similarity score.

    Raises:
        ValueError: if ``categories`` is empty.
    """
    # Embed the sentence.
    sentence_embedding = get_embedding(sentence, engine="text-embedding-ada-002")

    # Score the sentence against every category.
    similarity_scores = {
        category: cosine_similarity(sentence_embedding, _get_category_embedding(category))
        for category in categories
    }

    # Return the category with the highest similarity score.
    return max(similarity_scores, key=similarity_scores.get)

def evaluate_precision(categories, *, verbose=False):
    """Measure the zero-shot classifier's precision on the first 20 headlines.

    Reads the first 20 records of ``data/News_Category_Dataset_v3.json``,
    classifies each headline with ``classify_sentence`` and compares the
    prediction with the record's ``category`` field.

    Args:
        categories: labels passed to ``precision_score`` as ``labels``; only
            these labels are taken into account by the score.
        verbose: when true, print one line per headline stating whether the
            prediction was correct, with the true and predicted categories.

    Returns:
        The micro-averaged precision as computed by
        ``sklearn.metrics.precision_score``.
    """
    # Load the dataset. nrows stops after 20 lines instead of parsing the
    # whole file and discarding everything but the head.
    df = pd.read_json("data/News_Category_Dataset_v3.json", lines=True, nrows=20)

    # Classify every headline.
    y_true = df['category'].tolist()
    y_pred = [classify_sentence(headline) for headline in df['headline']]

    if verbose:
        for headline, true_category, predicted_category in zip(df['headline'], y_true, y_pred):
            outcome = "True" if true_category == predicted_category else "False"
            print("{} prediction: {:50} True: {:20} Pred: {:20}".format(
                outcome, headline, true_category, predicted_category))

    # Compute the precision score.
    return precision_score(y_true, y_pred, average='micro', labels=categories)

# Evaluate the classifier and report its precision with two decimals.
precision_evaluated = evaluate_precision(categories)
print(f"Precision: {precision_evaluated:.2f} ")

Precision: 0.15 
