In [1]:
%pip install --upgrade --user --quiet google-cloud-aiplatform
%pip install pandas_gbq
%pip install PyPDF2
%pip install textblob

[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
# # Restart kernel after installs so that your environment can access the new packages
# import pandas as pd
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [1]:
# Define project information
# NOTE(review): the trailing `# @param` comments look like Colab form-field markers - keep them if this notebook is used in Colab.
PROJECT_ID = "jc-gcp-project"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Initialize Vertex AI
import vertexai

# Bind the Vertex AI SDK to the project and location defined above
vertexai.init(project=PROJECT_ID, location=LOCATION)

## Import Library

In [2]:
import json
import pandas as pd
import IPython
from IPython.display import display, display_pdf, IFrame
from vertexai.generative_models import (
    GenerativeModel,
    HarmCategory,
    HarmBlockThreshold,
    Part,
)

In [2]:
import pandas_gbq

In [3]:
# Gemini model handle shared by the helper functions in this notebook.
model = GenerativeModel("gemini-1.5-pro-preview-0409")

# Temperature 0.0 and a JSON response MIME type for every request.
generation_config = dict(
    temperature=0.0,
    response_mime_type="application/json",
)

# Map every harm category listed here to the BLOCK_NONE threshold.
safety_settings = {
    category: HarmBlockThreshold.BLOCK_NONE
    for category in (
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_UNSPECIFIED,
    )
}

In [4]:
def get_file_bytes(file_path: str) -> bytes:
    """Return the complete binary contents of the file at ``file_path``."""
    with open(file_path, "rb") as handle:
        payload = handle.read()
    return payload


def get_url_from_gcs(gcs_uri: str) -> str:
    """Turn a ``gs://`` URI into a ``storage.googleapis.com`` URL for display.

    Every ``gs://`` occurrence is removed and every space becomes ``%20``;
    no other character is escaped.
    """
    object_path = gcs_uri.replace("gs://", "")
    escaped_path = object_path.replace(" ", "%20")
    return f"https://storage.googleapis.com/{escaped_path}"


def print_multimodal_prompt(contents: list):
    """
    Render the contents that would be sent to Gemini in a readable form.

    Items that are not ``Part`` instances are printed as-is. A ``Part`` with
    inline data is passed to ``display_pdf``; a ``Part`` with file data is
    shown in a 600x300 ``IFrame`` pointing at its public storage URL. A
    ``Part`` with neither produces no output.
    """
    for item in contents:
        # Plain (non-Part) entries such as the text prompt are simply printed.
        if not isinstance(item, Part):
            print(item)
            continue

        if item.inline_data:
            display_pdf(item.inline_data.data)
        elif item.file_data:
            frame = IFrame(
                get_url_from_gcs(item.file_data.file_uri),
                width=600,
                height=300,
            )
            display(frame)


# Send a document stored in Google Cloud Storage to Gemini
def process_document(
    prompt: str,
    file_uri: str,
    mime_type: str = "text/plain",
    print_prompt: bool = False,
    print_raw_response: bool = False,
) -> str:
    """Send a document plus a text prompt to the model and return its answer.

    Args:
        prompt: Text instruction sent after the document part.
        file_uri: URI of the document, handed to ``Part.from_uri``.
        mime_type: MIME type of the document. Defaults to ``"text/plain"``;
            ``"application/pdf"`` was noted by the author as an alternative.
        print_prompt: When true, echo the multimodal prompt after the call.
        print_raw_response: When true, print the raw response object.

    Returns:
        The ``text`` attribute of the model response.
    """
    # Reference the file by URI instead of loading its bytes locally.
    document_part = Part.from_uri(uri=file_uri, mime_type=mime_type)
    request_parts = [document_part, prompt]

    # Uses the notebook-level model, generation_config and safety_settings.
    result = model.generate_content(
        request_parts,
        generation_config=generation_config,
        safety_settings=safety_settings,
    )

    if print_prompt:
        print("-------Prompt--------")
        print_multimodal_prompt(request_parts)

    if print_raw_response:
        print("\n-------Raw Response--------")
        print(result)

    return result.text

## 설명서 데이터셋

In [7]:
import PyPDF2
from textblob import TextBlob

# Give PdfReader the path rather than an open handle: the previous
# `open(..., 'rb')` was never closed, so every run leaked a file descriptor.
# With a path, PdfReader opens and closes the file itself.
pdf_file = 'air_condition.pdf'
reader = PyPDF2.PdfReader(pdf_file)

# Number of pages in the air-conditioner manual
total_pages = len(reader.pages)
print(total_pages)

122


In [8]:
def divide_with_remainder(dividend, divisor):
    """Return ``(quotient, remainder)`` from floor-dividing ``dividend`` by ``divisor``."""
    return divmod(dividend, divisor)

# Split the page count into complete 1000-page chunks plus the leftover pages
quotient, remainder = divide_with_remainder(total_pages, 1000)
print(quotient)  # number of complete 1000-page chunks
print(remainder)  # pages left over after the complete chunks

0
122


In [9]:
import os
from google.cloud import storage

# Name of the Cloud Storage bucket to use (the project ID comes from PROJECT_ID)
bucket_name = "jc-gcp-project-01"

# Specify the local file path and the Cloud Storage destination path
#local_file_path = "text0.txt"  # Replace with your local file path
#destination_blob_name = "text0.txt"  # Replace with your desired path in Cloud Storage

# Create a Storage Client object
storage_client = storage.Client(project=PROJECT_ID)

# Get a reference to the Cloud Storage bucket
bucket = storage_client.get_bucket(bucket_name)

In [55]:
# Give PdfReader the path rather than an open handle: the previous
# `open(..., 'rb')` was never closed, so every run leaked a file descriptor.
# With a path, PdfReader opens and closes the file itself.
pdf_file = 'air_cleaner.pdf'
reader = PyPDF2.PdfReader(pdf_file)

# Number of pages in the air-cleaner manual
total_pages = len(reader.pages)
print(total_pages)

36


In [56]:
## Air cleaner manual: dump every page's text to a file and collect it in df_save
file_name = "text_total.txt"
page_rows = []
# Explicit UTF-8 so the Korean manual text is written identically on every platform.
with open(file_name, "w", encoding="utf-8") as file:
    # Iterate the reader directly so the loop cannot drift from a stale total_pages.
    for page_no, now_page in enumerate(reader.pages):
        # Extract once per page and reuse the result for both the file and the row.
        page_text = now_page.extract_text()
        file.write(page_text)
        page_rows.append(
            {'page': page_no, 'text_exp': page_text, 'api': 'MANUAL_air_cleaner'}
        )

# Build the frame in one step instead of enlarging it row by row with .loc.
df_save = pd.DataFrame(page_rows, columns=['page', 'text_exp', 'api'])

In [57]:
# Give PdfReader the path rather than an open handle: the previous
# `open(..., 'rb')` was never closed, so every run leaked a file descriptor.
# With a path, PdfReader opens and closes the file itself.
pdf_file = 'air_condition.pdf'
reader = PyPDF2.PdfReader(pdf_file)

# Number of pages in the air-conditioner manual
total_pages = len(reader.pages)
print(total_pages)

122


In [58]:
## Air conditioner manual: dump every page's text to a file and collect it in df_save_1
file_name = "text_total_1.txt"
page_rows = []
# Explicit UTF-8 so the Korean manual text is written identically on every platform.
with open(file_name, "w", encoding="utf-8") as file:
    # Iterate the reader directly so the loop cannot drift from a stale total_pages.
    for page_no, now_page in enumerate(reader.pages):
        # Extract once per page and reuse the result for both the file and the row.
        page_text = now_page.extract_text()
        file.write(page_text)
        page_rows.append(
            {'page': page_no, 'text_exp': page_text, 'api': 'MANUAL_air_condition'}
        )

# Build the frame in one step instead of enlarging it row by row with .loc.
df_save_1 = pd.DataFrame(page_rows, columns=['page', 'text_exp', 'api'])

In [59]:
# Give PdfReader the path rather than an open handle: the previous
# `open(..., 'rb')` was never closed, so every run leaked a file descriptor.
# With a path, PdfReader opens and closes the file itself.
pdf_file = 'smart_tv.pdf'
reader = PyPDF2.PdfReader(pdf_file)

# Number of pages in the smart-TV manual
total_pages = len(reader.pages)
print(total_pages)

147


In [60]:
## Smart TV manual: dump every page's text to a file and collect it in df_save_2
# Own output file: this cell previously wrote to "text_total_1.txt" and so
# overwrote the air-conditioner text dump (copy-paste slip).
file_name = "text_total_2.txt"
page_rows = []
# Explicit UTF-8 so the Korean manual text is written identically on every platform.
with open(file_name, "w", encoding="utf-8") as file:
    # Iterate the reader directly so the loop cannot drift from a stale total_pages.
    for page_no, now_page in enumerate(reader.pages):
        # Extract once per page and reuse the result for both the file and the row.
        page_text = now_page.extract_text()
        file.write(page_text)
        page_rows.append(
            {'page': page_no, 'text_exp': page_text, 'api': 'MANUAL_smart_tv'}
        )

# Build the frame in one step instead of enlarging it row by row with .loc.
df_save_2 = pd.DataFrame(page_rows, columns=['page', 'text_exp', 'api'])

In [64]:
# Spot-check the extracted page text of the smart-TV manual
df_save_2['text_exp']

0                                                       
1      . . . . . . . . . . . . . . . . . . . . . . . ...
2      . . . . . . . . . . . . . . . . . . . . . . . ...
3      . . . . . . . . . . . . . . . . . . . . . . . ...
4      webOS \n시작하기\n시작하기\n홈\n홈\n \n둘러보기\n둘러보기\nLG we...
                             ...                        
142    1920 × 1080 \n(\n울트라\nHD 120/144 Hz, 8K 120/14...
143    3840 × 1080\n \n(144 Hz \n모델만\n \n해당\n)\n166.5...
144    4096 × 2160 \n(\n울트라\nHD, 8K \n모델만\n \n해당\n)\n...
145    263.74\n59.94\n264\n60\n \nHDMI Deep Color\n를\...
146    블루투스\n블루투스\n®\n \n워드마크\n \n및\n \n로고는\n Bluetoo...
Name: text_exp, Length: 147, dtype: object

In [65]:
### Combine the manual data
df_master = pd.concat([df_save, df_save_1, df_save_2])
print('Total page : ', len(df_master))
# Keep the text and label columns, renumber the rows, then apply a light text
# cleanup: remove newlines first, then collapse three-space runs to one space.
df_master = (
    df_master.loc[:, ['text_exp', 'api']]
    .reset_index(drop=True)
    .replace({'\n': ''}, regex=True)
    .replace({'   ': ' '}, regex=True)
)

Total page :  305


## DF결과 간단 체크

In [67]:
# Display the combined manual dataset (text_exp and api columns)
df_master

Unnamed: 0,text_exp,api
0,www.lge.co.kr사용 전에 ‘안전을 위한 주의 사항’을 반드시 읽고 정확하게...,MANUAL_air_cleaner
1,차 례 본 설명서는 공용으로 제작되어 구입한 제품과 다른 이미지나 내용이 포함되어...,MANUAL_air_cleaner
2,고장 신고 전 확인 사항 ...................................,MANUAL_air_cleaner
3,04안전을 위한 주의 사항다음에 표기되어 있는 안전 관련한 주의 사항들은 제품을 안...,MANUAL_air_cleaner
4,05• 전원 플러그에 물기나 먼지를 완전히 제거한 후 콘센트에 단단히 꽂아 주십시오...,MANUAL_air_cleaner
...,...,...
300,"1920 × 1080 (울트라HD 120/144 Hz, 8K 120/144 Hz 모...",MANUAL_smart_tv
301,3840 × 1080 (144 Hz 모델만 해당)166.561443840 × 160...,MANUAL_smart_tv
302,"4096 × 2160 (울트라HD, 8K 모델만 해당)53.9423.97542456...",MANUAL_smart_tv
303,263.7459.9426460 HDMI Deep Color를 4K로 설정했을 때 지...,MANUAL_smart_tv


In [66]:
# Show the rows whose text contains this Korean heading fragment
# (str.contains treats the pattern as a regular expression by default)
df_master[df_master['text_exp'].str.contains("사용하기냉방 기본 기")]

Unnamed: 0,text_exp,api
59,23 사용하기냉방 기본 기능 작동하기냉방 기능 사용하기제품의 냉방 기능이 작동하면 ...,MANUAL_air_condition
119,24 사용하기냉방 기본 기능 작동하기냉방 기능 사용하기제품의 냉방 기능이 작동하면 ...,MANUAL_air_condition


In [69]:
# Save as a BigQuery table.
# if_exists="replace" keeps this cell re-runnable: with the default ("fail")
# the call raises as soon as LG_test.manual_dataset already exists.
pandas_gbq.to_gbq(df_master, "LG_test.manual_dataset", if_exists="replace")

100%|██████████| 1/1 [00:00<00:00, 7913.78it/s]


In [70]:
# Save as CSV (the DataFrame index is written as the first, unnamed column)
df_master.to_csv('manual_dataset.csv')

## 책 데이터셋

In [94]:
import PyPDF2
from textblob import TextBlob

# Give PdfReader the path rather than an open handle: the previous
# `open(..., 'rb')` was never closed, so every run leaked a file descriptor.
# With a path, PdfReader opens and closes the file itself.
pdf_file = 'sample_book.pdf'
reader = PyPDF2.PdfReader(pdf_file)

# Number of pages in the sample book
total_pages = len(reader.pages)
print(total_pages)

394


In [95]:
## Sample book: dump every page's text to a file and collect it in df_save
# NOTE: reuses "text_total.txt", the file name also used for the air-cleaner dump.
file_name = "text_total.txt"
page_rows = []
# Explicit UTF-8 so non-ASCII text is written identically on every platform.
with open(file_name, "w", encoding="utf-8") as file:
    # Iterate the reader directly so the loop cannot drift from a stale total_pages.
    for page_no, now_page in enumerate(reader.pages):
        # Extract once per page and reuse the result for both the file and the row.
        page_text = now_page.extract_text()
        file.write(page_text)
        page_rows.append(
            {'page': page_no, 'text_exp': page_text, 'api': 'book_api'}
        )

# Build the frame in one step instead of enlarging it row by row with .loc.
df_save = pd.DataFrame(page_rows, columns=['page', 'text_exp', 'api'])

In [97]:
# Keep only the text and label columns and renumber the rows from zero
df_save = df_save.loc[:, ['text_exp', 'api']].reset_index(drop=True)

In [99]:
# Save the book pages as a BigQuery table.
# if_exists="replace" keeps this cell re-runnable: with the default ("fail")
# the call raises as soon as LG_test.book_dataset already exists.
pandas_gbq.to_gbq(df_save, "LG_test.book_dataset", if_exists="replace")

100%|██████████| 1/1 [00:00<00:00, 13797.05it/s]


In [100]:
# Save the book pages as CSV (the DataFrame index is written as the first, unnamed column)
df_save.to_csv('book_dataset.csv')

## 빅쿼리 가이드

In [101]:
# Give PdfReader the path rather than an open handle: the previous
# `open(..., 'rb')` was never closed, so every run leaked a file descriptor.
# With a path, PdfReader opens and closes the file itself.
pdf_file = 'bigquery_guide.pdf'
reader = PyPDF2.PdfReader(pdf_file)

# Number of pages in the BigQuery guide
total_pages = len(reader.pages)
print(total_pages)

71


In [102]:
## BigQuery guide: dump every page's text to a file and collect it in df_save
# NOTE: reuses "text_total.txt", the file name also used for the air-cleaner dump.
file_name = "text_total.txt"
page_rows = []
# Explicit UTF-8 so non-ASCII text is written identically on every platform.
with open(file_name, "w", encoding="utf-8") as file:
    # Iterate the reader directly so the loop cannot drift from a stale total_pages.
    for page_no, now_page in enumerate(reader.pages):
        # Extract once per page and reuse the result for both the file and the row.
        page_text = now_page.extract_text()
        file.write(page_text)
        page_rows.append(
            {'page': page_no, 'text_exp': page_text, 'api': 'bigquery_api'}
        )

# Build the frame in one step instead of enlarging it row by row with .loc.
df_save = pd.DataFrame(page_rows, columns=['page', 'text_exp', 'api'])

In [103]:
# Keep only the text and label columns and renumber the rows from zero
df_save = df_save.loc[:, ['text_exp', 'api']].reset_index(drop=True)

In [104]:
# Save the BigQuery-guide pages as a BigQuery table.
# if_exists="replace" keeps this cell re-runnable: with the default ("fail")
# the call raises as soon as LG_test.bq_dataset already exists.
pandas_gbq.to_gbq(df_save, "LG_test.bq_dataset", if_exists="replace")

100%|██████████| 1/1 [00:00<00:00, 9198.04it/s]


In [105]:
# Save the BigQuery-guide pages as CSV (the DataFrame index is written as the first, unnamed column)
df_save.to_csv('bq_dataset.csv')

## Review Dataset

In [3]:
# Pull up to 5000 reviews as (text_exp, api) rows, excluding every review that
# contains one of the keywords in the NOT (...) filter: TV / television,
# air conditioner, air, air purifier, "cool", "broken", "fix" and "air".
# NOTE(review): 'Retaurant_api' looks like a typo for 'Restaurant_api'; it is left
# untouched because the label is data that consumers may match on - confirm before changing.
# NOTE(review): LIMIT without ORDER BY does not guarantee which rows are returned,
# so re-running this cell may produce a different sample.
sql = "select Review as text_exp, 'Retaurant_api' as api from `jc-gcp-project.jc_demo_test.review_text_sample` where not (Review like '%티브이%'or Review like '%TV%'or Review like '%tv%'or Review like '%티비%' or Review like '%텔레비%' or Review like '%에어컨%' or Review like '%공기%'or Review like '%공기청정기%'or Review like '%공기 청정기%' or Review like '%청정기%' or Review like '%시원%'or Review like '%고장%' or Review like '%고쳐%' or Review like '%air%') limit 5000;"
# Run the query in the given project and load the result into a DataFrame
df = pandas_gbq.read_gbq(sql, project_id='jc-gcp-project')

Downloading: 100%|[32m██████████[0m|


In [4]:
# Save the review sample as CSV (the DataFrame index is written as the first, unnamed column)
df.to_csv('review_dataset.csv')

In [None]:
# Scratch expression; it evaluates to 2 and does not modify any notebook variable
1+1