<a href="https://colab.research.google.com/github/K-3-LT/defacto_global_bu/blob/main/Pinecone_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Embedding

In [None]:
!pip install python-docx openai transformers pinecone-client

In [None]:
import docx
import openai
import csv
from pathlib import Path
from transformers import GPT2Tokenizer
from google.colab import files
import pinecone
from tqdm import tqdm

In [None]:
# Load a Word document as plain text
def read_docx(file_path):
  """Return the text of the .docx file at `file_path`.

  Paragraphs whose text contains 'http://' or 'https://' are left out;
  the texts of the remaining paragraphs are joined with newline characters.
  """
  document = docx.Document(file_path)
  kept_texts = [
      para.text
      for para in document.paragraphs
      # drop any paragraph that carries a link
      if not ('http://' in para.text or 'https://' in para.text)
  ]
  return '\n'.join(kept_texts)

In [None]:
# Local import keeps this cell runnable on its own. getpass reads the key
# without echoing it, so the secret does not end up in the cell output.
from getpass import getpass

content = read_docx('/content/defacto.docx')

# Split the document text back into its individual paragraphs
paragraphs = content.split('\n')

# Read the OpenAI API key (surrounding whitespace from pasting is removed)
api_key = getpass("Please ensure proper inputting of OpenAI API key:").strip()
openai.api_key = api_key

# GPT-2 tokenizer, used here only to measure paragraph sizes in tokens
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Pack consecutive paragraphs into sections whose estimated size stays within
# max_section_tokens. The size of the section being built is tracked as a
# running sum (per-paragraph token counts plus one newline per join), so each
# paragraph is tokenized exactly once instead of re-tokenizing the whole
# section on every iteration.
# NOTE: a single paragraph longer than the limit is never split; it still
# becomes one oversized section of its own.
max_section_tokens = 4000
newline_tokens = len(tokenizer.encode('\n', add_special_tokens=False))

sections = []
current_section = ""
current_tokens = 0
for paragraph in paragraphs:
  paragraph_tokens = len(tokenizer.encode(paragraph, add_special_tokens=False))
  # Only flush when there is something to flush; otherwise an oversized
  # paragraph arriving while the current section is empty would emit a
  # section consisting of nothing but '\n'.
  if current_section and current_tokens + paragraph_tokens > max_section_tokens:
    sections.append(current_section + '\n')
    current_section = paragraph
    current_tokens = paragraph_tokens
  else:
    current_section += f'\n{paragraph}'
    current_tokens += newline_tokens + paragraph_tokens

# Keep whatever is left over after the last paragraph
if current_section:
  sections.append(current_section)

# Turn one text section into its embedding vector
def create_embedding(section):
  """Request an embedding for `section` via `openai.Embedding.create`.

  Uses the "text-embedding-ada-002" model and the module-level `api_key`.
  Returns the `embedding` field of the first entry in the response's `data`.
  """
  openai.api_key = api_key
  result = openai.Embedding.create(
      input=section,
      model="text-embedding-ada-002",
  )
  first_item = result['data'][0]
  return first_item['embedding']

# Embed every section in order; tqdm shows progress while the requests run
embeddings = list(map(create_embedding, tqdm(sections)))

# Show the resulting vectors in the cell output
print(embeddings)

In [None]:
# Local import keeps this cell runnable on its own. getpass reads the key
# without echoing it, so the secret does not end up in the cell output.
from getpass import getpass

# Pinecone credentials (surrounding whitespace from pasting is removed)
pinecone_api_key = getpass("Please enter your Pinecone API key: ").strip()
pinecone_environment = input("Please enter your Pinecone Environment: ").strip()

# Pinecone index name
index_name = input("please enter your Pinecone vector database index name(only lower-case characters or number): ").strip()

# The index dimension is taken from the first embedding, so fail early with a
# clear message instead of an IndexError when there is nothing to upload.
if not embeddings:
  raise ValueError("No embeddings available: run the embedding cell before uploading to Pinecone.")

# Initialise the Pinecone client
pinecone.init(api_key=pinecone_api_key, environment=pinecone_environment)

# Create the index only if it does not exist yet, so that re-running this
# cell against an existing index does not fail on create_index.
if index_name not in pinecone.list_indexes():
  pinecone.create_index(name=index_name, dimension=len(embeddings[0]), metric='cosine', pod_type='p2')

# Access index instance
index = pinecone.Index(index_name=index_name)

# One metadata record per section, storing the section text itself
metadata = [{"report_context": section} for section in sections]

# Upsert each vector with its metadata; the id is the section's position
for i, (embedding, meta) in enumerate(zip(embeddings, metadata)):
  index.upsert(vectors=[(str(i), embedding, meta)])