In [49]:
import requests
import PyPDF2
import openai  # Make sure you've set your OpenAI API key
import pandas as pd

# Define the list of PDF URLs
pdf_urls = [
    "https://www.sec.gov/files/form1.pdf",
    "https://www.sec.gov/files/form10.pdf",
    "https://www.sec.gov/files/form11-k.pdf",
    "https://www.sec.gov/files/form8-a.pdf",
    "https://www.sec.gov/files/formn-54c.pdf"
]

# Initialize an empty list to store the extracted text
pdf_texts = []

# Function to extract text from a PDF URL
def extract_text_from_pdf(pdf_url):
    try:
        response = requests.get(pdf_url)
        response.raise_for_status()
        
        # Open the PDF file from the response content
        with open('temp.pdf', 'wb') as pdf_file:
            pdf_file.write(response.content)

        # Extract text from the PDF
        pdf_text = ""
        with open('temp.pdf', 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            for page_num in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_num]
                pdf_text += page.extract_text()

        return pdf_text
    except Exception as e:
        print(f"Error extracting text from {pdf_url}: {str(e)}")
        return None

# Extract text from each PDF URL and store it in pdf_texts
for pdf_url in pdf_urls:
    pdf_text = extract_text_from_pdf(pdf_url)
    if pdf_text:
        pdf_texts.append(pdf_text)

# Clean up - remove temporary PDF file
import os
os.remove('temp.pdf')

# Initialize OpenAI API key
api_key = ""  # Replace with your actual OpenAI API key
openai.api_key = api_key

# Define a function to chunk text into smaller sections
def chunk_text(text, max_chunk_length=1200):
    chunks = []
    current_chunk = ""
    current_length = 0
    for paragraph in text.split("\n"):
        if current_length + len(paragraph) <= max_chunk_length:
            current_chunk += paragraph + "\n"
            current_length += len(paragraph)
        else:
            chunks.append(current_chunk)
            current_chunk = paragraph + "\n"
            current_length = len(paragraph)
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

# Define a function to generate embeddings from text
def generate_text_embeddings(text, model="text-embedding-ada-002"):
    response = openai.Embedding.create(model=model, input=text)
    return response['data'][0]['embedding']

# Initialize data structures to store the data
data = []

# Process the extracted text, chunk it, and generate embeddings
for i, text in enumerate(pdf_texts):
    text_chunks = chunk_text(text)
    embeddings = [generate_text_embeddings(chunk) for chunk in text_chunks]
    
    # Store the data
    for j, chunk in enumerate(text_chunks):
        data.append({
            'PDF': i + 1,
            'Chunk Number': j + 1,
            'Chunk Text': chunk,
            'Embedding': embeddings[j]
        })

# Create a DataFrame from the data
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_file_path = 'pdf_data.csv'
df.to_csv(csv_file_path, index=False)

print(f"Data saved to {csv_file_path}")


Data saved to pdf_data.csv


In [None]:
%pip install openai

In [None]:
%pip install -U huggingface_hub

In [4]:
%pip install --upgrade transformers


UsageError: Line magic function `%pip3` not found.


In [33]:
import pandas as pd
from scipy.spatial import distance
import ast
import openai
from transformers import GPT2TokenizerFast

# Define the GPT-3 model and other parameters
GPT_MODEL = "gpt-3.5-turbo"
api_key = ""  # Replace with your actual OpenAI API key
openai.api_key = api_key

# Load the CSV file with embeddings
embeddings_file_path = '/Users/vivekhanagoji/Desktop/DAMG7245_Assignment02/DAMG7245_02/pdf_data.csv'  # Update with the path to your CSV file
df = pd.read_csv(embeddings_file_path)

# Convert the embeddings from string to list
df['Embedding'] = df['Embedding'].apply(ast.literal_eval)

def num_tokens(text):
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    encoding = tokenizer.encode(text, add_special_tokens=False)
    return len(encoding)

# Define a function to calculate cosine similarity
def cosine_similarity(embedding1, embedding2):
    return 1 - distance.cosine(embedding1, embedding2)

# Define a search function
def strings_ranked_by_relatedness(query, df, relatedness_fn=cosine_similarity, top_n=100):
    query_embedding = generate_text_embeddings(query)  # Implement this function using the OpenAI Text Embedding API
    strings_and_relatednesses = [
        (row['Chunk Text'], relatedness_fn(query_embedding, row['Embedding']))
        for _, row in df.iterrows()
    ]
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]

# Define a function to generate embeddings from text using OpenAI Text Embedding API
def generate_text_embeddings(text, model="text-embedding-ada-002"):
    response = openai.Embedding.create(model=model, input=text)
    return response['data'][0]['embedding']

token_budget = 4096 - 500  # Adjust the token budget as needed

# Define a function to create a message for GPT
def query_message(query, df, token_budget):
    strings, relatednesses = strings_ranked_by_relatedness(query, df)
    introduction = 'Use the below PDFs on the SEC forms to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer."'
    question = f'\n\nQuestion: {query}'
    message = introduction

    # Process each section separately
    for string in strings:
        # Split the content into smaller sections, e.g., paragraphs
        sections = string.split('\n\n')  # You can use a more appropriate separator
        
        for section in sections:
            next_section = f'\n\nSection:\n"""\n{section}\n"""'
            if num_tokens(message + next_section + question) > token_budget:
                break
            else:
                message += next_section

    # return message
    return message + question

# Define a function to answer questions using GPT
def ask(query, df, GPT_MODEL, token_budget, print_message=False):
    message = query_message(query, df, token_budget=token_budget)
    if print_message:
        # print(message)
        messages = [
            {"role": "system", "content": "You answer questions about the SEC pdfs"},
            {"role": "user", "content": message},
        ]
        response = openai.ChatCompletion.create(
            model=GPT_MODEL,
            messages=messages,
            temperature=0
        )
        response_message = response["choices"][0]["message"]["content"]
        # answer = response_message.split("Section:\n")[0]
    return response_message
# Example usage
query = "WHERE TO FILE AND NUMBER OF COPIES?"
response = ask(query, df, GPT_MODEL, token_budget, print_message=True)
print(response)



Token indices sequence length is longer than the specified maximum sequence length for this model (1184 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1489 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1783 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2107 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2584 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

Submit one original and two copies of Form 1 to: SEC, Division of Market Regulation, Office of Market Supervision, 450 Fifth Street, N.W., Washington, DC 20549.


In [34]:
df['Embedding']

0     [-0.014180930331349373, 0.002201310358941555, ...
1     [-0.016180770471692085, 0.0052414629608392715,...
2     [-0.011705736629664898, 0.0038264915347099304,...
3     [0.0009659164352342486, -0.014861313626170158,...
4     [-0.02527722530066967, -0.009905753657221794, ...
5     [-0.0021996337454766035, -0.006497273687273264...
6     [-0.011366186663508415, 0.019283602014183998, ...
7     [-0.014121945947408676, 0.012033876031637192, ...
8     [-0.017194781452417374, -0.004264927003532648,...
9     [-0.015669923275709152, 0.010231457650661469, ...
10    [-0.017910484224557877, 0.0021017403341829777,...
11    [-0.01737760566174984, -0.006793549284338951, ...
12    [-0.002941735088825226, 0.009172001853585243, ...
13    [-0.008488462306559086, 0.009436970576643944, ...
14    [-0.015242019668221474, 0.004137411247938871, ...
15    [-0.004700277000665665, 0.011502498760819435, ...
16    [-0.015020672231912613, -0.002799875568598509,...
17    [-0.010398353450000286, -0.006963482126593

In [20]:
# Example usage
query = "WHERE TO FILE AND NUMBER OF COPIES?"
response = ask(query, df, GPT_MODEL, token_budget, print_message=True)
print(response)

Token indices sequence length is longer than the specified maximum sequence length for this model (1184 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1489 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1783 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2107 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2584 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

Use the below PDFs on the SEC forms to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer."

Section:
"""
 Attach  an Execution  Page (Page  1) with original  manual  signatures.  
 Please  type all information.  
 Use only the current  version  of Form  1 or a reproduction.  
5. If the information called for by any Exhibit is available in printed form, the printed material may be filed, provided it doe s 
not exceed 8 1/2 X 11 inches in size.  
6. If any Exhibit  required  is inapplicable,  a statement to that effect  shall be furnished  in lieu of such Exhibit.  
7. An exchange  that is filing  Form  1 as an application  may not satisfy  the requirements  to provide  certain  information  by 
means  of an Internet  web page. All  materials  must be filed with the Commission  in paper.  
 
8. WHERE TO FILE AND NUMBER OF COPIES  - Submit one original and two copies of Form 1 to: SEC, Division of Market 
Regulation, Office

In [None]:
%pip install boto3


In [None]:
%pip install s3fs

In [None]:
import boto3
import io
# Initialize FastAPI app
# app = FastAPI()

s3 = boto3.client('s3')
bucket_name = 'embeddingspdf'


s3 = boto3.client('s3')
path = "s3://embeddingspdf/pdf_data.csv"
df=pd.read_csv(path)

In [None]:
import boto3
import pandas as pd

# Configure your AWS credentials
aws_access_key_id = ''
aws_secret_access_key = ''
region_name = ''

# Create an S3 client
s3 = boto3.client('s3', aws_access_key_id='YOUR_ACCESS_KEY_ID', aws_secret_access_key='YOUR_SECRET_ACCESS_KEY', region_name='your-aws-region')


# Specify S3 bucket name and file path
bucket_name = 'embeddingspdf'
path = 's3://embeddingspdf/pdf_data.csv'

# Read the CSV file from S3
df = pd.read_csv(path)
