In [None]:
pip install openai==0.28



In [None]:
# Mount Google Drive at /content/drive; the data paths used later in this notebook live under this mount point.

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install PyMuPDF



In [None]:
import openai

PDF to JSONL, Text & CSV

In [None]:
import openai
import fitz  # PyMuPDF
import time
import json
import re
import os
import csv  # Import CSV module

# Read the OpenAI API key from the environment instead of hardcoding it, so the key is never
# saved with (or shared through) the notebook file. Set it before running this cell, e.g.:
#   import getpass; os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key: ')
openai.api_key = os.environ.get('OPENAI_API_KEY', '')
if not openai.api_key:
    print("Warning: OPENAI_API_KEY is not set; OpenAI API calls will fail until a key is provided.")

# Define the folder paths at the top of the script
pdf_folder_path = '/content/drive/Shareddrives/DATA298B_Final/Data/Raw/Output_Preprocessed_Data/5'  # Folder containing PDF files
output_folder_jsonl = '/content/drive/Shareddrives/DATA298B_Final/Data/QA_Jsonls'  # Folder to save JSONL files
output_folder_text = '/content/drive/Shareddrives/DATA298B_Final/Data/QA_text'  # Folder to save text files
output_folder_csv = '/content/drive/Shareddrives/DATA298B_Final/Data/QA_csv'  # Folder to save CSV files
model_name = 'gpt-4o-mini'  # Define the model name here

# Function to extract text from a PDF using PyMuPDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (`fitz`).

    Returns:
        str: The result of ``page.get_text()`` for each page, joined without a separator.
    """
    doc = fitz.open(pdf_path)
    try:
        # Join once instead of growing a string page by page.
        return "".join(
            doc.load_page(page_num).get_text() for page_num in range(len(doc))
        )
    finally:
        # Release the underlying file handle even if extraction fails on some page;
        # the original never closed the document.
        doc.close()

# Function to chunk text into smaller sizes (aiming for less than 1000 tokens per chunk)
def chunk_text(text, max_tokens=1000):
    """Group the paragraphs of ``text`` into chunks of fewer than ``max_tokens`` words.

    "Tokens" are approximated by whitespace-separated words. Paragraphs are the pieces
    obtained by splitting ``text`` on blank lines (``"\\n\\n"``); every paragraph kept in a
    chunk is followed by ``"\\n\\n"``.

    Args:
        text: The full text to split.
        max_tokens: Word budget per chunk. A chunk is closed as soon as adding the next
            paragraph would bring it to ``max_tokens`` words or more.

    Returns:
        list[str]: The chunks, in order. Blank paragraphs are skipped and no empty chunk is
        ever returned, so empty or whitespace-only ``text`` yields ``[]``. A single paragraph
        that alone has ``max_tokens`` words or more is not split; it becomes its own chunk.
    """
    chunks = []
    current_parts = []   # paragraphs (with trailing "\n\n") of the chunk being built
    current_words = 0    # running word count of current_parts, avoids re-splitting the chunk

    for paragraph in text.split('\n\n'):  # Assuming paragraphs are separated by double newlines
        paragraph_words = len(paragraph.split())
        if paragraph_words == 0:
            continue  # blank paragraph: contributes nothing worth sending to the model

        # Close the current chunk only if it has content; otherwise an oversized first
        # paragraph would push an empty chunk into the result.
        if current_parts and current_words + paragraph_words >= max_tokens:
            chunks.append("".join(current_parts))
            current_parts = []
            current_words = 0

        current_parts.append(paragraph + "\n\n")
        current_words += paragraph_words

    if current_parts:
        chunks.append("".join(current_parts))
    return chunks

# Function to generate Q&A pairs from a chunk of text
def generate_qa_pairs(chunk):
    """Ask the chat model for Q&A pairs about ``chunk`` and return its raw reply.

    The request uses the module-level ``model_name`` and the legacy
    ``openai.ChatCompletion`` API. The prompt explicitly requests the ``Q:`` / ``A:``
    layout with blank-line separators, because that is the layout the response parser in
    this notebook (``process_qa_response``) looks for.

    Args:
        chunk: Text excerpt to generate questions and answers from.

    Returns:
        str: The content of the first choice's message, stripped of surrounding whitespace.

    Raises:
        Whatever ``openai.ChatCompletion.create`` raises; no exception is handled here.
    """
    # The original prompt ended in a dangling "Include legal terms and :" and never stated
    # an output format; both are fixed here.
    prompt = (
        "Generate 100 Q&A pairs from the following text to be used for future fine-tuning. "
        "These are legal documents. Make the questions as close as possible to what a layman "
        "would ask after looking at legal cases, policies and legal documents. "
        "Give as much context and details as possible. Include legal terms where relevant. "
        "Format every pair as 'Q: <question>' followed on the next line by 'A: <answer>', "
        "do not put blank lines inside a pair, and separate consecutive pairs with exactly "
        "one blank line. Here is the text:"
        f"\n\n{chunk}"
    )

    response = openai.ChatCompletion.create(
        model=model_name,  # Use the global model name
        messages=[
            {"role": "system", "content": "You are a helpful Legal Chatbot Assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=15000,  # Adjust max_tokens as required
        temperature=0.7
    )

    content = response['choices'][0]['message']['content']

    # Log the raw API response for debugging
    print("\nRaw API Response:\n", content)

    return content.strip()

# Function to process the generated response and remove ** characters
def process_qa_response(qa_pair_text):
    """Parse the model's reply into a list of ``(question, answer)`` tuples.

    Markdown bold markers (``**``) are removed, the text is split into blocks on blank
    lines, and each block that contains a ``Q:`` label followed by an ``A:`` label yields
    one pair.

    Args:
        qa_pair_text: Raw reply text from the model.

    Returns:
        list[tuple[str, str]]: Pairs in order of appearance. Blocks without both labels,
        and pairs whose question or answer is empty after stripping, are skipped.
    """
    qa_pairs = []

    # Remove ** characters from the text
    qa_pair_text_cleaned = qa_pair_text.replace("**", "").strip()

    # The labels must start at a word boundary. A plain substring search for "A:" also
    # matches inside words such as "HIPAA:" (and "Q:" inside "FAQ:"), which cut questions
    # in the wrong place.
    pair_pattern = re.compile(r"\bQ:(.*?)\bA:(.*)", re.DOTALL)

    for block in qa_pair_text_cleaned.split("\n\n"):
        match = pair_pattern.search(block)
        if match is None:
            continue
        question = match.group(1).strip()
        answer = match.group(2).strip()
        if question and answer:  # an empty side is useless as training data
            qa_pairs.append((question, answer))

    return qa_pairs

# Function to handle API rate limiting by adding a delay and retry if needed
def generate_qa_with_rate_limit(chunks, max_retries=3):
    """Generate Q&A pairs for every chunk, retrying transient API failures.

    Each non-blank chunk is sent through ``generate_qa_pairs`` and the reply is parsed with
    ``process_qa_response``. A rate-limit error waits 60 seconds before the next attempt;
    other transient API errors wait 5 seconds. After a successful call the function sleeps
    1 second to stay below the rate limit.

    Args:
        chunks: Iterable of text chunks (strings).
        max_retries: Number of retries per chunk after the first attempt. When all
            attempts fail, the chunk is skipped with a message so that pairs already
            collected from other chunks are not lost.

    Returns:
        list[tuple[str, str]]: All ``(question, answer)`` pairs gathered across chunks.
    """
    qa_pairs = []
    for chunk in chunks:
        if not chunk.strip():
            continue  # nothing to ask about; avoid paying for an empty request

        for attempt in range(max_retries + 1):
            retries_left = attempt < max_retries
            try:
                response = generate_qa_pairs(chunk)
                qa_pairs.extend(process_qa_response(response))
                time.sleep(1)  # Add a delay to prevent rate limiting
                break
            except openai.error.RateLimitError:
                if retries_left:
                    print("Rate limit reached, waiting for 60 seconds...")
                    time.sleep(60)  # Wait for 60 seconds if rate limit is hit
            except (
                openai.error.APIError,
                openai.error.Timeout,
                openai.error.APIConnectionError,
                openai.error.ServiceUnavailableError,
            ) as e:
                if retries_left:
                    print(f"Error occurred: {e}. Retrying...")
                    time.sleep(5)  # Short wait before retrying
        else:
            # Reached only when no attempt succeeded (the loop never hit `break`).
            print(f"Skipping chunk after {max_retries + 1} failed attempts.")
    return qa_pairs

def _write_qa_jsonl(qa_pairs, output_path, pdf_file):
    """Write the pairs as chat-format JSONL (system/user/assistant messages), one per line."""
    try:
        with open(output_path, 'w', encoding='utf-8') as f_jsonl:
            for question, answer in qa_pairs:
                json_obj = {
                    "messages": [
                        {"role": "system", "content": "You are a helpful Legal Chatbot Assistant."},
                        {"role": "user", "content": question},
                        {"role": "assistant", "content": answer}
                    ]
                }
                f_jsonl.write(json.dumps(json_obj) + "\n")  # Writing each entry to a new line
        print(f"Q&A pairs for {pdf_file} have been successfully saved in JSONL format to {output_path}")
    except Exception as e:
        print(f"Error writing {pdf_file} to JSONL file: {e}")


def _write_qa_text(qa_pairs, output_path, pdf_file):
    """Write the pairs as plain text blocks of 'Q: ...' / 'A: ...' separated by blank lines."""
    try:
        with open(output_path, 'w', encoding='utf-8') as f_text:
            for question, answer in qa_pairs:
                f_text.write(f"Q: {question}\nA: {answer}\n\n")
        print(f"Q&A pairs for {pdf_file} have been successfully saved in text format to {output_path}")
    except Exception as e:
        print(f"Error writing {pdf_file} to text file: {e}")


def _write_qa_csv(qa_pairs, output_path, pdf_file):
    """Write the pairs as a two-column CSV with a 'Question','Answer' header row."""
    try:
        with open(output_path, 'w', newline='', encoding='utf-8') as f_csv:
            writer = csv.writer(f_csv)
            writer.writerow(["Question", "Answer"])  # Write headers
            writer.writerows(qa_pairs)  # Question and answer go in separate columns
        print(f"Q&A pairs for {pdf_file} have been successfully saved in CSV format to {output_path}")
    except Exception as e:
        print(f"Error writing {pdf_file} to CSV file: {e}")


# Function to process all PDFs in the folder
def process_pdf_folder(pdf_folder, output_folder_jsonl, output_folder_text, output_folder_csv):
    """Generate Q&A pairs for every PDF in ``pdf_folder`` and save them in three formats.

    For each PDF the text is extracted, chunked, and sent to the model; the resulting pairs
    are written to ``<name>_qa_pairs.jsonl``, ``<name>_qa_pairs.txt`` and
    ``<name>_qa_pairs.csv`` in the respective output folders. A PDF that yields no pairs is
    reported and skipped. A failure while writing one format is reported and does not
    prevent the other formats from being written.

    Args:
        pdf_folder: Folder containing the input PDF files.
        output_folder_jsonl: Folder for the JSONL outputs (created if missing).
        output_folder_text: Folder for the text outputs (created if missing).
        output_folder_csv: Folder for the CSV outputs (created if missing).

    Returns:
        None.
    """
    # Create the destinations up front; otherwise every write fails only after the
    # (slow, paid) model calls have already been made.
    for folder in (output_folder_jsonl, output_folder_text, output_folder_csv):
        os.makedirs(folder, exist_ok=True)

    # Sorted for a reproducible processing order; extension match ignores case (.PDF).
    pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf'))

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_folder, pdf_file)
        base_name = os.path.splitext(pdf_file)[0]
        output_file_path_jsonl = os.path.join(output_folder_jsonl, f"{base_name}_qa_pairs.jsonl")
        output_file_path_text = os.path.join(output_folder_text, f"{base_name}_qa_pairs.txt")
        output_file_path_csv = os.path.join(output_folder_csv, f"{base_name}_qa_pairs.csv")

        print(f"Processing {pdf_file}...")

        # Extract text from the PDF
        pdf_text = extract_text_from_pdf(pdf_path)

        # Chunk the extracted text
        chunks = chunk_text(pdf_text)

        # Generate Q&A pairs for each chunk
        qa_pairs = generate_qa_with_rate_limit(chunks)

        # Check if Q&A pairs are generated
        if len(qa_pairs) == 0:
            print(f"No Q&A pairs generated for {pdf_file}. Please check the input and API responses.")
            continue
        print(f"Generated {len(qa_pairs)} Q&A pairs for {pdf_file}.")

        _write_qa_jsonl(qa_pairs, output_file_path_jsonl, pdf_file)
        _write_qa_text(qa_pairs, output_file_path_text, pdf_file)
        _write_qa_csv(qa_pairs, output_file_path_csv, pdf_file)

# Run the pipeline: for every PDF in pdf_folder_path, generate Q&A pairs through the OpenAI API
# and write the JSONL, text and CSV outputs to the three folders configured above.
process_pdf_folder(pdf_folder_path, output_folder_jsonl, output_folder_text, output_folder_csv)


Processing Medtronic U.S. Patient Privacy Principles.pdf_summary_output_wrapped.pdf...


KeyboardInterrupt: 

Combine JSONLs

In [None]:
import os
import json

# Folder that holds the per-PDF JSONL files to merge.
# NOTE: `output_folder_path` and `combined_output_file` are reassigned by the CSV and text
# cells further down, so run each combine section's config cell right before its combine call.
output_folder_path = '/content/drive/Shareddrives/DATA298B_Final/Data/QA_Jsonls'

# Destination path of the single merged JSONL file
combined_output_file = '/content/drive/Shareddrives/DATA298B_Final/Data/Final_QA_OpenAI/final_qa_openai.jsonl'

In [None]:


# Function to combine JSONL files
def combine_jsonl_files(output_folder, combined_file):
    """Concatenate every ``.jsonl`` file in ``output_folder`` into ``combined_file``.

    Files are read in sorted name order. Each non-blank line is stripped and written as one
    line of the combined file; lines are separated by ``"\\n"`` with no newline after the
    last one.

    Args:
        output_folder: Folder containing the JSONL files to merge.
        combined_file: Path of the merged output file. Its parent folder is created if
            missing. If this path is itself inside ``output_folder`` it is not read as input.

    Returns:
        None.
    """
    combined_abs = os.path.abspath(combined_file)
    jsonl_files = sorted(f for f in os.listdir(output_folder) if f.endswith('.jsonl'))

    all_data = []

    # Read all JSONL files and store the contents
    for jsonl_file in jsonl_files:
        file_path = os.path.join(output_folder, jsonl_file)
        if os.path.abspath(file_path) == combined_abs:
            continue  # never feed a previous combined result back into itself
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                record = line.strip()
                if record:  # a blank line is not a valid JSONL record
                    all_data.append(record)

    combined_dir = os.path.dirname(combined_file)
    if combined_dir:
        os.makedirs(combined_dir, exist_ok=True)

    # Write combined data into one file (newline between records, none after the last)
    with open(combined_file, 'w', encoding='utf-8') as out_file:
        out_file.write("\n".join(all_data))

    print(f"All JSONL files have been combined into {combined_file}")

# Merge every per-PDF JSONL file found in output_folder_path into the single file combined_output_file
combine_jsonl_files(output_folder_path, combined_output_file)


All JSONL files have been combined into /content/drive/Shareddrives/DATA298B_Final/Data/Final_QA_OpenAI/final_qa_openai.jsonl


In [None]:
import json

# Define file paths
input_file_path = '/content/drive/Shareddrives/DATA298B_Final/Data/Final_QA_OpenAI/final_qa_openai.jsonl'
output_file_path = '/content/drive/Shareddrives/DATA298B_Final/Data/Final_QA_OpenAI/final_qa_mixtral.jsonl'

# Remove the system message from every record.
# The previous version cut characters 15-91 of each line, which matches the fixed
# '{"role": "system", ...}, ' entry written by this notebook's JSONL writer. Any line with a
# different prefix would have been silently corrupted, so the record is now parsed and
# filtered by role instead. Output is unchanged for lines produced by this notebook.
with open(input_file_path, 'r', encoding='utf-8') as infile, \
        open(output_file_path, 'w', encoding='utf-8') as outfile:
    for line in infile:
        if not line.strip():
            continue  # nothing to convert on a blank line
        record = json.loads(line)
        record["messages"] = [
            message for message in record["messages"] if message.get("role") != "system"
        ]
        # Keep the input's line ending (the combined file has no newline after its last record).
        line_ending = "\n" if line.endswith("\n") else ""
        outfile.write(json.dumps(record) + line_ending)

print(f"Trimmed JSONL file created at: {output_file_path}")


Trimmed JSONL file created at: /content/drive/Shareddrives/DATA298B_Final/Data/Final_QA_OpenAI/final_qa_mixtral.jsonl


Combine CSVs

In [None]:
import os
import csv

# Folder that holds the per-PDF CSV files to merge.
# NOTE: this reassigns `output_folder_path` / `combined_output_file`, which the JSONL and text
# combine sections also use; the values below are the ones in effect after this cell runs.
output_folder_path = '/content/drive/Shareddrives/DATA298B_Final/Data/QA_csv'

# Destination path of the single merged CSV file
combined_output_file = '/content/drive/Shareddrives/DATA298B_Final/Data/Final_QA_OpenAI/final_qa_openai.csv'

# Function to combine CSV files
def combine_csv_files(output_folder, combined_file):
    """Merge every ``.csv`` file in ``output_folder`` into ``combined_file``.

    Files are read in sorted name order. The first row of each file is treated as its
    header: the header of the first non-empty file is written once, the headers of the
    remaining files are dropped, and all other rows are copied unchanged.

    Args:
        output_folder: Folder containing the CSV files to merge.
        combined_file: Path of the merged output file. Its parent folder is created if
            missing. If this path is itself inside ``output_folder`` it is not read as input.

    Returns:
        None.
    """
    combined_abs = os.path.abspath(combined_file)
    csv_files = sorted(f for f in os.listdir(output_folder) if f.endswith('.csv'))

    header_written = False

    combined_dir = os.path.dirname(combined_file)
    if combined_dir:
        os.makedirs(combined_dir, exist_ok=True)

    # Open the combined CSV file for writing
    with open(combined_file, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)

        # Loop through each CSV file
        for csv_file in csv_files:
            file_path = os.path.join(output_folder, csv_file)
            if os.path.abspath(file_path) == combined_abs:
                continue  # the output file itself is not an input

            # newline='' lets the csv module handle line breaks inside quoted fields
            # (answers may span several lines).
            with open(file_path, 'r', newline='', encoding='utf-8') as f:
                reader = csv.reader(f)

                # A completely empty file has no header row; skip it instead of letting
                # next() raise StopIteration.
                header = next(reader, None)
                if header is None:
                    continue

                # Write header only once
                if not header_written:
                    writer.writerow(header)
                    header_written = True

                # Write the content of each CSV file
                writer.writerows(reader)

    print(f"All CSV files have been combined into {combined_file}")

# Merge every per-PDF CSV file found in output_folder_path into the single file combined_output_file
combine_csv_files(output_folder_path, combined_output_file)


All CSV files have been combined into /content/drive/Shareddrives/DATA298B_Final/Data/Final_QA_OpenAI/final_qa_openai.csv


Combine Text Files

In [None]:
import os

# Folder that holds the per-PDF text files to merge.
# NOTE: this reassigns `output_folder_path` / `combined_output_file`, which the JSONL and CSV
# combine sections also use; the values below are the ones in effect after this cell runs.
output_folder_path = '/content/drive/Shareddrives/DATA298B_Final/Data/QA_text'

# Destination path of the single merged text file
combined_output_file = '/content/drive/Shareddrives/DATA298B_Final/Data/Final_QA_OpenAI/final_qa_openai.txt'

# Function to combine text files
def combine_text_files(output_folder, combined_file):
    """Concatenate every ``.txt`` file in ``output_folder`` into ``combined_file``.

    Files are read in sorted name order. Each file's content is stripped of surrounding
    whitespace; non-empty contents are joined with one blank line (``"\\n\\n"``) between
    files and nothing after the last one.

    Args:
        output_folder: Folder containing the text files to merge.
        combined_file: Path of the merged output file. Its parent folder is created if
            missing. If this path is itself inside ``output_folder`` it is not read as input.

    Returns:
        None.
    """
    combined_abs = os.path.abspath(combined_file)
    text_files = sorted(f for f in os.listdir(output_folder) if f.endswith('.txt'))

    all_data = []

    # Read all text files and store the contents
    for text_file in text_files:
        file_path = os.path.join(output_folder, text_file)
        if os.path.abspath(file_path) == combined_abs:
            continue  # the output file itself is not an input
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()  # Read the file and strip unnecessary newlines
        if content:  # an empty file would otherwise add a stray separator
            all_data.append(content)

    combined_dir = os.path.dirname(combined_file)
    if combined_dir:
        os.makedirs(combined_dir, exist_ok=True)

    # Write combined data into one file (blank line between files, none after the last)
    with open(combined_file, 'w', encoding='utf-8') as out_file:
        out_file.write("\n\n".join(all_data))

    print(f"All text files have been combined into {combined_file}")

# Merge every per-PDF text file found in output_folder_path into the single file combined_output_file
combine_text_files(output_folder_path, combined_output_file)


All text files have been combined into /content/drive/Shareddrives/DATA298B_Final/Data/Final_QA_OpenAI/final_qa_openai.txt
