In [137]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 
import re
import os
import pdfplumber
import json


### Scrape URL

In [138]:
from bs4 import BeautifulSoup, NavigableString
import requests

def extract_text(url, timeout=30):
    """Download a web page and return the text of its rich-text containers.

    The page is fetched with ``requests`` and parsed with BeautifulSoup.
    Every ``div``/``section`` whose ``class`` matches one of the class
    strings listed below is visited, and the text of each of its direct
    children is collected in document order.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up
            (passed to ``requests.get``).

    Returns:
        A single string with all non-empty text fragments joined by one
        space. Empty string when no matching container is found.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of scraping the body of an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # Containers that hold the descriptive product text.
    containers = soup.find_all(
        ["div", "section"],
        class_=[
            "RichtextArea ProductPage__richtext text-wrapper",
            "layout--2col wrapper",
            "RichtextBlock ProductPage__richtextBlock",
        ],
    )

    fragments = []
    for container in containers:
        for child in container.children:
            if isinstance(child, NavigableString):
                fragment = child.strip()
            else:
                fragment = child.get_text(strip=True)
            # Whitespace-only nodes between tags would otherwise add
            # empty entries and produce runs of spaces in the result.
            if fragment:
                fragments.append(fragment)

    return ' '.join(fragments)


### Scrape PDF

In [139]:
def scrape_pdf_text(url, max_pages=10, footer_height=70, timeout=30):
    """Download a PDF and return its text, one extracted line per output line.

    The PDF is parsed directly from memory, so no temporary file is
    written and nothing is left behind on early return or on error.

    Args:
        url: Address of the PDF file.
        max_pages: PDFs with more pages than this are skipped.
        footer_height: Height, in PDF units, cut off from the bottom of the
            last page before extracting text (70 units is roughly 2.5 cm at
            72 units per inch).
        timeout: Seconds to wait for the server (passed to ``requests.get``).

    Returns:
        The extracted text with every ``®`` removed, or ``None`` when the
        PDF has more than ``max_pages`` pages. Lines starting with ``•`` are
        preceded by a newline; all other lines are followed by one.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    import io  # local import so the cell does not depend on another cell's imports

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    with pdfplumber.open(io.BytesIO(response.content)) as pdf:
        pages = pdf.pages
        if len(pages) > max_pages:
            return None

        chunks = []
        last_index = len(pages) - 1
        for i, page in enumerate(pages):
            # Drop the bottom strip of the last page before extraction.
            if i == last_index:
                page = page.crop((0, 0, page.width, page.height - footer_height))

            # Guard against extract_text() yielding None for a page
            # without a text layer.
            page_text = page.extract_text() or ''

            for line in page_text.split('\n'):
                line = line.replace('®', '')  # Remove ® symbol
                if line.startswith('•'):
                    chunks.append('\n' + line)  # Bullet point starts on a new line
                else:
                    chunks.append(line + '\n')

    return ''.join(chunks)

### Generate GPT blocks

In [140]:
# System prompt shared by generate_blocks_azure() and generate_blocks_openai().
# It instructs the model to re-format scraped product text into an
# "introduction" block plus "block1".."block3", each with a heading and text,
# and to answer in JSON. It ends with one worked example (ASR x50).
#
# NOTE(review): the wording is part of the model input, so any edit changes
# the generated output. Points worth confirming with the prompt author:
#   - it asks for "a maximum of three such blocks" and then lists "four
#     content blocks";
#   - it asks for at least 100 words per block and also states
#     "(0 - 300 words) per block";
#   - the quoted "text:" instruction in the introduction template is never
#     closed.
system_prompt_json_blocks = """
    Read the whole file. Task: You are a content/format writer tasked with converting information from a PDF file into a a selected format. Only use the content found in the provided file. If there's insufficient information, insert "I don't know" in the respective block. Use the same sentences, same way of writing in the blocks if that is possible. Do not add any new information, and keep changes to a minimum, you are a formater, more than a content writer. Include a minimum of one and a maximum of three such blocks. If the product pdf offer more text, use three blocks. If the text has headings, use these headings. Have at elast 100 words in each block and add a suitable heading for each block. Do not use features or technical specification as heading the blocks or information from the features or technical specification as content. Do not use information about 'Manufacturer and Ordering Information' either.You do not need to use all the text to write the blocks, use the text that is most suitable.

    Write the following four content blocks with (0 - 300 words) per block:

    introduction: {
    heading: "Introduction",
    text: "Use the first text provided of the product to write a block of text that can be used as a introduction. Use the heading "introduction" and use the first paragraph in the text. 
    }

    "block1":{ 
    Provide factual information blocks explaining the value that the products bring. The product often have a suiteable heading, use this heading and the text after. 
    }

    "block2":{
    Use the information provided of the product to write a block of text that describes the product or a different side of the product. The product often have a suiteable heading, use this heading and the text after. 
    }

    "block3":{ 
    Use the information provided of the product to write a block of text that describes the product. The product often have a suiteable heading, use this heading and the text after. 
    }

    For example if the product is a AIS receiver, ASR x50, the following could be a suitable system prompt:
    {
        "introduction": {
        "heading": "Introduction",
        "text": "ASR x50 is the 4th generation SAT-AIS receiver from Kongsberg and part of the extended lifetime product series. The receiver is a reconfigurable SDR based receiver, designed to support simultaneous on-board AIS decoding and digital sampling. ASR x50 has, through new enhanced algorithms, multi-antenna support and superior dynamic range, an improved end-to-end performance. It is designed for a 7+ year lifetime and takes vessel detection via AIS to the next level."
        },
        "block1": {
        "heading": "Innovative technology",
        "text": "This generation SAT-AIS receiver from Kongsberg is the latest achievement of years of continuous innovations resulting in highest decoder performance, multi-antenna support, built-in redundancy, low power, miniaturized housing, large mass memory, and improved lifetime. The end-to-end performance exceeds existing SAT-AIS receivers, where the superior sensitivity of the ASR x50 makes the receiver capable of detecting even AIS class B vessels. Reconfigurable software-defined radio (SDR) technology is used, enabling support for future enhancements in algorithms or changes in AIS/VDES standards."
        },
        "block2": {
        "heading": "Vessel detection performance to the next level",
        "text": "Kongsberg started working with AIS twenty years ago and is the AIS equipment manufacturer with the broadest experience. ASR x50 is Kongsberg’s 4th generation AIS Space Receiver and builds on this foundation of expertise. A multiple set of decollision algorithms is optimized for the best possible vessel detection in high-density and medium-density areas. ASR x50 will give the end user a giant leap in vessel detection compared with existing SAT-AIS receivers."
        },
        "block3": {
        "heading": "Space grade using latest technologies",
        "text": "The extended lifetime series from Kongsberg is designed for a lifetime of 7 + years in LEO. ASR x50 uses the latest generation EEE parts from best-in-class manufacturers. This enables Kongsberg to design for leading capabilities at low power and miniature size. All EEE parts have been carefully selected and extensively tested. Active components have been subject to heavy ion, proton, and Co-60 test campaigns to ensure radiation-tolerant design."
        }
    }
    
    Return as Json with the same formats as always."""

#### Azure OpenAI

In [141]:
def generate_blocks_azure(text):
    """Format product text into JSON content blocks via Azure OpenAI.

    Sends ``system_prompt_json_blocks`` as the system message and ``text``
    embedded in a fixed user instruction, requesting a JSON object reply.

    Configuration is read from the environment so that no credential lives
    in the notebook:

    * ``AZURE_OPENAI_API_KEY`` (required)
    * ``AZURE_OPENAI_ENDPOINT`` (optional, defaults to the test resource)
    * ``OPENAI_API_VERSION`` (optional, defaults to ``2023-05-15``)

    Args:
        text: Raw product text to be formatted.

    Returns:
        The model's reply (expected to be a JSON document) as a stripped
        string.

    Raises:
        RuntimeError: If ``AZURE_OPENAI_API_KEY`` is not set.
    """
    # Validate configuration before importing the client.
    # SECURITY: the key that used to be hard-coded here has been exposed in
    # source control and should be rotated.
    api_key = os.environ.get("AZURE_OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "AZURE_OPENAI_API_KEY is not set; export it instead of "
            "hard-coding the key in the notebook."
        )

    from openai import AzureOpenAI

    # Name of the Azure deployment (chosen when the model was deployed).
    deployment_name = "gpt-4-turbo-1106-preview"

    client = AzureOpenAI(
        azure_endpoint=os.environ.get(
            "AZURE_OPENAI_ENDPOINT", "https://da-openai-test.openai.azure.com/"
        ),
        api_key=api_key,
        # NOTE(review): JSON mode (`response_format`) may require a newer
        # Azure API version than 2023-05-15 — confirm against the Azure docs.
        api_version=os.environ.get("OPENAI_API_VERSION", "2023-05-15"),
    )

    messages = [
        {
            "role": "system",
            "content": system_prompt_json_blocks
        },
        {
            "role": "user",
            "content": f"I have a text that I want to format into headings and text bulks. The text is:\n\n{text}\n\n Please format this text using the same words and languange as the text. Do not change the text to much, and do not add information that is not there. Do not headings such as features or technical specification. Always return as json."
        }
    ]

    response = client.chat.completions.create(
        model=deployment_name,
        messages=messages,
        response_format={"type": "json_object"},
        # Low temperature / top_p as in the original call.
        temperature=0.1,
        top_p=0.1,
    )

    return response.choices[0].message.content.strip()

#### OpenAI

In [142]:
def generate_blocks_openai(text):
    """Format product text into JSON content blocks via the OpenAI API.

    Sends ``system_prompt_json_blocks`` as the system message and ``text``
    embedded in a fixed user instruction to ``gpt-4-1106-preview``,
    requesting a JSON object reply. The API key is read from Streamlit
    secrets (``st.secrets["openai"]["api_key"]``).

    Args:
        text: Raw product text to be formatted.

    Returns:
        The model's reply (expected to be a JSON document) as a stripped
        string.
    """
    import streamlit as st
    # Import the client from the public package rather than the private
    # `openai._client` module.
    from openai import OpenAI

    client = OpenAI(api_key=st.secrets["openai"]["api_key"])

    messages = [
        {
            "role": "system",
            "content": system_prompt_json_blocks
        },
        {
            "role": "user",
            "content": f"I have a text that I want to format into headings and text bulks. The text is:\n\n{text}\n\n Please format this text using the same words and languange as the text. Do not change the text to much, and do not add information that is not there. Do not headings such as features or technical specification. Always return as json."
        }
    ]

    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        response_format={"type": "json_object"},
        messages=messages,
    )
    return response.choices[0].message.content.strip()

### Iterating through file

In [143]:
# all_products_with_columns = pd.read_excel("data/all_products_dropped_discontinuid.xlsx", usecols=["Product_Name","Product category", "Features", "Technical Specifications", "url", "Data sheets", "downloads", "product_family_name", "is_range"])
# # save it as csv
# all_products_with_columns.to_csv("data/all_products_block123.csv", index=False)


In [144]:
# all_products_with_columns = pd.read_csv("data/all_products_block123.csv")

# # Initialize the new columns with empty strings
# all_products_with_columns['Introduction'] = ''
# all_products_with_columns['Block1'] = ''
# all_products_with_columns['Block2'] = ''
# all_products_with_columns['Block3'] = ''

# # Convert the columns to string dtype
# all_products_with_columns['Introduction'] = all_products_with_columns['Introduction'].astype(str)
# all_products_with_columns['Block1'] = all_products_with_columns['Block1'].astype(str)
# all_products_with_columns['Block2'] = all_products_with_columns['Block2'].astype(str)
# all_products_with_columns['Block3'] = all_products_with_columns['Block3'].astype(str)

# # Define the new order of the columns
# new_column_order = ['Product_Name', 'Product category', 'Introduction', 'Block1', 'Block2', 'Block3', 'Features', 'Technical Specifications', 'url', 'Data sheets', 'downloads', 'product_family_name', 'is_range']

# # Reorder the columns
# all_products_with_columns = all_products_with_columns.reindex(columns=new_column_order)

# # Now the 'Introduction', 'Block1', 'Block2', and 'Block3' columns will appear after 'Product category'

# all_products_with_columns.to_csv("data/all_products_block123.csv", index=False)


In [148]:
# Fill the Introduction/Block1..3 columns for every product that does not
# have an introduction yet. Progress is written back after each product so
# that an interrupted run can be resumed by re-running the cell.
CSV_PATH = "data/all_products_block123.csv"
XLSX_PATH = "data/all_products_block123.xlsx"

# DataFrame column -> (lower-cased) key expected in the model's JSON reply.
BLOCK_COLUMNS = {
    "Introduction": "introduction",
    "Block1": "block1",
    "Block2": "block2",
    "Block3": "block3",
}

all_products_with_columns = pd.read_csv(CSV_PATH)

# Cast the target columns once, to `object`, so strings can be assigned
# without a dtype warning while missing values stay missing. Casting to
# `str` would turn them into the literal text 'nan', which then shows up in
# the Excel export.
all_products_with_columns = all_products_with_columns.astype(
    {column: object for column in BLOCK_COLUMNS}
)

for index, row in all_products_with_columns.iterrows():
    if not pd.isna(row["Introduction"]):
        continue  # already processed in an earlier run

    print(index, "product_name:", row["Product_Name"])

    # Prefer the data sheet; fall back to the product page when there is no
    # data sheet or when it yields no text (scrape_pdf_text returns None for
    # PDFs above its page limit).
    input_text = None
    if not pd.isna(row["Data sheets"]):
        print(row["Data sheets"])
        input_text = scrape_pdf_text(row["Data sheets"])
    if not input_text and not pd.isna(row["url"]):
        print(row["url"])
        input_text = extract_text(row["url"])
    if not input_text:
        # Nothing to send to the model; leave the row empty for a later run.
        print("No source text found, skipping")
        continue

    json_output = generate_blocks_openai(input_text)
    # Parse the JSON data; keys are lower-cased because the model does not
    # always use the same capitalisation.
    data = json.loads(json_output)
    data = {key.lower(): value for key, value in data.items()}

    for column, key in BLOCK_COLUMNS.items():
        block = data.get(key, '')  # empty string when the model omitted it
        print(block)
        # Stored via str(), i.e. the textual form of whatever was returned.
        all_products_with_columns.loc[index, column] = str(block)

    # Checkpoint after every product.
    all_products_with_columns.to_csv(CSV_PATH, index=False)
    all_products_with_columns.to_excel(XLSX_PATH, index=False)

all_products_with_columns.to_csv(CSV_PATH, index=False)
all_products_with_columns.to_excel(XLSX_PATH, index=False)


71 product_name: EM 2040C MKII Multibeam echosounder, Max. 500 m
https://www.kongsberg.com/contentassets/94ea2adb75394001a54d852d0920b420/443809ad_em2040c_mk2_data_sheet_slim_pu.pdf
{'heading': 'Introduction', 'text': 'The EM 2040C MKII is a shallow water multibeam echo sounder based on EM 2040 technology. It is an ideal tool for any high resolution mapping and inspection application. With the release of the EM 2040 MKII series Kongsberg Maritime has upgraded the hardware and software to increase the swath and improve the data quality of our EM 2040 series.'}
{'heading': 'Key facts and Operating Performance', 'text': 'The system fulfils, and even surpasses, IHO-S44 Exclusive with one sonar head, allowing coverage of 5.5 times water depth and the more stringent LINZ specification. For a dual transducer system, 200° angular coverage or 10 times the water depth is achieved on a flat bottom. The operating frequency range is from 200 to 400 kHz with frequency selection in steps of 10 kHz. T