In [1]:
import pandas as pd
import numpy as np

import config
import openai

from urllib.request import urlopen
from bs4 import BeautifulSoup

openai.api_key = config.OPENAI_key

pd.set_option('display.max_colwidth', None)

In [2]:
# function def -- this function gets the content of the Assistant response
def ChatCompletion_content(
        messages = [],
        model = "gpt-3.5-turbo",
        temperature = 0,
):
    response = openai.ChatCompletion.create(
        model = model,
        temperature = temperature,
        messages = messages
    )

    return(response.get("choices")[0].get("message").get("content"))

In [3]:
# load the sq1 URLs into a dataframe and clean it
df = pd.read_csv(r"C:\Users\Hooman Deghani\OneDrive\PC\Desktop\Input\gpt-3.5-turbo\xml_sitemap_urls.csv")

df = df.drop(columns= ['Unnamed: 0', 'lastmod', 'image', 'changefreq', 'priority'])

df.loc[:, ['Type1', 'Type2', 'Type3', 'Type4', 'Type5', 'Subtype5']] = ''

df = df.rename(columns= {"loc": "url"})

df.shape

(1376, 7)

In [4]:
# function def: extract main text from a url
def extract_text(url): 

    html = urlopen(url).read()
    soup = BeautifulSoup(html, features="html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    
    return(text)

text = extract_text("https://www.squareone.ca/resource-centres/home-buying-selling-moving/buying-a-home-for-the-first-time")
url = "https://www.squareone.ca/resource-centres/home-buying-selling-moving/buying-a-home-for-the-first-time"

text = text.replace("""Read related articles
Check out these related articles:
12 tips for decorating your shelves16 ideas for designing a stylish and functional mudroom16+ living room design ideas for your home19+ home office design ideas19+ laundry room design ideas for your home
Get a free quote
Get a personalized online home insurance quote in just 5 minutes and see how much money you can save by switching to Square One.
Get an online quote now
Protect your family
Even when you take precautions, accidents can happen. Home insurance is one way to protect your family against financial losses from accidents. And, home insurance can start from as little as $12/month.
Learn more
English
Français
Call us
MON - SAT
5:00 AM - 6:00 PM
1.855.331.6933
Available
Company
About us
Why us
Reviews
Careers
Contact us
Resources
Helpful articles
Common questions
News + media
Report a claim
Service concerns
Legal
Terms of use
Privacy policy
Transparency
Licenses + underwriters
Site map
Insurance is sold by Square One Insurance Services (1410-650 W Georgia St, Vancouver, BC V6B 4N8). Home insurance is underwritten by The Mutual Fire Insurance Company of British Columbia. Legal protection insurance (not sold in Quebec) is underwritten by HDI Global Specialty SE.""", "")
text = text[len(text)//2:]
text = text[len(text)//2:]
text = text[len(text)//2:]

In [5]:
# define the types dictionary
dic = {
    # Always return "home" for type 1 without exception.
    "Type1": {"home": "", "auto": "", "ns": "Webpage does not discuss any type of home or auto insurance."},
    # Type2: Type2 does not have a "ns" option.
    "Type2": {"product page": "url does not have 'resource-centres' in it.", "article": "url has 'resource centre in it'"},
    # Type3: If the webpage is a product page, it falls under one of the following categories. If it's an article, choose "ns".
    "Type3": {"tenant": "product page has 'tenant' in the url.", "home": "product page has 'home' in the url.", "other": "Includes condo, rental, detached house, content, etc.", "ns": "not specified; for pages that don’t discusss any type of residential insurance at all"},
    # Type4: Location: These are the provinces we sell our products in. If the webpage does not talk about a specific province, select "ns".
    "Type4": {"BC": "", "Ontario": "", "Quebec": "", "Alberta": "", "Saskatchewan": "", "Manitoba": "", "ns": "not specified; the webpage could talk about the entirety of Canada, or not be location-specific at all."},
    # Type5 : Make your selection for type5 based on the url. 
    "Type5": {
        "home insurance basics": "home insurance basics articles have 'insurance-basics' in the url.",
        "home insurance glossary": "glossary articles almost always have 'insurance-glossary' in the url.",
        "home safety": "home safety articles have 'home-personal-safety' in the url.",
        "personal safety": "personal safety articles have 'home-personal-safety' in the url.",
        "home buying": "home buying articles have 'home-buying-selling-moving' in the url.",
        "home selling": "home selling articles have 'home-buying-selling-moving' in the url.",
        "moving": "moving articles have 'home-buying-selling-moving' in the url.",
        "homeowners": "homeowner articles have 'homeowner' in the url.",
        "home improvement": "home improvement articles have 'home-improvement' in the url.",
        "getting to know your home": "getting to know your home articles have 'getting-to-know-your-home' in the url.",
        "condo owners": "condo owner articles have 'condo-owner' in the url.",
        "interior design": "interior design articles have 'interior-design' in the url.",
        "landlords": "landlords articles have 'landlord' in the url.",
        "renters": "renters articles have 'renter' in the url.",
        "templates": "template articles have 'template' in the url.",
        "ns": "for pages that are not articles select ns."
    },
    # Subtype5: The specific type of the article. Each specific type falls under a general type taken from Type5.
    "Subtype5": {
        # for each of type5 general topics above, select a specific topic from the list and insert it in Subtype5.
        "home insurance basics": ["premiums", "deductible", "coverage types", "loss types", "specialty coverages"],
        "home insurance glossary": ["Coverage terms", "Losses + Claims terms", "Insurance industry terms"],
        "home safety": ["Theft", "Fire safety", "Earthquakes", "Flood", "Water damage", "Pests", "Storms", "Winter", "Guns"],
        "personal safety": ["Digital safety", "Theft"],
        "home buying": [
            "home inspection",
            "recreational properties",
            "fees",
            "mortgage",
            "first-time home buyers",
            "inheriting a home",
            "realtors",
            "log homes",
            "latent and patent defects Quebec",
            "General tips",
        ],
        "home selling": ["fees", "realtors"],
        "moving": [],
        "homeowners": [
            "vacant dwellings",
            "neighbors",
            "garden",
            "pool",
            "insurance",
            "legal",
            "Airbnb",
            "Seasons (Winter, Summer, etc.)",
            "Sustainable homes",
        ],
        "home improvement": ["home improvement", "home maintenance"],
        "getting to know your home": [
            "heating",
            "electrical panels",
            "water leak detection system",
            "backwater valves",
            "furnaces",
            "wiring",
            "basement",
            "driveway",
            "gutters",
            "pipes and plumbing",
            "drain systems",
            "floor",
            "attics",
            "toilet",
            "windows",
            "walls",
            "water heaters",
            "doors",
            "septic tanks",
            "refrigerator",
            "decks and patios",
        ],
        "condo owners": [
            "Co-ownership",
            "Common Property",
            "Basics",
        ],
        "interior design": [
            "Home office",
            "Bedroom",
            "Shelves",
            "Living room",
            "Feng Shui",
            "Holidays",
            "Laundry room",
            "Kitchen",
            "Floor",
            "Couches",
            "Mudroom",
            "Walls",
            "Yard, patio, balconies",
            "Bathroom",
        ],
        "landlords": ["Before finding tenants", "Repairs", "Insurance", "Leases"],
        "renters": [
            "Before finding a place",
            "Eviction",
            "Insurance",
            "Students",
            "Roommates",
            "Theft",
            "Security",
        ],
        "templates": ["Any article whose primary purpose is to provide a template to the reader has its Subtype5 categorized as templates"],
        "ns": "if the page is not an article, select 'ns' for Subtype5."
    },
}


In [9]:
# draft for system message
prompt = f"""
    I want to have a directory of all URLs in our website. In this directory, each URL is labeled by a variety of 'types' according to different categories it falls into.
    There are 6 categories, named Type1, Type2, Type3, Type4, Type5, and Subtype5. Each type contains different labels. Each url will have one of these labels for each of these types. 

    You will receive three components for a URL: the URL itself, the text content of the URL, and a python dictionary containing all the types and the labels inside.
    The dictionary contains the necessary explanations for you to make the right selection. The explanations for each type are given as a python comment above the said type, and the explanations for type1 to type5 keys
    are given as the values of those keys.

    The URL is delimited in triple hashtags (###):
    ###
    URL = {url}
    ###

    Data: The dictionary is delimited in triple backticks(```):
    ```
    Ditionary = {dic}
    ```

    The text for the URL is delimited in triple stars (***):
    ***
    Text = {text}
    ***

    Step by step on what to do: 
    1- Based on the instructions in the Dictionary, the URL, and the text of the URL, return a python dictionary with the following format: {{"Type1": "", "Type2": "", "Type3": "", "Type4": "", "Type5": "", "Subtype5": ""}}. Next, print your reasoning for why you selected each category. delimit your response for this step in xml tags named <1></1>. 
    2- Based on the instructions in the Dictionary, the URL, the text of the URL, and your reasoning in step 1, return a python dictionary with the following format: {{"Type1": "", "Type2": "", "Type3": "", "Type4": "", "Type5": "", "Subtype5": ""}} delimit your response for this step in xml tags named <2></2>. Remember; for type1 to type5 select a key from the dictionary. For subtype5 select an item from the list which is the value for the keys in the dictionary. No need to provide your reasoning for step 2. Simply provide the python dictionary.

    Few shot examples:
    Here are two example outputs delimited in xml tags:
    <example1>
    URL: https://www.squareone.ca/home
    text: The content of the page is over 1000 tokens long. It reveals that the page is a product page selling home insurance.
    <1>
    {{
        "Type1": "home",
        "Type2": "product page",
        "Type3": "other",
        "Type4": "BC",
        "Type5": "ns",
        "Subtype5": "ns"
    }}
    </1>
    <2>
    {{
        "Type1": "home",
        "Type2": "product page",
        "Type3": "home",
        "Type4": "BC",
        "Type5": "ns",
        "Subtype5": "ns"
    }}  
    </example1>

    <example2>
    URL: https://www.squareone.ca/resource-centres/home-improvement/home-maintenance-schedules
    text: The content of the page is over 1500 tokens long. It reveals that the page is a guide on preparing long-term and short-term home maintenance schedules.
    <1>
    {{
        "Type1": "home",
        "Type2": "article",
        "Type3": "ns",
        "Type4": "ns",
        "Type5": "home improvement",
        "Subtype5": "home improvement"
    }}
    </1>
    <2>
    {{
        "Type1": "home",
        "Type2": "article",
        "Type3": "ns",
        "Type4": "ns",
        "Type5": "home improvement",
        "Subtype5": "home maintenance"
    }}    
    </example2>
    <
"""

In [11]:
response = ChatCompletion_content(
    messages = [
        {"role": "user", "content": f"{prompt}"}
    ]
)

# remove the instructions in the dictionary and feed them to gpt as steps. he will make the selection for each step one by one, like this:
# 1- step 1: select the type of the webpage. delimit your answer in an xml tag titled <type1></type1>. Example: <type1>article</type1> Make a selection from the keys in the following dictionary: Type 1 = {"home": "", "auto"}

print(response)

<1>
Based on the instructions in the Dictionary, the URL, and the text of the URL, the selected categories are as follows:

Type1: "home" - The URL contains "home" in it.
Type2: "article" - The URL has "resource centre" in it.
Type3: "other" - The URL does not contain any specific keywords for this category.
Type4: "BC" - The URL does not contain any specific keywords for this category.
Type5: "ns" - The URL does not contain any specific keywords for this category.
Subtype5: "ns" - The URL does not contain any specific keywords for this category.

</1>
<2>
{
    "Type1": "home",
    "Type2": "article",
    "Type3": "other",
    "Type4": "BC",
    "Type5": "ns",
    "Subtype5": "ns"
}
</2>


In [8]:
print(len(text))

6248
