In [1]:
import pandas as pd
import numpy as np

import config
import openai

from urllib.request import urlopen
from bs4 import BeautifulSoup

# hand the API key read from the `config` module to the openai client
openai.api_key = config.OPENAI_key

# never truncate long cell values (e.g. full URLs) when a DataFrame is displayed
pd.set_option('display.max_colwidth', None)

In [2]:
# function def -- this function gets the content of the Assistant response
def ChatCompletion_content(
        messages = None,
        model = "gpt-3.5-turbo",
        temperature = 0,
):
    """Request a chat completion and return the text of the first choice.

    Parameters
    ----------
    messages : list, optional
        Passed as the `messages` argument of `openai.ChatCompletion.create`.
        When omitted (or None) an empty list is sent, which is what the
        previous `[]` default did.
    model : str
        Passed through as `model`.
    temperature : int or float
        Passed through as `temperature`.

    Returns
    -------
    The value found at choices[0] -> "message" -> "content" in the response.
    """
    # Build a fresh list per call instead of sharing one mutable default
    # object between every call of the function.
    if messages is None:
        messages = []

    response = openai.ChatCompletion.create(
        model = model,
        temperature = temperature,
        messages = messages
    )

    return response.get("choices")[0].get("message").get("content")

In [3]:
# Load the sq1 sitemap URLs and reduce the frame to the columns used for labelling.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
SITEMAP_CSV = r"C:\Users\Hooman Deghani\OneDrive\PC\Desktop\Input\gpt-3.5-turbo\xml_sitemap_urls.csv"
UNUSED_COLUMNS = ['Unnamed: 0', 'lastmod', 'image', 'changefreq', 'priority']
LABEL_COLUMNS = ['home/auto', 'type', 'location', 'topic-product', 'topic-article', 'subtopic-article']

# read the CSV, drop the unused sitemap fields and rename "loc" to "url"
df = (
    pd.read_csv(SITEMAP_CSV)
    .drop(columns=UNUSED_COLUMNS)
    .rename(columns={"loc": "url"})
)

# add the label columns, every cell initialised to the empty string
df.loc[:, LABEL_COLUMNS] = ''

# (rows, columns) of the cleaned frame
df.shape

(1376, 7)

In [84]:
# Rule-based labelling: fill the label columns of `df` from patterns in each row's URL.

def url_matches(pattern):
    """Return a boolean Series marking the rows of `df` whose 'url' matches the regex `pattern`."""
    return df.loc[:, 'url'].str.contains(pattern)

# fill the dataframe home/auto column: every row is labelled 'home'
df.loc[:, 'home/auto'] = 'home'

# fill the type column: resource-centre URLs are articles, every other row is a product
filt = url_matches("/resource-centres|/centres-ressources")
df.loc[filt, 'type'] = 'article'
df.loc[~filt, 'type'] = 'product'

# fill the topic-product column
    # home and tenant
product_home = url_matches("/home/|/fr/habitation")
product_tenant = url_matches("/tenant/|/fr/locataire")
df.loc[product_home, 'topic-product'] = 'home'
df.loc[product_tenant, 'topic-product'] = 'tenant'

    # other: non-article rows that match neither the home nor the tenant pattern
df.loc[~(product_home | product_tenant | filt), 'topic-product'] = 'other'

    # ns: careers, news, support and article rows. This assignment runs last,
    # so it overrides any 'home' / 'tenant' / 'other' label set above.
career = url_matches('/careers/|/fr/carrieres')
news = url_matches('/news/|/fr/actualites')
support = url_matches('/support/|/soutien/')
df.loc[career | news | support | filt, 'topic-product'] = 'ns'

# fill the 'topic-article' column
# label -> URL regex (two alternative section paths per topic; the original notes
# describe them as the English and French spellings).
# Entries are applied in order, so a later entry wins if a URL matches more than one.
TOPIC_ARTICLE_PATTERNS = {
    'home insurance basics': '/insurance-basics/|/bases-assurance/',
    'insurance glossary': '/insurance-glossary/|/lexique-assurance/',
    'home-personal-safety': '/home-personal-safety/|/securite-domiciliaire-personnelle',
    'home-buying-selling-moving': '/home-buying-selling-moving/|/achat-vente-demenagement-habitation/',
    'home-improvement': '/home-improvement/|/ameliorations-domiciliaires/',
    'getting-to-know-your-home': '/getting-to-know-your-home/|/connaitre-votre-habitation/',
}
topic_masks = {label: url_matches(pattern) for label, pattern in TOPIC_ARTICLE_PATTERNS.items()}
for label, mask in topic_masks.items():
    df.loc[mask, 'topic-article'] = label

# keep the individual mask names defined (same order as the table above)
# in case other cells refer to them
basics, glossary, safety, moving, improvement, know = topic_masks.values()

# preview the first rows labelled 'getting-to-know-your-home'
df.loc[know].head(10)

Unnamed: 0,url,home/auto,type,location,topic-product,topic-article,subtopic-article
1106,https://www.squareone.ca/resource-centres/getting-to-know-your-home/guide-copper-pipes-plumbing,home,article,,ns,getting-to-know-your-home,
1107,https://www.squareone.ca/resource-centres/getting-to-know-your-home/decks-and-patios,home,article,,ns,getting-to-know-your-home,
1108,https://www.squareone.ca/resource-centres/getting-to-know-your-home/aluminum-wiring,home,article,,ns,getting-to-know-your-home,
1109,https://www.squareone.ca/resource-centres/getting-to-know-your-home/bidets,home,article,,ns,getting-to-know-your-home,
1110,https://www.squareone.ca/resource-centres/getting-to-know-your-home/basements,home,article,,ns,getting-to-know-your-home,
1111,https://www.squareone.ca/resource-centres/getting-to-know-your-home/your-windows,home,article,,ns,getting-to-know-your-home,
1112,https://www.squareone.ca/resource-centres/getting-to-know-your-home/sump-pump,home,article,,ns,getting-to-know-your-home,
1113,https://www.squareone.ca/resource-centres/getting-to-know-your-home/refrigerator,home,article,,ns,getting-to-know-your-home,
1114,https://www.squareone.ca/resource-centres/getting-to-know-your-home/gutters-and-downspouts,home,article,,ns,getting-to-know-your-home,
1115,https://www.squareone.ca/resource-centres/getting-to-know-your-home/your-doors,home,article,,ns,getting-to-know-your-home,


In [None]:
    4- Return the topic of article.<dic> dic = {{
        "home insurance basics": "There is '/insurance-basics' in {url}.",
        "home insurance glossary": "There is '/insurance-glossary' in {url}.",
        "home safety": "There is '/home-personal-safety' in {url}.",
        "personal safety": "There is '/home-personal-safety' in {url}.",
        "home buying": "There is '/home-buying-selling-moving' in {url}.",
        "home selling": "There is '/home-buying-selling-moving' in {url}.",
        "moving": "There is '/home-buying-selling-moving' in {url}.",
        "homeowners": "There is '/homeowner' in {url}.",
        "home improvement": "There is '/home-improvement' in {url}.",
        "getting to know your home": "There is '/getting-to-know-your-home' in {url}.",
        "condo owners": "There is '/condo-owner' in {url}.",
        "interior design": "There is '/interior-design' in {url}.",
        "landlords": "There is '/landlord' in {url}.",
        "renters": "There is '/renter' in {url}.",
        "templates": "There is '/template' in {url}.",
    }} </dic> \

In [5]:
# have gpt fill location, and subtopic-article based on text and url

In [6]:
# function def: extract main text from a url
def extract_text(url):
    """Download `url` and return the page's text, one non-empty chunk per line.

    The page is parsed with BeautifulSoup's "html.parser"; <script> and
    <style> elements are removed before the text is collected.

    Parameters
    ----------
    url : str
        Address handed to `urlopen`.

    Returns
    -------
    str
        The stripped, non-empty text chunks joined with newlines.
    """
    # the context manager closes the HTTP response once the body has been read
    with urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, features="html.parser")

    # remove all script and style elements so their contents are not in the text
    for tag in soup(["script", "style"]):
        tag.extract()

    text = soup.get_text()

    # strip leading and trailing whitespace from every line
    lines = (line.strip() for line in text.splitlines())
    # split each line on double spaces so every phrase becomes its own chunk
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop empty chunks and join the rest, one per line
    return '\n'.join(chunk for chunk in chunks if chunk)

# Sample article used while drafting the prompt: keep its URL and fetch its text
url = "https://www.squareone.ca/resource-centres/home-buying-selling-moving/buying-a-home-for-the-first-time"
text = extract_text(url)

# Site boilerplate (related-articles list, quote call-outs, footer links).
# It is removed only where this exact string occurs in the extracted text.
PAGE_BOILERPLATE = """Read related articles
Check out these related articles:
12 tips for decorating your shelves16 ideas for designing a stylish and functional mudroom16+ living room design ideas for your home19+ home office design ideas19+ laundry room design ideas for your home
Get a free quote
Get a personalized online home insurance quote in just 5 minutes and see how much money you can save by switching to Square One.
Get an online quote now
Protect your family
Even when you take precautions, accidents can happen. Home insurance is one way to protect your family against financial losses from accidents. And, home insurance can start from as little as $12/month.
Learn more
English
Français
Call us
MON - SAT
5:00 AM - 6:00 PM
1.855.331.6933
Available
Company
About us
Why us
Reviews
Careers
Contact us
Resources
Helpful articles
Common questions
News + media
Report a claim
Service concerns
Legal
Terms of use
Privacy policy
Transparency
Licenses + underwriters
Site map
Insurance is sold by Square One Insurance Services (1410-650 W Georgia St, Vancouver, BC V6B 4N8). Home insurance is underwritten by The Mutual Fire Insurance Company of British Columbia. Legal protection insurance (not sold in Quebec) is underwritten by HDI Global Specialty SE."""
text = text.replace(PAGE_BOILERPLATE, "")

# halve the text twice, keeping the second half each time
for _ in range(2):
    text = text[len(text)//2:]

In [7]:
# define the types dictionary
# Maps each label category to its allowed values. For Type1-Type5 every value is
# paired with a short description of when it applies; Subtype5 maps each Type5
# topic to its list of allowed subtopics.
# NOTE(review): in the visible cells the prompt spells these options out inline
# rather than reading `dic` -- keep the two in sync when editing either.
dic = {
    # Always return "home" for type 1 without exception.
    "Type1": {"home": "", "auto": "", "ns": "Webpage does not discuss any type of home or auto insurance."},
    # Type2: Type2 does not have a "ns" option.
    "Type2": {"product page": "url does not have 'resource-centres' in it.", "article": "url has 'resource-centres' in it."},
    # Type3: If the webpage is a product page, it falls under one of the following categories. If it's an article, choose "ns".
    "Type3": {"tenant": "product page has 'tenant' in the url.", "home": "product page has 'home' in the url.", "other": "Includes condo, rental, detached house, content, etc.", "ns": "not specified; for pages that don’t discuss any type of residential insurance at all"},
    # Type4: Location: These are the provinces we sell our products in. If the webpage does not talk about a specific province, select "ns".
    "Type4": {"BC": "", "Ontario": "", "Quebec": "", "Alberta": "", "Saskatchewan": "", "Manitoba": "", "ns": "not specified; the webpage could talk about the entirety of Canada, or not be location-specific at all."},
    # Type5 : Make your selection for type5 based on the url.
    "Type5": {
        "home insurance basics": "home insurance basics articles have 'insurance-basics' in the url.",
        "home insurance glossary": "glossary articles almost always have 'insurance-glossary' in the url.",
        "home safety": "home safety articles have 'home-personal-safety' in the url.",
        "personal safety": "personal safety articles have 'home-personal-safety' in the url.",
        "home buying": "home buying articles have 'home-buying-selling-moving' in the url.",
        "home selling": "home selling articles have 'home-buying-selling-moving' in the url.",
        "moving": "moving articles have 'home-buying-selling-moving' in the url.",
        "homeowners": "homeowner articles have 'homeowner' in the url.",
        "home improvement": "home improvement articles have 'home-improvement' in the url.",
        "getting to know your home": "getting to know your home articles have 'getting-to-know-your-home' in the url.",
        "condo owners": "condo owner articles have 'condo-owner' in the url.",
        "interior design": "interior design articles have 'interior-design' in the url.",
        "landlords": "landlords articles have 'landlord' in the url.",
        "renters": "renters articles have 'renter' in the url.",
        "templates": "template articles have 'template' in the url.",
        "ns": "for pages that are not articles select ns."
    },
    # Subtype5: The specific type of the article. Each specific type falls under a general type taken from Type5.
    "Subtype5": {
        # for each of type5 general topics above, select a specific topic from the list and insert it in Subtype5.
        "home insurance basics": ["premiums", "deductible", "coverage types", "loss types", "specialty coverages"],
        "home insurance glossary": ["Coverage terms", "Losses + Claims terms", "Insurance industry terms"],
        "home safety": ["Theft", "Fire safety", "Earthquakes", "Flood", "Water damage", "Pests", "Storms", "Winter", "Guns"],
        "personal safety": ["Digital safety", "Theft"],
        "home buying": [
            "home inspection",
            "recreational properties",
            "fees",
            "mortgage",
            "first-time home buyers",
            "inheriting a home",
            "realtors",
            "log homes",
            "latent and patent defects Quebec",
            "General tips",
        ],
        "home selling": ["fees", "realtors"],
        # no subtopics are listed for "moving"
        "moving": [],
        "homeowners": [
            "vacant dwellings",
            "neighbors",
            "garden",
            "pool",
            "insurance",
            "legal",
            "Airbnb",
            "Seasons (Winter, Summer, etc.)",
            "Sustainable homes",
        ],
        "home improvement": ["home improvement", "home maintenance"],
        "getting to know your home": [
            "heating",
            "electrical panels",
            "water leak detection system",
            "backwater valves",
            "furnaces",
            "wiring",
            "basement",
            "driveway",
            "gutters",
            "pipes and plumbing",
            "drain systems",
            "floor",
            "attics",
            "toilet",
            "windows",
            "walls",
            "water heaters",
            "doors",
            "septic tanks",
            "refrigerator",
            "decks and patios",
        ],
        "condo owners": [
            "Co-ownership",
            "Common Property",
            "Basics",
        ],
        "interior design": [
            "Home office",
            "Bedroom",
            "Shelves",
            "Living room",
            "Feng Shui",
            "Holidays",
            "Laundry room",
            "Kitchen",
            "Floor",
            "Couches",
            "Mudroom",
            "Walls",
            "Yard, patio, balconies",
            "Bathroom",
        ],
        "landlords": ["Before finding tenants", "Repairs", "Insurance", "Leases"],
        "renters": [
            "Before finding a place",
            "Eviction",
            "Insurance",
            "Students",
            "Roommates",
            "Theft",
            "Security",
        ],
        "templates": ["Any article whose primary purpose is to provide a template to the reader has its Subtype5 categorized as templates"],
        # NOTE(review): unlike every other Subtype5 value this one is a plain string, not a list
        "ns": "if the page is not an article, select 'ns' for Subtype5."
    },
}


In [8]:
# draft for system message
# The f-string is evaluated when this cell runs: `url` and `text` must already be
# defined, and the prompt has to be rebuilt after either of them changes.
# Doubled braces ({{ }}) are literal braces in the prompt; a trailing backslash
# joins the line with the next one.
# NOTE(review): step 1 only treats "/resource-centres" as an article; confirm whether
#   "/centres-ressources" URLs should be treated as articles too.
# NOTE(review): example1 gives "Type 3": "BC" for https://www.squareone.ca/home although
#   step 3 says to return "ns" when a page is not province-specific -- confirm this is intended.
prompt = f"""
    I want to have a directory of all URLs in our website. In this directory, each URL is labeled by a variety of 'types' according to different categories it falls into.
    There are 5 categories: Type 1, Type 2, Type 3, Type 4, and Subtopic.

    For each step you will receive all, a selection of, or none of a url, the contents of a webpage, a list, a dictionary. 

    Now you will receive the step by step instructions. The step by step instructions are delimited in triple backticks (```).

    ```
    Your job is to go through the following steps one by one and return an output for each step.

    1- Return if the webpage is a product page or an article. Make your decision based on the contents of the following list and the {url}. <li> list = ["article", "product page"] </li> If {url} has exactly "/resource-centres" in it, the webpage is an article. Otherwise, it's a product page. \
    Return an item from the list given to you in this step delimited by xml tags <li> and </li>. Print out your reasoning on why you selected this item. Put your answer for this step in xml tags titled <1> </1>.

    2- Determine the type of product page.<dic> dic = {{"tenant": "webpage has 'tenant' in {url}.", "home": "webpage has 'home' in {url}.", "other": "Includes condo, rental, detached house, content, etc."}} </dic> \
    If you selected product page in the step 1, return a value from the dictionary delimited in this step in <dic></dic>. The required information for each key is given as the value of the dictionary. If you selected article in the previous step, return "ns". Print out your reasoning on why you selected this item. Put your answer for this step in xml tags titled <2> </2>.

    3- Return the location of the page.<li> list = ["BC", "Ontario", "Quebec", "Alberta", "Saskatchewan", "Manitoba"] </li> If the article or product page is specific to a province in Canada, \
    return an item from the list delimited in <li></li> in this step. If the webpage is not specific to a province, return "ns". Print out your reasoning on why you selected this item. Put your answer for this step in xml tags titled <3> </3>.

    4- Return the topic of article.<dic> dic = {{
        "home insurance basics": "There is '/insurance-basics' in {url}.",
        "home insurance glossary": "There is '/insurance-glossary' in {url}.",
        "home safety": "There is '/home-personal-safety' in {url}.",
        "personal safety": "There is '/home-personal-safety' in {url}.",
        "home buying": "There is '/home-buying-selling-moving' in {url}.",
        "home selling": "There is '/home-buying-selling-moving' in {url}.",
        "moving": "There is '/home-buying-selling-moving' in {url}.",
        "homeowners": "There is '/homeowner' in {url}.",
        "home improvement": "There is '/home-improvement' in {url}.",
        "getting to know your home": "There is '/getting-to-know-your-home' in {url}.",
        "condo owners": "There is '/condo-owner' in {url}.",
        "interior design": "There is '/interior-design' in {url}.",
        "landlords": "There is '/landlord' in {url}.",
        "renters": "There is '/renter' in {url}.",
        "templates": "There is '/template' in {url}.",
    }} </dic> \
    If you selected "product page" in step 1, return "ns". Otherwise, return an output from dic based on the explanation given for each key as the value of that key. Print out your reasoning on why you selected this item. Put your answer for this step in xml tags titled <4> </4>.

    5- Return the subtopic of article.
    You will make your decision for this step based on the contents of the webpage, which is delimited in triple stars (***):
    ***
    Text = {text}
    ***
      
        <dic> dic = {{
        "home insurance basics": ["premiums", "deductible", "coverage types", "loss types", "specialty coverages"],
        "home insurance glossary": ["Coverage terms", "Losses + Claims terms", "Insurance industry terms"],
        "home safety": ["Theft", "Fire safety", "Earthquakes", "Flood", "Water damage", "Pests", "Storms", "Winter", "Guns"],
        "personal safety": ["Digital safety", "Theft"],
        "home buying": [
            "home inspection",
            "recreational properties",
            "fees",
            "mortgage",
            "first-time home buyers",
            "inheriting a home",
            "realtors",
            "log homes",
            "latent and patent defects Quebec",
            "General tips",
        ],
        "home selling": ["fees", "realtors"],
        "moving": [],
        "homeowners": [
            "vacant dwellings",
            "neighbors",
            "garden",
            "pool",
            "insurance",
            "legal",
            "Airbnb",
            "Seasons (Winter, Summer, etc.)",
            "Sustainable homes",
        ],
        "home improvement": ["home improvement", "home maintenance"],
        "getting to know your home": [
            "heating",
            "electrical panels",
            "water leak detection system",
            "backwater valves",
            "furnaces",
            "wiring",
            "basement",
            "driveway",
            "gutters",
            "pipes and plumbing",
            "drain systems",
            "floor",
            "attics",
            "toilet",
            "windows",
            "walls",
            "water heaters",
            "doors",
            "septic tanks",
            "refrigerator",
            "decks and patios",
        ],
        "condo owners": [
            "Co-ownership",
            "Common Property",
            "Basics",
        ],
        "interior design": [
            "Home office",
            "Bedroom",
            "Shelves",
            "Living room",
            "Feng Shui",
            "Holidays",
            "Laundry room",
            "Kitchen",
            "Floor",
            "Couches",
            "Mudroom",
            "Walls",
            "Yard, patio, balconies",
            "Bathroom",
        ],
        "landlords": ["Before finding tenants", "Repairs", "Insurance", "Leases"],
        "renters": [
            "Before finding a place",
            "Eviction",
            "Insurance",
            "Students",
            "Roommates",
            "Theft",
            "Security",
        ],
        "templates": ["Any article whose primary purpose is to provide a template to the reader has its Subtype5 categorized as templates"],
    }} </dic> \
    Based on the Topic of the article from step 4, *TEXT*, and the dic just given, you will find the subtopic of the article. The topics are the keys in the dic, and the possible subtopics for each topic are given as a list as the value of the key. \
    If you selected product page in the first step, return "ns". Print out your reasoning on why you selected this item. Put your answer for this step in xml tags titled <5> </5>.

    6- Return a summary output. Now you will return a summary of all outputs in the form of a python dictionary. Put your answer for this step in xml tags titled <6> </6>
    Here are two examples of what the final output will look like:
    <example1>
    URL: https://www.squareone.ca/home
    text: The content of the page reveals that the page is a product page selling home insurance.
    {{
        "Type 1": "product page",
        "Type 2": "home",
        "Type 3": "BC",
        "Type 4": "ns",
        "Subtopic": "ns"
    }}  
    </example1>

    <example2>
    URL: https://www.squareone.ca/resource-centres/home-improvement/home-maintenance-schedules
    text: The content of the page reveals that the page is a guide on preparing long-term and short-term home maintenance schedules.
    {{
        "Type 1": "article",
        "Type 2": "ns",
        "Type 3": "ns",
        "Type 4": "home improvement",
        "Subtopic": "home maintenance"
    }}    
    </example2>
    ```
"""