In [1]:
# imports and options

import pandas as pd
import numpy as np

import config
import openai
import re
import sys

from urllib.request import urlopen
from bs4 import BeautifulSoup

# Authenticate the OpenAI client with the key stored in the local `config` module
# (keeps the secret out of the notebook itself).
openai.api_key = config.OPENAI_key_sq1

# Never truncate cell contents when a DataFrame is displayed (e.g. long URLs).
pd.set_option('display.max_colwidth', None)

In [2]:
def ChatCompletion_content(
        messages = None,
        model = "gpt-3.5-turbo",
        temperature = 0,
):
    """Send a chat conversation to the OpenAI API and return the assistant's reply text.

    Args:
        messages: List of chat message dicts (``{"role": ..., "content": ...}``).
            Defaults to an empty conversation.
        model: Name of the chat model to query.
        temperature: Sampling temperature passed through to the API.

    Returns:
        The ``content`` field of the first choice's message in the API response.
    """
    # Build a fresh list per call instead of sharing one mutable default object.
    if messages is None:
        messages = []

    response = openai.ChatCompletion.create(
        model = model,
        temperature = temperature,
        messages = messages
    )

    return response.get("choices")[0].get("message").get("content")

In [3]:
# Load the sq1 (squareone.ca) sitemap URL table from the local CSV file.
df = pd.read_csv(r"C:\Users\Hooman Deghani\OneDrive\PC\Desktop\Input\gpt-3.5-turbo\xml_sitemap_urls.csv")

# Drop every column whose name contains "Unnamed". pandas gives the name
# "Unnamed: N" to header-less columns, e.g. a row index that was saved by
# `to_csv` without `index=False`.
unnamed = [column for column in df.columns if "Unnamed" in column]
df = df.drop(columns=unnamed)
# Bare expression: display the cleaned frame as the cell output.
df

# The commented-out statements below are not executed. NOTE(review): they look like
# one-off preparation steps for the raw sitemap export (drop unused sitemap columns,
# create the empty label columns, rename `loc` to `url`) -- confirm before deleting.
#df = df.drop(columns= ['Unnamed: 0', 'lastmod', 'image', 'changefreq', 'priority'])

#df.loc[:, ['home/auto', 'type', 'location', 'topic-product', 'topic-article', 'subtopic-article']] = ''

#df = df.rename(columns= {"loc": "url"})

Unnamed: 0,url,home/auto,type,location,topic-product,topic-article,subtopic-article
0,https://www.squareone.ca/,home,product,ns,other,ns,ns
1,https://www.squareone.ca/news/the-vancouver-sun-article,home,product,BC,ns,ns,ns
2,https://www.squareone.ca/apartment,home,product,ns,other,ns,ns
3,https://www.squareone.ca/residential,home,product,ns,other,ns,ns
4,https://www.squareone.ca/house/bc/quote,home,product,BC,other,ns,ns
...,...,...,...,...,...,...,...
1371,https://www.squareone.ca/about-us/team-members,home,product,ns,other,ns,ns
1372,https://www.squareone.ca/careers/licensed-insurance-agent-remote-bc,home,product,BC,ns,ns,ns
1373,https://www.squareone.ca/careers/remote-damage-insurance-agent,home,product,ns,ns,ns,ns
1374,https://www.squareone.ca/home-insurance-claims-suppliers,home,product,ns,other,ns,ns


In [4]:
# Fill the label columns programmatically from URL patterns.
#
# Every pattern passed to `str.contains` is a regular expression searched anywhere
# in the URL; each one lists the English path segment and its French counterpart.
# Article-section patterns end with "/" so they match pages *inside* a section but
# not the section's hub page itself (hub pages are labelled 'ns' at the end).

# --- home/auto: every row is labelled 'home' ---
df.loc[:, 'home/auto'] = 'home'

# --- type: resource-centre pages are articles, everything else is a product page ---
filt = df.loc[:, 'url'].str.contains("/resource-centres|/centres-ressources")
df.loc[filt, 'type'] = 'article'

df.loc[~filt, 'type'] = 'product'

# --- topic-product ---
# home and tenant product pages
product_home = df.loc[:, 'url'].str.contains("/home/|/fr/habitation")
product_tenant = df.loc[:, 'url'].str.contains("/tenant/|/fr/locataire")
df.loc[product_home, 'topic-product'] = 'home'
df.loc[product_tenant, 'topic-product'] = 'tenant'

# every other non-article page
df.loc[(~product_home) & (~product_tenant) & (~filt), 'topic-product'] = 'other'

# careers, news, support and all articles are 'ns'; this runs last on purpose
# so it overrides the labels assigned above
career = df.loc[:, 'url'].str.contains('/careers/|/fr/carrieres')
news = df.loc[:, 'url'].str.contains('/news/|/fr/actualites')
support = df.loc[:, 'url'].str.contains('/support/|/soutien/')

df.loc[(career) | (news) | (support) | (filt), 'topic-product'] = 'ns'

# --- topic-article ---
# home insurance basics
basics = df.loc[:, 'url'].str.contains('/insurance-basics/|/bases-assurance/')
df.loc[basics, 'topic-article'] = 'home-insurance-basics'

# insurance glossary
glossary = df.loc[:, 'url'].str.contains('/insurance-glossary/|/lexique-assurance/')
df.loc[glossary, 'topic-article'] = 'insurance-glossary'

# home & personal safety
# (fix: the French segment was missing its trailing "/", so the French hub page
# was labelled as an article topic while the English hub page was not)
safety = df.loc[:, 'url'].str.contains('/home-personal-safety/|/securite-domiciliaire-personnelle/')
df.loc[safety, 'topic-article'] = 'home-personal-safety'

# home buying, selling, moving
moving = df.loc[:, 'url'].str.contains('/home-buying-selling-moving/|/achat-vente-demenagement-habitation/')
df.loc[moving, 'topic-article'] = 'home-buying-selling-moving'

# home improvement
improvement = df.loc[:, 'url'].str.contains('/home-improvement/|/ameliorations-domiciliaires/')
df.loc[improvement, 'topic-article'] = 'home-improvement'

# getting to know your home
know = df.loc[:, 'url'].str.contains('/getting-to-know-your-home/|/connaitre-votre-habitation/')
df.loc[know, 'topic-article'] = 'getting-to-know-your-home'

# homeowners -- restricted to articles because the segment could also occur in product URLs
# (fix: trailing "/" added to the French segment, consistent with the other sections)
homeowners = df.loc[:, 'url'].str.contains('/homeowner/|/proprietaire-occupant/')
df.loc[(filt) & homeowners, 'topic-article'] = 'homeowner'

# condo owner
condo_owner = df.loc[:, 'url'].str.contains('/condo-owner/|/coproprietaire/')
df.loc[condo_owner, 'topic-article'] = 'condo-owner'

# interior design
interior_design = df.loc[:, 'url'].str.contains('/interior-design/|/decoration-interieure/')
df.loc[interior_design, 'topic-article'] = 'interior-design'

# landlord -- restricted to articles
landlord = df.loc[:, 'url'].str.contains('/landlord/|/proprietaire-bailleur/')
df.loc[(filt) & landlord, 'topic-article'] = 'landlord'

# renter -- restricted to articles ("/locataire/" also appears in tenant product URLs)
renter = df.loc[:, 'url'].str.contains('/renter/|/locataire/')
df.loc[(filt) & renter, 'topic-article'] = 'renter'

# template
template = df.loc[:, 'url'].str.contains('/template/|/modele/')
df.loc[template, 'topic-article'] = 'template'

# non-articles never have an article topic
df.loc[(~filt), 'topic-article'] = 'ns'

# hub pages: article rows that matched no section pattern above.
# Empty CSV cells are read back as NaN, not "", so both forms must be checked;
# comparing against "" alone leaves freshly loaded hub rows unlabelled.
hubs = df.loc[:, 'topic-article'].isna() | (df.loc[:, 'topic-article'] == "")
df.loc[hubs, 'topic-article'] = "ns"

# TODO: Manually fix other section to ns (news, press releases, etc.)
# NOTE: labels already stored in the CSV are not reset by this cell, so rows that were
# labelled by an earlier, looser pattern keep their old value until corrected.

In [5]:
def extract_text(url, ignore_ids=None):
    """Download a page and return its text with site boilerplate elements removed.

    Args:
        url: Address of the page to fetch.
        ignore_ids: Optional iterable of element ``id`` values; every element
            carrying one of these ids is removed before the text is extracted.

    Returns:
        The remaining page text as a single string: each line is stripped,
        split on double spaces, and empty chunks are dropped; the chunks are
        joined with newlines.
    """
    # Close the HTTP response deterministically instead of leaking the connection.
    with urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, features="html.parser")

    # Remove every element whose id was listed by the caller.
    for ignore_id in ignore_ids or ():
        for element in soup.find_all(id=ignore_id):
            element.decompose()

    # Remove the <head> section.
    for head_element in soup.find_all('head'):
        head_element.decompose()

    # Remove every element whose class contains "jumbotron".
    for jumbotron_section in soup.find_all(class_=re.compile(".*jumbotron.*")):
        jumbotron_section.decompose()

    # Remove sections with the class "col-12 col-lg-6 offset-lg-1".
    for unwanted_section in soup.find_all(class_="col-12 col-lg-6 offset-lg-1"):
        unwanted_section.decompose()

    # Remove rows with the class "row mb-5".
    for unwanted_row in soup.find_all(class_="row mb-5"):
        unwanted_row.decompose()

    # Remove the remaining known boilerplate sections, identified by class.
    for bs_section in soup.find_all(class_=["lead px-lg-5 text-white", "col-12 col-md-8 offset-md-2 px-lg-5 pb-5 pb-lg-0", "policies bg-shape overflow-hidden", "footer-cta bg-shape mt-6", "col-12 text-white bg-darker py-5"]):
        bs_section.decompose()

    # Remove all script and style elements so their source is not returned as text.
    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text()

    # Strip leading and trailing space on each line.
    lines = (line.strip() for line in text.splitlines())
    # Break multi-headlines (separated by a double space) into one chunk each.
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # Drop blank chunks and put each remaining chunk on its own line.
    text = '\n'.join(chunk for chunk in chunks if chunk)

    return text

# Block of site boilerplate (quote call-to-action, language switcher, contact hours,
# footer links and the legal notice). The notebook removes it verbatim from the
# extracted page text with `str.replace`, so it must match the page text exactly.
removal = """Get a free quote
Get a personalized online home insurance quote in just 5 minutes and see how much money you can save by switching to Square One.
Get an online quote now
Protect your family
Even when you take precautions, accidents can happen. Home insurance is one way to protect your family against financial losses from accidents. And, home insurance can start from as little as $12/month.
Learn more
English
Français
Call us
MON - SAT
5:00 AM - 6:00 PM
1.855.331.6933
Available
Company
About us
Why us
Reviews
Careers
Contact us
Resources
Helpful articles
Common questions
News + media
Report a claim
Service concerns
Legal
Terms of use
Privacy policy
Transparency
Licenses + underwriters
Site map
Insurance is sold by Square One Insurance Services (1410-650 W Georgia St, Vancouver, BC V6B 4N8). Home insurance is underwritten by The Mutual Fire Insurance Company of British Columbia. Legal protection insurance (not sold in Quebec) is underwritten by HDI Global Specialty SE."""




In [6]:
# Maps each 'topic-article' label to the list of candidate subtopics offered to the
# model when classifying an article. Topics without an entry (e.g. 'ns') yield None
# from `dic.get(...)`.
# Each subtopic is listed once: the repeated "Theft" (home-personal-safety) and the
# repeated "fees" / "realtors" (home-buying-selling-moving) entries were removed so the
# choice list in the prompt contains no duplicates.
dic  = {
        "home-insurance-basics": ["premiums", "deductible", "coverage types", "loss types", "specialty coverages"],
        "insurance-glossary": ["Coverage terms", "Losses + Claims terms", "Insurance industry terms"],
        "home-personal-safety": ["Theft", "Fire safety", "Earthquakes", "Flood", "Water damage", "Pests", "Storms", "Winter", "Guns", "Digital safety"],
        "home-buying-selling-moving": ["home inspection", "recreational properties", "fees", "mortgage", "first-time home buyers", "inheriting a home", "realtors", "log homes", "latent and patent defects Quebec", "General tips"],
        "homeowner": [
            "vacant dwellings",
            "neighbors",
            "garden",
            "pool",
            "insurance",
            "legal",
            "Airbnb",
            "Seasons (Winter, Summer, etc.)",
            "Sustainable homes",
        ],
        "home-improvement": ["home improvement", "home maintenance"],
        "getting-to-know-your-home": [
            "heating",
            "electrical panels",
            "water leak detection system",
            "backwater valves",
            "furnaces",
            "wiring",
            "basement",
            "driveway",
            "gutters",
            "pipes and plumbing",
            "drain systems",
            "floor",
            "attics",
            "toilet",
            "windows",
            "walls",
            "water heaters",
            "doors",
            "septic tanks",
            "refrigerator",
            "decks and patios",
        ],
        "condo-owner": [
            "Co-ownership",
            "Common Property",
            "Basics",
        ],
        "interior-design": [
            "Home office",
            "Bedroom",
            "Shelves",
            "Living room",
            "Feng Shui",
            "Holidays",
            "Laundry room",
            "Kitchen",
            "Floor",
            "Couches",
            "Mudroom",
            "Walls",
            "Yard, patio, balconies",
            "Bathroom",
        ],
        "landlord": ["Before finding tenants", "Repairs", "Insurance", "Leases"],
        "renter": [
            "Before finding a place",
            "Eviction",
            "Insurance",
            "Students",
            "Roommates",
            "Theft",
            "Security",
        ],
}


In [13]:
# Second block of site boilerplate (ratings blurb, feature descriptions, claims
# section, quote call-to-action and footer). The notebook removes it verbatim from
# the extracted page text with `str.replace`, so it must match the page text exactly.
removal_2 = """4.7 out of 5. That’s how customers rate Square One Insurance Services after writing 44,951 reviews. And with an A+ rating from the Better Business Bureau, people clearly love the way we’re changing home insurance.
Digital platformWith Square One, you can get a quote, buy a policy and even submit a claim – all online, from the comfort of your own home. Quotes and policies are managed through your online account, but if you need help, expert assistance from one of our licensed insurance agents is just a webchat or phone call away.Superior protectionAll policies from Square One represent the highest level of protection currently available in Canada. We make it easy for you to protect your home against common perils (such as water damage and theft) and we even cover the costs of rebuilding your home to meet the latest bylaw or code changes.Guaranteed building replacement coverageGet the peace of mind you deserve. Simply insure your home to Square One’s suggested limit and, if you experience a loss, we’ll rebuild your home, even if the cost of doing so exceeds your limit of insurance.Personalized coverageAll policies protect most common personal property items including laptops, smartphones and furniture. If you own specialty property (such as bicycles or fine arts) you can also add coverage to your policy. That way, you only pay to insure what you actually own.Need to make a claim?Making a claim with Square One is easy. In fact, most of the process can be done online. You’ll get your own dedicated adjuster to work with you from start to finish to make sure you’re back on your feet in no time. And, Square One pays out on over 90% of all claims submitted.Interest-free monthly paymentsWe’re confident you’ll love our service and stay with us. That’s why we don’t lock you into a contract. With us, you can pay monthly or annually and there’s no fixed-term contract. You can also change your payment preferences through your online account.
It only takes 5 minutes
ready for an online quote? Your time matters, and so does your stuff. Get a personalized home insurance quote in 5 minutes. That’s less time than it takes to wait in line for coffee.
Get an online quote now
What you really want to know
How are claims handled?
The insurance industry doesn’t have the greatest reputation when it comes to claims. Many people fear their insurer will go to any length possible to avoid paying claims. At Square One, we’re different. We’re there when you need us most.
How different? Legal jargon can be confusing, so we’ve created a transparency page to translate the legal language of your policy into plain English.
We’re committed to changing how you think of insurance. We’ll make sure your claim is handled quickly and fairly.
See Our Transparency Page
The right protection from day one
The biggest problem with making a claim is not having the right protection. With a policy that’s tailored to your needs, you get the best protection possible.
Two-hour emergency response
We guarantee a fast response when you need it most. Not an emergency? No problem. Your adjuster will still be in touch within one business day. it’s your home, and your stuff, so we keep you in the loop.
One point of contact
A single dedicated adjuster works with you from start to finish. Without the need to repeat the details of your claim, you’ll be back on your feet in no time.
Thousands choose us every month
Recently purchased policies
you’re in good company
Join over 2,000,000 customers who received home insurance quotes from us
Get a quote
Find out how affordable personalized home insurance can be in 5 minutes with an online quote.

English
Français
Call us
MON - SAT
5:00 AM - 6:00 PM
1.855.331.6933
Available
Company
About us
Why us
Reviews
Careers
Contact us
Resources
Helpful articles
Common questions
News + media
Report a claim
Service concerns
Legal
Terms of use
Privacy policy
Transparency
Licenses + underwriters
Site map
Insurance is sold by Square One Insurance Services (1410-650 W Georgia St, Vancouver, BC V6B 4N8). Home insurance is underwritten by The Mutual Fire Insurance Company of British Columbia. Legal protection insurance (not sold in Quebec) is underwritten by HDI Global Specialty SE."""

# Further boilerplate snippets removed verbatim from the extracted page text.

# Menu label and quote banner text.
removal_3 = """menu
Modern Home insurance
Get an online home insurance quote in 5 minutes.
Your time matters, and so does your stuff. Get a personalized home insurance quote in 5 minutes. That’s less time than it takes to wait in line for coffee.
Complete protection for your home and personal property"""

# Product names and section headings.
removal_4 = """Condo insurance
Homeowners insurance
Tenant insurance
Different than other providers
home insurance customers love and trust"""

# Legal notice on its own; the same sentence also ends `removal` and `removal_2`,
# so this catches it when the surrounding block did not match as a whole.
removal_5 = """Insurance is sold by Square One Insurance Services (1410-650 W Georgia St, Vancouver, BC V6B 4N8). Home insurance is underwritten by The Mutual Fire Insurance Company of British Columbia. Legal protection insurance (not sold in Quebec) is underwritten by HDI Global Specialty SE."""

In [22]:
# Inspect the rows labelled 1100 through 1120 (label slices include both ends), all columns.
df.loc[1100:1120, :]

Unnamed: 0,url,home/auto,type,location,topic-product,topic-article,subtopic-article
1100,https://www.squareone.ca/fr/centres-ressources/locataire/cession-bail-quebec,home,article,Quebec,ns,renter,
1101,https://www.squareone.ca/fr/centres-ressources/locataire/souscrire-assurance-locataire,home,article,ns,ns,renter,
1102,https://www.squareone.ca/fr/a-propos-de-nous,home,product,ns,other,ns,ns
1103,https://www.squareone.ca/fr/a-propos-de-nous/experts/daniel-mirkovic,home,product,ns,other,ns,ns
1104,https://www.squareone.ca/resource-centres/home-personal-safety/securing-your-identity,home,article,ns,ns,home-personal-safety,
1105,https://www.squareone.ca/resource-centres/getting-to-know-your-home,home,article,ns,ns,ns,
1106,https://www.squareone.ca/resource-centres/getting-to-know-your-home/guide-copper-pipes-plumbing,home,article,ns,ns,getting-to-know-your-home,
1107,https://www.squareone.ca/resource-centres/getting-to-know-your-home/decks-and-patios,home,article,ns,ns,getting-to-know-your-home,
1108,https://www.squareone.ca/resource-centres/getting-to-know-your-home/aluminum-wiring,home,article,ns,ns,getting-to-know-your-home,
1109,https://www.squareone.ca/resource-centres/getting-to-know-your-home/bidets,home,article,ns,ns,getting-to-know-your-home,


In [24]:
# Fill subtopic-article with gpt-3.5
# NOTE: this cell only builds the prompt and prints the cleaned page text;
# no request is sent to the model here.

# Upper bound on the number of characters of page text placed in the prompt.
MAX_TEXT_CHARS = 13000

# Boilerplate fragments stripped from every page, in this exact order: the large
# blocks go first because they contain some of the shorter fragments (for example
# `removal` ends with the legal notice stored in `removal_5`).
BOILERPLATE = (
    removal,
    "Serving British Columbia, Alberta, Saskatchewan, Manitoba, Ontario and Quebec.",
    removal_2,
    removal_3,
    removal_4,
    removal_5,
    "(Vancouver, BC): ",
    "in North America",
)

for i in range(1100, 1101):
    # if type != article then continue
    if df.loc[i, 'type'] != 'article':
        continue

    # grab the subtopic list
    subtopics = dic.get(df.loc[i, 'topic-article'])
    print(df.loc[i, 'topic-article'], ": ", subtopics)

    # Topics without a subtopic list (e.g. 'ns' hub pages) leave nothing to choose
    # from, so skip them before paying for the page download.
    if subtopics is None:
        continue

    # grab url and clean text
    url = df.loc[i, 'url']

    text = extract_text(url, ["populate_recent_reviews", "mainnav"])
    # str.replace is a no-op when the fragment is absent, so no membership test is needed.
    for fragment in BOILERPLATE:
        text = text.replace(fragment, "")

    # Keep as much text as the limit allows. (Repeatedly halving the text, as was done
    # before, could discard up to half of the permitted characters.)
    text = text[:MAX_TEXT_CHARS]

    # Prompt
    prompt = f"""
    Choose a topic for the text delimited in triple backticks (```) from the list delimited in triple hashtags (###):
    ```{text}```
    ###{subtopics}###
    """

    print(text)


renter :  ['Before finding a place', 'Eviction', 'Insurance', 'Students', 'Roommates', 'Theft', 'Security']
menu
Initiation à la cession de bail et à la sous-location au Québec
Mis à jour le 6 avril 2023
Vous venez de décrocher un travail de rêve dans une autre ville ou le condo que vous convoitez est finalement en vente : super! Vous avez un contrat de bail d’un an, vous dites?
Avant de refuser l’offre d’emploi ou d’abandonner le condo de vos rêves, respirez à fond. Dans cet article, nous expliquerons comment vous pouvez transférer (céder ou sous-louer) votre bail si vous avez besoin de déménager de votre habitation locative avant la fin du contrat de bail.
Nous aborderons les notions de cession de bail et de sous-location, ainsi que les répercussions en ce qui concerne votre assurance habitation. Lisez la suite pour découvrir comment briser votre bail… sans tout casser!
Thèmes
Qu’est-ce qu’un contrat de cession de bail?
Qu’est-ce que la sous-location?
Exceptions
Comment céder un cont

In [None]:
# Prompt draft

In [9]:
# # Fill Location via gpt-3.5

# # range(0, 1376)
# for i in range(1330, 1340):
#     url = df.loc[i, 'url']

#     text = extract_text(url, ["populate_recent_reviews", "mainnav"])
#     text = text.replace(removal, "")

#     if "Serving British Columbia, Alberta, Saskatchewan, Manitoba, Ontario and Quebec." in text:
#         text = text.replace("Serving British Columbia, Alberta, Saskatchewan, Manitoba, Ontario and Quebec.", "")

#     if removal_2 in text:
#         text = text.replace(removal_2, "")

#     if removal_3 in text:
#         text = text.replace(removal_3, "")

#     if removal_4 in text:
#         text = text.replace(removal_4, "")

#     if removal_5 in text:
#         text = text.replace(removal_5, "")

#     if "(Vancouver, BC): " in text:
#         text = text.replace("(Vancouver, BC): ", "")

#     if "in North America" in text:
#         text = text.replace("in North America", "")

#     while len(text) > 13000:
#         text = text[:len(text)//2]

#     prompt = f"""
#     Read the text delimited in triple backticks (```) and follow the steps that proceed it.\
#     ```
#     {text}
#     ```
#     1- Does the text mention a province or a possible city/town/village/community/district in Canada? If so, proceed to step 2. Otherwise skip to step 3. Print out your answer for this step in full in xml tags titled \
#     <step1></step1>
#     2- Based on your knowledge, which province is this city/town/village/community/district located in? Print out your answer in full in xml tags titled <step2></step2>.
#     3- If you concluded that the text is mentioning a city/town/village/community/district located in a Canadian province, return that province. Otherwise, return "ns". Delimit your answer in xml tags titled <province></province>.
#     """
#     response = ChatCompletion_content(
#         messages = [
#             {"role": "system", "content": "You are a helpful assistant."},
#             {"role": "user", "content": f"{prompt}.\n"}
#         ]
#     )
#     print(response)

    
    
#     result = re.search(r"<province>(.*?)</province>", response)
#     print(result.group(1), "\n")

#     df.loc[i, 'location'] = result.group(1)

In [10]:
# Sanity check: how many rows have a missing (True) versus filled (False) location.
df['location'].isna().value_counts()

False    1376
Name: location, dtype: int64

In [11]:
# Save to csv file.
# index=False keeps the row index out of the file; otherwise it is written as a
# header-less first column that comes back as "Unnamed: 0" on the next read_csv.
df.to_csv(r"C:\Users\Hooman Deghani\OneDrive\PC\Desktop\Input\gpt-3.5-turbo\xml_sitemap_urls.csv", index=False)