In [1]:
import pandas as pd
import json
import uuid

In [2]:
def load_jsonl_safely(filepath):
    """Read a JSON Lines file into a DataFrame, skipping unusable lines.

    Blank lines are ignored. A line is skipped (and reported on stdout) when
    it is not valid JSON, or when it decodes to something other than a JSON
    object, because only objects map cleanly onto DataFrame rows.

    Args:
        filepath: Path of the JSON Lines file to read.

    Returns:
        pandas.DataFrame with one row per valid JSON object, in file order.
    """
    records = []
    bad_lines = 0

    # "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading BOM,
    # which str.strip() does not remove and which would otherwise make the
    # first record fail to parse.
    with open(filepath, "r", encoding="utf-8-sig") as f:
        for line_no, raw in enumerate(f, 1):
            line = raw.strip()
            if not line:
                continue  # Skip empty lines
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                print(f"❌ Skipping invalid JSON on line {line_no}: {line[:100]}...")
                bad_lines += 1
                continue
            if not isinstance(obj, dict):
                # Valid JSON, but an array/scalar cannot become a row.
                print(f"❌ Skipping non-object JSON on line {line_no}: {line[:100]}...")
                bad_lines += 1
                continue
            records.append(obj)

    print(f"✅ Loaded {len(records)} valid records. Skipped {bad_lines} bad lines.")
    return pd.DataFrame(records)

# Load the JSONL file (the path is relative to the current working directory)
df = load_jsonl_safely("Content/all_canada_rag_content.jsonl")

# Quick sanity check: print the first rows and the (rows, columns) shape
print(df.head())
print(f"Shape: {df.shape}")

✅ Loaded 103552 valid records. Skipped 0 bad lines.
                     id                                                url  \
0  Alberta_1_1757152647  https://www.alberta.ca/careers-fisheries-manag...   
1  Alberta_2_1757152647  https://www.alberta.ca/carrier-profiles-and-mo...   
2  Alberta_3_1757152647     https://www.alberta.ca/careers-land-management   
3  Alberta_4_1757152648  https://www.alberta.ca/careers-wildlife-manage...   
4  Alberta_5_1757152648            https://www.alberta.ca/careers-agrology   

                                          title  \
0   Careers – Fisheries management | Alberta.ca   
1  Carrier profiles and monitoring | Alberta.ca   
2        Careers – Land management | Alberta.ca   
3    Careers – Wildlife management | Alberta.ca   
4               Careers – Agrology | Alberta.ca   

                                         description  \
0  A career in fisheries management offers many o...   
1  Supports carriers' internal monitoring of on-r...   
2  T

In [30]:
# Spot-check three randomly chosen rows (no seed, so the rows differ per run)
df.sample(3)

Unnamed: 0,id,url,title,description,content,province,timestamp,content_length,language,source,document_type
68973,Ontario_1329_1757168643,https://www.ontario.ca/document/policy-directi...,Government of Ontario,,We’re receiving a higher volume of requests fr...,Ontario,1757169000.0,611,fr,canada_gov,html
13325,British_Columbia_7808_1757142579,https://www2.gov.bc.ca/gov/content/industry/ra...,Wildfire recovery - Province of British Columbia,"Wildfires have damaged range lands, recovery i...",Wildfire recovery efforts in the Cariboo Regio...,British_Columbia,1757143000.0,832,fr,canada_gov,html
34316,Canada_281_1758455948,http://canada.ca/en/index.html,Home - Canada.ca,,Canada.ca\n\nThe official website of the Gover...,Canada,1758456000.0,1920,en,canada_gov,


In [31]:
# Summarize column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103552 entries, 0 to 103551
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              103552 non-null  object 
 1   url             103552 non-null  object 
 2   title           102903 non-null  object 
 3   description     71669 non-null   object 
 4   content         103552 non-null  object 
 5   province        103552 non-null  object 
 6   timestamp       103552 non-null  float64
 7   content_length  103552 non-null  int64  
 8   language        103552 non-null  object 
 9   source          103552 non-null  object 
 10  document_type   88703 non-null   object 
dtypes: float64(1), int64(1), object(9)
memory usage: 8.7+ MB


In [38]:
# Keep only the first row encountered for each URL.
df1 = df[~df.duplicated(subset=['url'], keep='first')]

In [39]:
# Select the rows whose id contains "Canada" (case-insensitive match).
canada_df = df1.loc[df1['id'].str.contains('Canada', case=False)]

In [40]:
# Renumber the rows from 0; the previous index labels move into an "index" column.
canada_df = canada_df.reset_index()

In [41]:
# Display the subset after the index reset (note the extra "index" column)
canada_df

Unnamed: 0,index,id,url,title,description,content,province,timestamp,content_length,language,source,document_type
0,15436,Canada_5_1758454485,https://www.canada.ca/aboutgov-ausujetgouv/dep...,Departments and agencies - Canada.ca,,Departments and agencies\n\nEnter a department...,Canada,1.758454e+09,8452,en,canada_gov,html
1,15437,Canada_1_1758454485,https://www.canada.ca/,Canada.ca,,Canada.ca\n/ Gouvernement du Canada\nGovernmen...,Canada,1.758454e+09,160,en,canada_gov,html
2,15438,Canada_2_1758454485,http://canada.ca/,Canada.ca,,Canada.ca\n/ Gouvernement du Canada\nGovernmen...,Canada,1.758454e+09,160,en,canada_gov,html
3,15439,Canada_3_1758454485,https://www.canada.ca/?ref=university.heavnn.io,Canada.ca,,Canada.ca\n/ Gouvernement du Canada\nGovernmen...,Canada,1.758454e+09,160,en,canada_gov,html
4,15440,Canada_9_1758454487,https://canada.ca/accueil.html,Accueil - Canada.ca,,Canada.ca\n\nLe site officiel du gouvernement ...,Canada,1.758454e+09,2294,en,canada_gov,html
...,...,...,...,...,...,...,...,...,...,...,...,...
25866,43896,Canada_8171_1758479609,https://www.canada.ca/fr/ministere-defense-nat...,Manuel sur la politique des congés des Forces ...,,Manuel de la politique des congés des Forces c...,Canada,1.758480e+09,298,fr,canada_gov,
25867,43897,Canada_8173_1758479609,https://ncc-ccn.gc.ca/podcast,Capital Stories: An NCC podcast series | Natio...,,You are here: Home Capital Stories: An NCC pod...,Canada,1.758480e+09,3616,en,canada_gov,
25868,43898,Canada_8180_1758479610,https://www.guichetemplois.gc.ca/analyse-tenda...,Explorer le marché du travail au Canada - Guic...,,Explorer le marché du travail au Canada\n\nAcc...,Canada,1.758480e+09,1323,en,canada_gov,
25869,43900,Canada_8181_1758479616,https://tc.canada.ca/en/aviation/reference-cen...,Advisory Circular (AC) No. 302-015,,Advisory Circular (AC) No. 302-015\n\nFrom: Tr...,Canada,1.758480e+09,12621,en,canada_gov,


In [42]:
# Discard the "index" column that reset_index added.
canada_df = canada_df.drop(columns=["index"])

In [43]:
# Display the subset again to confirm the "index" column is gone
canada_df

Unnamed: 0,id,url,title,description,content,province,timestamp,content_length,language,source,document_type
0,Canada_5_1758454485,https://www.canada.ca/aboutgov-ausujetgouv/dep...,Departments and agencies - Canada.ca,,Departments and agencies\n\nEnter a department...,Canada,1.758454e+09,8452,en,canada_gov,html
1,Canada_1_1758454485,https://www.canada.ca/,Canada.ca,,Canada.ca\n/ Gouvernement du Canada\nGovernmen...,Canada,1.758454e+09,160,en,canada_gov,html
2,Canada_2_1758454485,http://canada.ca/,Canada.ca,,Canada.ca\n/ Gouvernement du Canada\nGovernmen...,Canada,1.758454e+09,160,en,canada_gov,html
3,Canada_3_1758454485,https://www.canada.ca/?ref=university.heavnn.io,Canada.ca,,Canada.ca\n/ Gouvernement du Canada\nGovernmen...,Canada,1.758454e+09,160,en,canada_gov,html
4,Canada_9_1758454487,https://canada.ca/accueil.html,Accueil - Canada.ca,,Canada.ca\n\nLe site officiel du gouvernement ...,Canada,1.758454e+09,2294,en,canada_gov,html
...,...,...,...,...,...,...,...,...,...,...,...
25866,Canada_8171_1758479609,https://www.canada.ca/fr/ministere-defense-nat...,Manuel sur la politique des congés des Forces ...,,Manuel de la politique des congés des Forces c...,Canada,1.758480e+09,298,fr,canada_gov,
25867,Canada_8173_1758479609,https://ncc-ccn.gc.ca/podcast,Capital Stories: An NCC podcast series | Natio...,,You are here: Home Capital Stories: An NCC pod...,Canada,1.758480e+09,3616,en,canada_gov,
25868,Canada_8180_1758479610,https://www.guichetemplois.gc.ca/analyse-tenda...,Explorer le marché du travail au Canada - Guic...,,Explorer le marché du travail au Canada\n\nAcc...,Canada,1.758480e+09,1323,en,canada_gov,
25869,Canada_8181_1758479616,https://tc.canada.ca/en/aviation/reference-cen...,Advisory Circular (AC) No. 302-015,,Advisory Circular (AC) No. 302-015\n\nFrom: Tr...,Canada,1.758480e+09,12621,en,canada_gov,


In [44]:
def id_fixed(id, frame=None):
    """Return `id` with its second "_"-separated field replaced by a row index.

    The id is looked up in the "id" column of `frame`; the index label of the
    first matching row becomes the new second field, e.g.
    "Canada_8173_1758479609" found at index 25867 -> "Canada_25867_1758479609".

    NOTE(review): ids carrying an extra token before the counter (the
    "Quebec_pdf_408_..." shape that appears elsewhere in this data) would have
    that token overwritten instead of the counter -- confirm no such ids are
    passed in.

    Args:
        id: Identifier to rewrite; coerced with ``str()`` before use.
        frame: DataFrame with an "id" column to search. Defaults to the
            notebook-level ``canada_df`` so existing one-argument calls keep
            working.

    Returns:
        The rewritten identifier string.

    Raises:
        IndexError: If `id` is not present in the frame, or has no second
            field to replace.
    """
    # Resolve the global lazily so the function can also be used (and tested)
    # against any other frame.
    frame = canada_df if frame is None else frame
    id_text = str(id)

    matches = frame.index[frame["id"] == id_text]
    if len(matches) == 0:
        raise IndexError(f"id {id_text!r} not found in the 'id' column")

    parts = id_text.split("_")
    if len(parts) < 2:
        raise IndexError(f"id {id_text!r} has no counter field to replace")

    # First match wins, mirroring the original `.tolist()[0]` behaviour.
    parts[1] = str(matches[0])
    return "_".join(parts)

# Try the helper on one known id; the middle field should become that row's index
id_fixed("Canada_8173_1758479609")

'Canada_25867_1758479609'

In [45]:
# Replace the second "_"-separated field of every id with the row's own index
# label, in a single pass over the column. Looking each id up by value (one
# full-column scan per row) is quadratic in the number of rows and would hand
# the same number to any rows that happen to share an id; pairing each id with
# its own index avoids both problems and gives the same result when ids are
# unique. `assign` returns a new frame instead of mutating the existing one.
canada_df = canada_df.assign(
    id=[
        "_".join([prefix, str(position), *rest])
        for position, (prefix, _old_counter, *rest) in zip(
            canada_df.index, canada_df["id"].str.split("_")
        )
    ]
)

In [46]:
# Display the subset to check that the id counters now follow the row index
canada_df

Unnamed: 0,id,url,title,description,content,province,timestamp,content_length,language,source,document_type
0,Canada_0_1758454485,https://www.canada.ca/aboutgov-ausujetgouv/dep...,Departments and agencies - Canada.ca,,Departments and agencies\n\nEnter a department...,Canada,1.758454e+09,8452,en,canada_gov,html
1,Canada_1_1758454485,https://www.canada.ca/,Canada.ca,,Canada.ca\n/ Gouvernement du Canada\nGovernmen...,Canada,1.758454e+09,160,en,canada_gov,html
2,Canada_2_1758454485,http://canada.ca/,Canada.ca,,Canada.ca\n/ Gouvernement du Canada\nGovernmen...,Canada,1.758454e+09,160,en,canada_gov,html
3,Canada_3_1758454485,https://www.canada.ca/?ref=university.heavnn.io,Canada.ca,,Canada.ca\n/ Gouvernement du Canada\nGovernmen...,Canada,1.758454e+09,160,en,canada_gov,html
4,Canada_4_1758454487,https://canada.ca/accueil.html,Accueil - Canada.ca,,Canada.ca\n\nLe site officiel du gouvernement ...,Canada,1.758454e+09,2294,en,canada_gov,html
...,...,...,...,...,...,...,...,...,...,...,...
25866,Canada_25866_1758479609,https://www.canada.ca/fr/ministere-defense-nat...,Manuel sur la politique des congés des Forces ...,,Manuel de la politique des congés des Forces c...,Canada,1.758480e+09,298,fr,canada_gov,
25867,Canada_25867_1758479609,https://ncc-ccn.gc.ca/podcast,Capital Stories: An NCC podcast series | Natio...,,You are here: Home Capital Stories: An NCC pod...,Canada,1.758480e+09,3616,en,canada_gov,
25868,Canada_25868_1758479610,https://www.guichetemplois.gc.ca/analyse-tenda...,Explorer le marché du travail au Canada - Guic...,,Explorer le marché du travail au Canada\n\nAcc...,Canada,1.758480e+09,1323,en,canada_gov,
25869,Canada_25869_1758479616,https://tc.canada.ca/en/aviation/reference-cen...,Advisory Circular (AC) No. 302-015,,Advisory Circular (AC) No. 302-015\n\nFrom: Tr...,Canada,1.758480e+09,12621,en,canada_gov,


In [47]:
# Complement of the "Canada" selection: rows whose id does not contain "Canada".
df_without_canada = df1.loc[~df1['id'].str.contains('Canada', case=False)]

In [48]:
# Stack the non-Canada rows on top of the renumbered Canada rows and give the
# result a fresh 0..n-1 index.
df_combined = pd.concat(
    (df_without_canada, canada_df),
    axis=0,
    ignore_index=True,
)

In [49]:
# Spot-check five random rows of the combined frame
df_combined.sample(5)

Unnamed: 0,id,url,title,description,content,province,timestamp,content_length,language,source,document_type
11438,British_Columbia_5897_1757140937,https://www2.gov.bc.ca/gov/content/health/prac...,8.7 Capitation Fees for Plan B (Long-term Care...,8.7 Capitation Fees for Plan B (Residential Care),* \n\ncovers B.C. residents who are permanen...,British_Columbia,1757141000.0,4616,fr,canada_gov,html
81709,Canada_8066_1758463670,https://www.canada.ca/en/immigration-refugees-...,Notice – Adoptions from Pakistan - Canada.ca,,Notice – Adoptions from Pakistan\n\nThe provin...,Canada,1758464000.0,954,en,canada_gov,html
94987,Canada_21344_1758461931,https://www.diigo.com/post?title=Find+a+job+-+...,Sign in to Diigo | Diigo,,Sign in diigo\nSign in\nForgot password?\n\nSi...,Canada,1758462000.0,172,en,canada_gov,
40909,Ontario_1768_1757168915,https://www.ontario.ca/foodland/recipes/strawb...,Government of Ontario,,We’re receiving a higher volume of requests fr...,Ontario,1757169000.0,611,fr,canada_gov,html
66070,Quebec_9207_1758390865,https://www.quebec.ca/tourisme-et-loisirs/aide...,Programme Équipe Québec | Gouvernement du Québec,Aide financière offerte aux athlètes qui parti...,"1. [Aides financières en sport, loisir, acti...",Quebec,1758391000.0,1491,fr,canada_gov,html


In [50]:
# Rows with no recorded document type are labelled 'html'.
df_combined = df_combined.assign(
    document_type=df_combined['document_type'].fillna('html')
)

In [51]:
# Keep the first row per URL in the combined frame.
df_combined = df_combined.drop_duplicates(subset=['url'])

In [52]:
from langdetect import detect, LangDetectException

def detect_language(text: str, fallback="en") -> str:
    """Classify `text` as French ("fr") or English ("en").

    Only the first 500 characters of the stripped text are passed to
    langdetect. Any detected language other than French is reported as "en".

    NOTE(review): langdetect documents its algorithm as non-deterministic
    unless ``DetectorFactory.seed`` is set -- consider seeding it once next to
    the import so re-runs label borderline texts identically.

    Args:
        text: Text to classify. Non-string values (e.g. None/NaN) are accepted
            and yield `fallback`.
        fallback: Value returned when the language cannot be determined.

    Returns:
        "fr", "en", or `fallback`.
    """
    # Missing values arrive as None/NaN from pandas; handle them explicitly
    # rather than relying on an exception being swallowed.
    if not isinstance(text, str):
        return fallback

    snippet = text.strip()[:500]
    if not snippet:
        return fallback

    try:
        lang = detect(snippet)
    except LangDetectException:
        # Raised by langdetect when the text has no usable features
        # (e.g. only digits or punctuation).
        return fallback
    return "fr" if lang == "fr" else "en"

In [53]:
# Recompute the language label of every row from its content text.
df_combined = df_combined.assign(
    language=df_combined["content"].map(detect_language)
)

In [54]:
# Spot-check five random rows after re-detecting the language column
df_combined.sample(5)

Unnamed: 0,id,url,title,description,content,province,timestamp,content_length,language,source,document_type
54854,Prince_Edward_Island_1536_1757192170,https://www.princeedwardisland.ca/en/informati...,400 Bad Request,,Your browser sent a request that this server c...,Prince_Edward_Island,1757192000.0,120,en,canada_gov,html
29894,New_Brunswick_13076_1757190903,https://laws.gnb.ca/en/version/cs/P-5.05?code=...,Disposition versions,,If either an employer or a bargaining agent ha...,New_Brunswick,1757191000.0,770,en,canada_gov,html
88403,Canada_14760_1758368985,https://open.canada.ca/en/search/ati?ati%5B0%5...,Completed Access to Information Requests | Ope...,About this information Search the summaries of...,Search the summaries of completed Access to In...,Canada,1758369000.0,4041,en,canada_gov,html
59103,Quebec_pdf_408_1758363751,https://cdn-contenu.quebec.ca/cdn-contenu/adm/...,FORM-Fr_SAJMA_MJQ.pdf,Government PDF document,Page 1 de 13\nMinistère de la JusticeMJQ-905 1...,Quebec,1758364000.0,34439,fr,canada_gov,pdf
52160,Ontario_13066_1757179430,https://www.ontario.ca/page/covid-19-workplace...,COVID-19 and workplace health and safety | ont...,Get information and tools to help you protect ...,Get information and tools to help you protect ...,Ontario,1757179000.0,1524,en,canada_gov,html


In [55]:
from urllib.parse import urlparse

def extract_source_from_url(url: str) -> str:
    """Derive a source label of the form "<host>_gov" from a URL.

    The host is lower-cased, a leading "www." is removed, a trailing ".gc.ca"
    (or otherwise ".ca") is removed, remaining dots become underscores and
    "_gov" is appended. Hyphens are kept and other top-level domains are not
    stripped. Examples:

        https://www.alberta.ca/...   -> "alberta_gov"
        https://ncc-ccn.gc.ca/...    -> "ncc-ccn_gov"
        https://tc.canada.ca/...     -> "tc_canada_gov"

    A URL without a host (e.g. a relative path) yields "_gov".

    Args:
        url: Absolute URL string.

    Returns:
        The derived source label.
    """
    # `hostname` (unlike `netloc`) excludes any "user:pass@" prefix and
    # ":port" suffix, which would otherwise defeat the suffix checks below;
    # it is already lower-cased and is None when the URL has no host.
    domain = urlparse(url).hostname or ""
    if domain.startswith("www."):
        domain = domain[len("www."):]
    if domain.endswith(".gc.ca"):
        domain = domain[: -len(".gc.ca")]  # e.g., "tc.gc.ca" → "tc"
    elif domain.endswith(".ca"):
        domain = domain[: -len(".ca")]  # e.g., "alberta.ca" → "alberta"
    return domain.replace(".", "_") + "_gov"  # e.g., "tc.canada" → "tc_canada_gov"



In [56]:
# Replace the source label of every row with one derived from its URL host.
df_combined = df_combined.assign(
    source=df_combined['url'].map(extract_source_from_url)
)

In [57]:
# Display the combined frame to check the new language and source columns
df_combined

Unnamed: 0,id,url,title,description,content,province,timestamp,content_length,language,source,document_type
0,Alberta_1_1757152647,https://www.alberta.ca/careers-fisheries-manag...,Careers – Fisheries management | Alberta.ca,A career in fisheries management offers many o...,A career in fisheries management offers many o...,Alberta,1.757153e+09,4794,en,alberta_gov,html
1,Alberta_2_1757152647,https://www.alberta.ca/carrier-profiles-and-mo...,Carrier profiles and monitoring | Alberta.ca,Supports carriers' internal monitoring of on-r...,"\n\n\n\n * Effective December 2020, the Roads...",Alberta,1.757153e+09,5891,en,alberta_gov,html
2,Alberta_3_1757152647,https://www.alberta.ca/careers-land-management,Careers – Land management | Alberta.ca,There are many rewarding career paths for land...,The Lands Division is responsible for providin...,Alberta,1.757153e+09,6423,en,alberta_gov,html
3,Alberta_4_1757152648,https://www.alberta.ca/careers-wildlife-manage...,Careers – Wildlife management | Alberta.ca,A career in wildlife management offers opportu...,A career in Wildlife Management offers a contr...,Alberta,1.757153e+09,3919,en,alberta_gov,html
4,Alberta_5_1757152648,https://www.alberta.ca/careers-agrology,Careers – Agrology | Alberta.ca,Agrologists can find meaningful work managing ...,Consider a career that enables you to make a d...,Alberta,1.757153e+09,4653,en,alberta_gov,html
...,...,...,...,...,...,...,...,...,...,...,...
99509,Canada_25866_1758479609,https://www.canada.ca/fr/ministere-defense-nat...,Manuel sur la politique des congés des Forces ...,,Manuel de la politique des congés des Forces c...,Canada,1.758480e+09,298,fr,canada_gov,html
99510,Canada_25867_1758479609,https://ncc-ccn.gc.ca/podcast,Capital Stories: An NCC podcast series | Natio...,,You are here: Home Capital Stories: An NCC pod...,Canada,1.758480e+09,3616,en,ncc-ccn_gov,html
99511,Canada_25868_1758479610,https://www.guichetemplois.gc.ca/analyse-tenda...,Explorer le marché du travail au Canada - Guic...,,Explorer le marché du travail au Canada\n\nAcc...,Canada,1.758480e+09,1323,fr,guichetemplois_gov,html
99512,Canada_25869_1758479616,https://tc.canada.ca/en/aviation/reference-cen...,Advisory Circular (AC) No. 302-015,,Advisory Circular (AC) No. 302-015\n\nFrom: Tr...,Canada,1.758480e+09,12621,en,tc_canada_gov,html


In [58]:
# Write the cleaned frame to the working directory in two formats.
# CSV: no index column, UTF-8 encoded.
df_combined.to_csv("combined_canada_rag_cleaned.csv", index=False, encoding="utf-8")
# JSON Lines: one record per line; force_ascii=False keeps accented characters
# as-is instead of \u escapes.
df_combined.to_json("combined_canada_rag_cleaned.jsonl", orient="records", lines=True, force_ascii=False)