**Purpose:**
This notebook extracts the list of Designated Learning Institutions (DLIs) from the official Government of Canada website. The resulting DLI data is one of the input files for the "Study in Canada" dashboard found in the Projects/Power BI folder.

In [1]:
# Import module/s
import requests 
from bs4 import BeautifulSoup
import pandas as pd

# Government of Canada page that publishes the DLI list, one table per province/territory
url = "https://www.canada.ca/en/immigration-refugees-citizenship/services/study-canada/study-permit/prepare/designated-learning-institutions-list.html"

# requests applies no timeout unless one is given, so without this the cell
# could hang forever if the server accepts the connection but never answers.
page = requests.get(url, timeout=30)

# Fail immediately on an HTTP error (4xx/5xx) rather than parsing an error page
# and discovering the problem later as missing tables.
page.raise_for_status()

# Parse the raw HTML with the parser bundled with the Python standard library
soup = BeautifulSoup(page.content, "html.parser")

In [2]:
# Inspect the province/territory selector on the page: each <li> carries the
# anchor id (e.g. "#ab") that identifies that province's section.
prov_mapping = soup.find(name="div", class_="mwstext section")
print(prov_mapping)

<div class="mwstext section"><div id="dli-dropdown"></div>
<div class="wb-frmvld hidden" id="ffctrl">
<form method="get">
<div class="wb-fieldflow" data-wb-fieldflow='{"noForm":true,"noreqlabel":true,"base":{"live":true},"action":"toggle","prop":"toggle","hideelm":"#ab,#bc,#mb,#nb,#nfld,#nt,#ns,#nu,#on,#qc,#pei,#sk,#yk", "unhideelm":"#ffctrl"}'>
<p>View list by province or territory</p>
<ul>
<li data-wb-fieldflow="#ab">Alberta</li>
<li data-wb-fieldflow="#bc">British Columbia</li>
<li data-wb-fieldflow="#mb">Manitoba</li>
<li data-wb-fieldflow="#nb">New Brunswick</li>
<li data-wb-fieldflow="#nfld">Newfoundland and Labrador</li>
<li data-wb-fieldflow="#nt">Northwest Territories</li>
<li data-wb-fieldflow="#ns">Nova Scotia</li>
<li data-wb-fieldflow="#nu">Nunavut</li>
<li data-wb-fieldflow="#on">Ontario</li>
<li data-wb-fieldflow="#qc">Quebec</li>
<li data-wb-fieldflow="#pei">Prince Edward Island</li>
<li data-wb-fieldflow="#sk">Saskatchewan</li>
<li data-wb-fieldflow="#yk">Yukon</li>
</

In [3]:
# Map each province/territory anchor id used on the page to its display name
prov_dict = {
    'ab': 'Alberta',
    'bc': 'British Columbia',
    'mb': 'Manitoba',
    'nb': 'New Brunswick',
    'nfld': 'Newfoundland and Labrador',
    'nt': 'Northwest Territories',
    'ns': 'Nova Scotia',
    'nu': 'Nunavut',
    'on': 'Ontario',
    'qc': 'Quebec',
    'pei': 'Prince Edward Island',
    'sk': 'Saskatchewan',
    'yk': 'Yukon',
}

# Column order of the resulting dataframe
DLI_COLUMNS = ['province', 'name_of_institution', 'dli_number',
               'city', 'campuses', 'offers_pgwp_programs']

# Collect one dict per institution and build the dataframe once at the end.
# Concatenating inside the loop would copy the whole frame for every record.
dli_records = []
skipped_rows = 0  # rows whose cell count matches neither known table layout

# Loop through all the provinces/territories in the dictionary
for prov_key, prov_name in prov_dict.items():
    prov_section = soup.find(id=prov_key)  # Section holding this province's table(s)
    if prov_section is None:
        # Fail with a clear message instead of an AttributeError on None
        raise ValueError(
            f"No element with id '{prov_key}' ({prov_name}) was found on the page; "
            "the page layout may have changed."
        )

    for table_body in prov_section.find_all('tbody'):
        # Parse row by row so that one irregular row cannot shift the
        # column alignment of every record that follows it.
        for table_row in table_body.find_all('tr'):
            cell_values = [cell.text.strip() for cell in table_row.find_all('td')]

            if not cell_values:
                continue  # Row without data cells (e.g. header-only row)

            if len(cell_values) == 5:
                # Layout: name, DLI number, city, campuses, PGWP
                name_of_institution, dli_number, city, campuses, offers_pgwp_programs = cell_values
            elif len(cell_values) == 4:
                # Layout without a campuses column (the BC table at the time of writing)
                name_of_institution, dli_number, city, offers_pgwp_programs = cell_values
                campuses = ''
            else:
                # Not an institution record in either known layout
                skipped_rows += 1
                continue

            dli_records.append({
                'province': prov_name,
                'name_of_institution': name_of_institution,
                'dli_number': dli_number,
                'city': city,
                'campuses': campuses,
                'offers_pgwp_programs': offers_pgwp_programs,
            })

if skipped_rows:
    print(f"Skipped {skipped_rows} table row(s) with an unexpected number of cells")

# Passing the column list keeps the schema stable even when no record was found
dli_data = pd.DataFrame(dli_records, columns=DLI_COLUMNS)

print(dli_data.head())
print(dli_data.shape)

  province                          name_of_institution     dli_number  \
0  Alberta                                   GD College  O281802098042   
1  Alberta                       Global Village Calgary  O278527539112   
2  Alberta                 Prairie Western College Inc.  O278565506082   
3  Alberta                  Peace River Bible Institute  O147371918072   
4  Alberta  Visual College of Art and Design of Calgary  O269471149722   

       city  campuses offers_pgwp_programs  
0   Calgary   Calgary                   No  
1   Calgary   Calgary                   No  
2   Calgary   Calgary                   No  
3  Sexsmith  Sexsmith                   No  
4   Calgary   Calgary                   No  
(1621, 6)


In [4]:
def ConvertYes(inVal):
    """Collapse a raw PGWP value into one of three categories for the dashboard.

    Args:
        inVal: Value taken from the 'offers_pgwp_programs' column.

    Returns:
        ``inVal`` unchanged when it equals "Yes" or "No"; otherwise the
        string "With Limitations".
    """
    passthrough_values = ("Yes", "No")
    if inVal not in passthrough_values:
        return "With Limitations"
    return inVal

# Derive the simplified PGWP category for every institution and store it
# alongside the original value in a new column
pgwp_tags = dli_data['offers_pgwp_programs'].map(ConvertYes)
dli_data['PGWP_Tag'] = pgwp_tags
dli_data.head()

Unnamed: 0,province,name_of_institution,dli_number,city,campuses,offers_pgwp_programs,PGWP_Tag
0,Alberta,GD College,O281802098042,Calgary,Calgary,No,No
1,Alberta,Global Village Calgary,O278527539112,Calgary,Calgary,No,No
2,Alberta,Prairie Western College Inc.,O278565506082,Calgary,Calgary,No,No
3,Alberta,Peace River Bible Institute,O147371918072,Sexsmith,Sexsmith,No,No
4,Alberta,Visual College of Art and Design of Calgary,O269471149722,Calgary,Calgary,No,No


In [5]:
from datetime import datetime

# Stamp every record with the moment this extraction ran (local clock, one
# shared value for the whole run)
extraction_time = datetime.now()
dli_data['extraction_date'] = extraction_time
print(dli_data.head())

# Write the final dataset to disk without the dataframe index
output_file = 'DLI Data.csv'
dli_data.to_csv(output_file, index=False)

  province                          name_of_institution     dli_number  \
0  Alberta                                   GD College  O281802098042   
1  Alberta                       Global Village Calgary  O278527539112   
2  Alberta                 Prairie Western College Inc.  O278565506082   
3  Alberta                  Peace River Bible Institute  O147371918072   
4  Alberta  Visual College of Art and Design of Calgary  O269471149722   

       city  campuses offers_pgwp_programs PGWP_Tag            extraction_date  
0   Calgary   Calgary                   No       No 2023-04-26 22:29:09.112874  
1   Calgary   Calgary                   No       No 2023-04-26 22:29:09.112874  
2   Calgary   Calgary                   No       No 2023-04-26 22:29:09.112874  
3  Sexsmith  Sexsmith                   No       No 2023-04-26 22:29:09.112874  
4   Calgary   Calgary                   No       No 2023-04-26 22:29:09.112874  


**Data Source:**
- https://www.canada.ca/en/immigration-refugees-citizenship/services/study-canada/study-permit/prepare/designated-learning-institutions-list.html

**Reference:**
- https://realpython.com/beautiful-soup-web-scraper-python/#step-2-scrape-html-content-from-a-page

**Installations:**
- requests               2.28.2
- beautifulsoup4         4.12.0
- pandas                 1.5.3