# Teacher List Scrape

This notebook scrapes a list of teachers from directory pages on Insight Timer's website.

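The resulting dataframe contains two columns (example values shown):
- alpha_index: m
- teacher_id: malhuxter

The teacher's display name (e.g. Malcolm Huxter) appears on the directory pages but is not currently captured as a column.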

The dataframe is saved to **teachers_list_df.csv**.

In [None]:
from selenium import webdriver
#import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
import string
from datetime import datetime

In [None]:
# Insight Timer webpages use JavaScript, so they are loaded in a real browser
# through Selenium and chromedriver rather than fetched as static HTML.
# Relative path to the chromedriver executable that is handed to webdriver.Chrome;
# adjust it to wherever chromedriver lives on your machine.
chrome_driver_path = '../../../../Tech/chrome_driver/chromedriver.exe'

# Alternative Teacher Lists
https://insighttimer.com/meditation-teachers/ <br>
https://insighttimer.com/meditation-teachers/starts-with-k <br>
https://insighttimer.com/meditation-teachers/starts-with-k/1 <br>
https://insighttimer.com/meditation-teachers/starts-with-k/2 <br>
Each page lists up to 50 teachers. <br>
This directory also gives the number of teachers for each letter. <br>
It does not have pages for "hash" and "more" teachers. <br>

In [None]:
# One entry per lowercase ASCII letter: ['a', 'b', 'c', ... 'x', 'y', 'z'].
# The "starts-with-" listing has no pages for 'hash' and 'more'.
directory_alpha_index = [letter for letter in string.ascii_lowercase]

In [None]:
# Base URL of the per-letter teacher listing; the letter (and, for later pages,
# '/<page number>') is appended to it when each page URL is built.
teachers_url = 'https://insighttimer.com/meditation-teachers/starts-with-'

In [None]:
# Two parallel, initially empty lists; each becomes one dataframe column.
teacher_ids, alpha_indices = [], []

In [None]:
start_time = datetime.now()

# Each listing page holds at most this many teachers, so a shorter page is the
# last page for its letter.
TEACHERS_PER_PAGE = 50

# Exact class attribute of the <div> that wraps each teacher link on a listing page.
TEACHER_DIV_CLASS = 'MuiGrid-root w-1/2 xs:w-1/3 sm:w-1/3 md:w-1/5 lg:w-1/6 lg2:w-1/8 MuiGrid-item'

# Start from empty lists so that re-running this cell does not append duplicates.
teacher_ids = []
alpha_indices = []

# Iterate through teacher pages for each letter
for alpha_index in directory_alpha_index:

    alpha_page_num = 0

    while True:

        # The first page is the bare letter URL; later pages append '/<page number>'.
        # NOTE(review): this assumes '/1' is the page after the bare URL rather than
        # an alias of it -- confirm, otherwise the first page is scraped twice.
        if alpha_page_num == 0:
            url = teachers_url + alpha_index
        else:
            url = teachers_url + alpha_index + '/' + str(alpha_page_num)

        # A fresh Chrome session is started for every page. quit() in the finally
        # block shuts down the browser and its chromedriver process even when
        # loading or parsing the page fails.
        # NOTE(review): executable_path is the Selenium 3 way of passing the driver
        # path; newer Selenium releases expect a Service object -- confirm the
        # installed version.
        driver = webdriver.Chrome(executable_path=chrome_driver_path)
        try:
            driver.get(url)

            # NOTE(review): implicitly_wait only sets the timeout used by element
            # look-ups; it does not pause here. The page source is read as soon as
            # get() returns. Use an explicit wait if teachers turn out to be missing.
            driver.implicitly_wait(3)

            # Name the parser explicitly so the result does not depend on which
            # parsers happen to be installed.
            soup = BS(driver.page_source, 'html.parser')
        finally:
            driver.quit()

        div_teacher_tags = soup.find_all('div', attrs={'class': TEACHER_DIV_CLASS})

        print(alpha_index,'page ',alpha_page_num,',',len(div_teacher_tags),'teachers')

        for teacher_div_tag in div_teacher_tags:

            #Find 'a' tag contained within 'div' tag
            teacher_a_tag = teacher_div_tag.find('a')

            # A div without a link, or a link without an href, is recorded with the
            # placeholder id 'no href' so both lists stay the same length.
            if teacher_a_tag is None:
                href = '/no href'
            else:
                href = teacher_a_tag.get('href', default='/no href')

            #teacher_id = href attribute with first "/" character removed
            teacher_ids.append(href[1:])

            #Record the alpha index the teacher was listed under
            alpha_indices.append(alpha_index)

        # A full page may be followed by another page, so keep going. When the
        # number of teachers for a letter is an exact multiple of the page size,
        # the following page is empty and ends the loop here as well.
        if len(div_teacher_tags) < TEACHERS_PER_PAGE:
            break
        alpha_page_num += 1

end_time = datetime.now()

In [None]:
#Print Runtime
runtime = end_time - start_time

# total_seconds() covers the whole duration; timedelta.seconds alone leaves out
# the days component and would under-report a run longer than 24 hours.
elapsed_seconds = int(runtime.total_seconds())
hours, remainder = divmod(elapsed_seconds, 3600)
minutes, seconds = divmod(remainder, 60)

print('Runtime:')

# Hours and minutes are only shown when they are non-zero.
if hours > 0:
    print(hours,'hours')
if minutes > 0:
    print(minutes,'minutes')
print(seconds,'seconds')

In [None]:
# Collect the scraped columns under their column names, then build the
# DataFrame of teachers found through the a-z listing.
teachers_list_dict = {}
teachers_list_dict['teacher_id'] = teacher_ids
teachers_list_dict['alpha_index'] = alpha_indices

teachers_list_df = pd.DataFrame(data=teachers_list_dict)

In [None]:
# Save results to data file
# Not saving here: the 'hash' and 'more' rows from the original approach (below) are added first.
#teachers_list_df.to_csv('../data/teachers_list_df.csv')

### Teacher Lists -- Original Approach

https://insighttimer.com/dir/meditation-teachers/  <br>
https://insighttimer.com/dir/meditation-teachers/a <br>
https://insighttimer.com/dir/meditation-teachers/b

In [None]:
# Only the two pages that the a-z listing lacks are scraped with this approach:
#   'hash' - teachers with names starting with a number.
#   'more' - teachers with names starting with non-standard characters
#            such as punctuation marks or languages other than English.
# The full directory for this approach would be
# ['hash'] + list(string.ascii_lowercase) + ['more'],
# i.e. ['hash', 'a', 'b', 'c', ... 'x', 'y', 'z', 'more'].
directory_alpha_index = ['hash', 'more']

In [None]:
# Base URL of the original directory pages; an alpha index such as 'hash' or
# 'more' is appended to it to form each page URL.
teachers_dir_url = 'https://insighttimer.com/dir/meditation-teachers/'

In [None]:
# Reset both column lists to empty before scraping with the original approach.
teacher_ids = list()
alpha_indices = list()

In [None]:
start_time = datetime.now()

# Start from empty lists so that re-running this cell does not append duplicates.
teacher_ids = []
alpha_indices = []

# Iterate through the directory page of each alpha index
for alpha_index in directory_alpha_index:

    url = teachers_dir_url + alpha_index

    # A fresh Chrome session is started for every page. quit() in the finally
    # block shuts down the browser and its chromedriver process even when
    # loading or parsing the page fails.
    # NOTE(review): executable_path is the Selenium 3 way of passing the driver
    # path; newer Selenium releases expect a Service object -- confirm the
    # installed version.
    driver = webdriver.Chrome(executable_path=chrome_driver_path)
    try:
        driver.get(url)

        # NOTE(review): implicitly_wait only sets the timeout used by element
        # look-ups; it does not pause here. The page source is read as soon as
        # get() returns. Use an explicit wait if teachers turn out to be missing.
        driver.implicitly_wait(3)

        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed.
        soup = BS(driver.page_source, 'html.parser')
    finally:
        driver.quit()

    #Example of tag that includes teacher data:
    #<div class="css-1y0feak">
    #  <a href="/malhuxter">Malcolm Huxter</a>
    #</div>

    div_teacher_tags = soup.find_all('div', attrs={'class': 'css-1y0feak'})

    for teacher_div_tag in div_teacher_tags:

        #Get 'a' tag contained within 'div' tag
        teacher_a_tag = teacher_div_tag.find('a')

        # A div without a link, or a link without an href, is recorded with the
        # placeholder id 'no href' so both lists stay the same length.
        if teacher_a_tag is None:
            href = '/no href'
        else:
            href = teacher_a_tag.get('href', default='/no href')

        #teacher_id = href attribute with first "/" character removed
        teacher_ids.append(href[1:])

        #Record the alpha index the teacher was listed under
        alpha_indices.append(alpha_index)

end_time = datetime.now()

In [None]:
# Pair each column name with its scraped list, then build the DataFrame of
# teachers found through the original directory pages.
added_teachers_list_dict = dict([('teacher_id', teacher_ids),
                                 ('alpha_index', alpha_indices)])

added_teachers_list_df = pd.DataFrame(data=added_teachers_list_dict)

In [None]:
# Compare the (rows, columns) of the two scraped frames before combining them:
# the a-z listing is printed first, the added frame is shown as the cell output.
print(teachers_list_df.shape)
added_teachers_list_df.shape

In [None]:
# Stack the a-z teachers and the added teachers into one frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating the row labels of each source frame.
final_teachers_list_df = pd.concat([teachers_list_df, added_teachers_list_df],
                                   ignore_index=True)

In [None]:
# Make sure the combined frame has a clean 0..n-1 index; drop=True discards the
# previous row labels instead of keeping them as an extra column.
final_teachers_list_df = final_teachers_list_df.reset_index(drop=True)

In [None]:
# Save results to data file.
# to_csv is called with its defaults, so the dataframe index is written as the
# first (unnamed) column of the CSV.
final_teachers_list_df.to_csv('../data/teachers_list_df.csv')

In [None]:
# Quick check of the combined frame: print its (rows, columns) shape and show
# the first five rows as the cell output.
print(final_teachers_list_df.shape)
final_teachers_list_df.head()

In [None]:
#Print Runtime
# start_time and end_time were last set by the scrape cell that ran most recently,
# so this reports the duration of that scrape only.
runtime = end_time - start_time

# total_seconds() covers the whole duration; timedelta.seconds alone leaves out
# the days component and would under-report a run longer than 24 hours.
elapsed_seconds = int(runtime.total_seconds())
hours, remainder = divmod(elapsed_seconds, 3600)
minutes, seconds = divmod(remainder, 60)

print('Runtime:')

# Hours and minutes are only shown when they are non-zero.
if hours > 0:
    print(hours,'hours')
if minutes > 0:
    print(minutes,'minutes')
print(seconds,'seconds')

Final Run was on December 8, 2022. Run time was 8 minutes 59 seconds.

The old method found 12,492 teachers; the new method found 15,890.

463 teachers are in the old method but not the new method:
- 3 have alpha_index = 'hash'
- 196 have alpha_index = 'more'
- 268 -- many of which no longer have pages. Not including this group.

(Note: the three groups above sum to 467, not 463; one of these counts needs rechecking.)
