# Teacher Details Scrape

In [1]:
import string
import time
from datetime import date
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup as BS
from selenium import webdriver

In [2]:
# Insight Timer webpages use javascript, so need selenium and chrome driver.
# Path to the chromedriver executable, relative to this notebook; it is passed to
# webdriver.Chrome(executable_path=...) inside scrape_teachers_one_batch.
chrome_driver_path = '../../../../Tech/chrome_driver/chromedriver.exe'

### Function: Assign Batches

Assumptions/Requirements for function:
- The Teachers List Scraping notebook has been run successfully, creating the teachers_list_df.csv file.
- teachers_list_df.csv has numeric sequential index values starting with 0.
- The batch_size argument passed to the function is an integer from 1 up to the number of rows in teachers_list_df (a batch size of 0 would cause a division-by-zero error).

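***Note: Choose one batch size and keep it. Scraped batch files are named by batch_id, so reassigning batches with a different batch size changes which teachers belong to each batch_id, and previously scraped batch files will no longer match the assignments. Batch assignment depends only on row order and batch size, so if you do reassign with a different batch size, rerunning with the original batch size restores the original assignments (as long as teachers_list_df.csv has not changed).***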

In [None]:
def assign_batches(batch_size):
    """Assign every teacher to a fixed-size batch and save the result.

    Reads ``../data/teachers_list_df.csv``, adds an integer ``batch_id`` column
    (rows 0..batch_size-1 get 1, the next ``batch_size`` rows get 2, and so on)
    and writes the frame to ``../data/teachers_batch_list.csv``.

    Parameters
    ----------
    batch_size : int
        Number of rows per batch; must be at least 1.

    Returns
    -------
    None
        The batch assignments are written to disk and a summary is printed.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    AssertionError
        If the CSV index is not exactly 0, 1, 2, ... in order.
    """
    # Reject 0 / negative sizes up front instead of failing later with a
    # ZeroDivisionError or silently producing nonsensical batch ids.
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer, got ' + repr(batch_size))

    teachers_list_df = pd.read_csv('../data/teachers_list_df.csv', index_col=0)
    num_rows = teachers_list_df.shape[0]

    #Verify that the index values are as expected (0, 1, 2, etc.)
    assert teachers_list_df.index.to_list() == list(range(num_rows)), \
        'teachers_list_df.csv index must be sequential integers starting at 0'

    # Ceiling division: a partially filled final batch still counts as a batch.
    num_batches = -(-num_rows // batch_size)

    print(num_rows,'total rows')
    print(batch_size,'rows per batch')
    print(int(num_batches),'batches')

    # The index equals the row position (checked above), so the 1-based batch id
    # is position // batch_size + 1. One column assignment replaces the per-row loop.
    teachers_list_df['batch_id'] = [position // batch_size + 1 for position in range(num_rows)]
    teachers_list_df['batch_id'] = teachers_list_df['batch_id'].astype('int')

    #Save batch results to data file
    teachers_list_df.to_csv('../data/teachers_batch_list.csv')

### Function: Scrape Teachers One Batch

Assumptions/Requirements for function:
- The assign_batches function has been run to create the teachers_batch_list.csv file.
- Dataframe teachers_batch_list has been created by reading in teachers_batch_list.csv file.
- teachers_batch_list has numeric sequential index values starting with 0.
- The scrape_batch_id argument passed to the function is an integer that matches a batch_id value in teachers_batch_list.

In [3]:
def scrape_teachers_one_batch(scrape_batch_id, page_load_wait=2, retry_wait=5):
    """Scrape the Insight Timer page of every teacher in one batch and save a CSV.

    Reads the notebook-level ``teachers_batch_list`` dataframe (it must already
    be loaded), selects the rows whose ``batch_id`` equals ``scrape_batch_id``,
    opens each teacher page in Chrome via Selenium, parses it with
    BeautifulSoup and writes the rows to
    ``../data/teacher_batch_files/teacher_batch_<5-digit id>.csv``.

    Parameters
    ----------
    scrape_batch_id : int
        ``batch_id`` value identifying the rows to scrape.
    page_load_wait : float, optional
        Seconds to pause after ``driver.get`` so the page's javascript can
        render before the HTML is read. Two seconds is usually but not always
        long enough.
    retry_wait : float, optional
        Additional seconds to pause before re-reading the page when the teacher
        name is missing from the first snapshot.

    Returns
    -------
    None
        Results are written to the batch CSV; progress and runtime are printed.
    """
    # (output column, tag name, class attribute) for the plain-text fields.
    text_fields = [
        ('location', 'p', 'chakra-text css-1n5ydt0'),
        ('followers', 'p', 'chakra-text css-brfdt9'),
        ('languages', 'p', 'chakra-text css-1gmivde'),
        ('date_joined', 'p', 'chakra-text css-d0wkpr'),
        ('about', 'div', 'css-17179af'),
    ]
    name_attrs = {'class':'chakra-text css-nagewt'}
    not_found_attrs = {'class':'text-lg font-ProxiBold mb-6 leading-tight'}

    # Work on a copy so the shared teachers_batch_list frame is never modified.
    teachers_batch = teachers_batch_list.loc[teachers_batch_list.batch_id == scrape_batch_id].copy()
    print(teachers_batch.shape[0],'rows in batch')

    #Batch Start
    batch_start_time = datetime.now()

    for batch_row, (index, row) in enumerate(teachers_batch.iterrows(), start=1):
        print('Batch row',batch_row,'of',teachers_batch.shape[0],': teacher_id =', row.teacher_id)

        #Record scrape date (Rows will have scrape_date even if page not found.)
        teachers_batch.loc[index, 'scrape_date'] = datetime.now()

        teacher_url = 'https://insighttimer.com/' + row.teacher_id

        h2_name_tag = None
        driver = webdriver.Chrome(executable_path=chrome_driver_path)
        try:
            driver.get(teacher_url)

            # implicitly_wait() only sets the timeout used by element lookups and
            # does not pause, so sleep explicitly to let the javascript render.
            time.sleep(page_load_wait)

            # Name the parser explicitly so results do not depend on which
            # optional parsers happen to be installed.
            soup = BS(driver.page_source, 'html.parser')

            h1_page_not_found_tag = soup.find('h1', attrs = not_found_attrs)
            page_found = not (
                h1_page_not_found_tag is not None
                and h1_page_not_found_tag.text == "The page you were looking for doesn't exist."
            )

            if page_found:
                #Get teacher_name. If not found, wait longer, remake soup, and look again.
                h2_name_tag = soup.find('h2', attrs = name_attrs)
                if h2_name_tag is None:
                    time.sleep(retry_wait)
                    soup = BS(driver.page_source, 'html.parser')
                    h2_name_tag = soup.find('h2', attrs = name_attrs)
        finally:
            # quit() closes the browser window and also ends the chromedriver
            # process; running it in finally prevents leaks on errors/interrupts.
            driver.quit()

        if not page_found:
            teachers_batch.loc[index, 'scrape_status'] = 'page not found'
            print('PAGE NOT FOUND for teacher_id =',row.teacher_id)
            continue

        if h2_name_tag is None:
            teachers_batch.loc[index, 'scrape_status'] = 'name not found'
            print('Teacher name not found for teacher_id =',
                  row.teacher_id,
                  'in batch_id = ',
                  scrape_batch_id)
        else:
            teachers_batch.loc[index, 'scrape_status'] = 'name found'
            teachers_batch.loc[index, 'teacher_name'] = h2_name_tag.text

        # Remaining text fields are optional: a column is written only when its tag exists.
        for column, tag_name, css_class in text_fields:
            field_tag = soup.find(tag_name, attrs = {'class': css_class})
            if field_tag is not None:
                teachers_batch.loc[index, column] = field_tag.text

        #Get image url
        img_image_tag = soup.find('img', attrs = {'class':'chakra-image css-1ssn357'})
        if img_image_tag is not None:
            teachers_batch.loc[index, 'image_url'] = img_image_tag.get('src', default = '/no src')

    #Save batch results to data file
    #zfill adds leading zeros which allows filenames to be sorted correctly alphabetically.
    batch_id_string = str(scrape_batch_id).zfill(5)
    batch_filename = '../data/teacher_batch_files/teacher_batch_' + batch_id_string + '.csv'
    teachers_batch.to_csv(batch_filename)

    #Batch End
    batch_end_time = datetime.now()

    #Print Runtime
    # total_seconds() includes whole days; timedelta.seconds alone would drop them.
    batch_runtime = batch_end_time - batch_start_time
    hours, remainder = divmod(int(batch_runtime.total_seconds()), 3600)
    minutes, seconds = divmod(remainder, 60)

    print('Batch_id',scrape_batch_id,'completed')
    print('Batch runtime:')

    if hours > 0:
        print(hours,'hours')
    if minutes > 0:
        print(minutes,'minutes')
    print(seconds,'seconds')

### Scrape Multiple Teacher Batches

In [4]:
def scrape_teachers_multiple_batches(batch_id_range_or_list):
    """Scrape several teacher batches one after another.

    Reloads ``../data/teachers_batch_list.csv`` into the notebook-level
    ``teachers_batch_list`` dataframe, adds the (empty) result columns so every
    batch file has the same column order, then calls
    ``scrape_teachers_one_batch`` for each requested batch id and prints the
    total runtime.

    Parameters
    ----------
    batch_id_range_or_list : range, list or tuple
        Batch ids to scrape, processed in the given order.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If ``batch_id_range_or_list`` is not a range, list or tuple.
    AssertionError
        If the CSV index is not exactly 0, 1, 2, ... in order.
    """
    # scrape_teachers_one_batch reads this frame from the global namespace.
    global teachers_batch_list

    # Fail before any file access or scraping starts. Previously a wrong type
    # only printed a message and the function crashed later on an unbound name.
    if not isinstance(batch_id_range_or_list, (range, list, tuple)):
        raise TypeError(
            'ERROR Wrong datatype: batch_id_range_or_list must be a range, list or tuple, got '
            + type(batch_id_range_or_list).__name__
        )
    batch_id_list = list(batch_id_range_or_list)

    teachers_batch_list = pd.read_csv('../data/teachers_batch_list.csv', index_col=0)

    #Verify that the index values are as expected (0, 1, 2, etc.)
    assert teachers_batch_list.index.to_list() == list(range(teachers_batch_list.shape[0])), \
        'teachers_batch_list.csv index must be sequential integers starting at 0'

    #Add empty columns that will be filled by scraping teacher pages.
    #Adding them here to set a standard order of columns.
    result_columns = [
        'scrape_date',
        'scrape_status',
        'teacher_name',
        'location',
        'followers',
        'languages',
        'date_joined',
        'about',
        'image_url',
    ]
    for column in result_columns:
        teachers_batch_list[column] = None

    #Start Batches
    batches_start_time = datetime.now()
    batch_num = 0

    for batch_num, batch_id in enumerate(batch_id_list, start=1):
        print('Starting batch',
              batch_num,
              'out of',
              len(batch_id_list),
              ', batch_id =',
              batch_id)
        scrape_teachers_one_batch(batch_id)
        print('Completed batch_id = ',batch_id)
        print('')

    #End Batches
    batches_end_time = datetime.now()

    #Print Runtime
    # total_seconds() includes whole days; timedelta.seconds alone would drop
    # them for runs longer than 24 hours.
    batches_runtime = batches_end_time - batches_start_time
    hours, remainder = divmod(int(batches_runtime.total_seconds()), 3600)
    minutes, seconds = divmod(remainder, 60)

    print(batch_num,'batches completed with runtime:')
    if hours > 0:
        print(hours,'hours')
    if minutes > 0:
        print(minutes,'minutes')
    print(seconds,'seconds')

### Example use of assign_batches function

In [None]:
# Split the teachers list into batches of 100 rows; assign_batches writes
# the result to ../data/teachers_batch_list.csv.
batch_size = 100
assign_batches(batch_size)

### Example use of scrape_teachers_one_batch function

In [None]:
# Load the batch assignments; scrape_teachers_one_batch reads this frame
# from the notebook's global namespace.
teachers_batch_list = pd.read_csv('../data/teachers_batch_list.csv', index_col=0)

# The index must be exactly 0, 1, 2, ... with no gaps or reordering.
index_list = teachers_batch_list.index.to_list()
for position, index_value in enumerate(index_list):
    assert index_value == position

In [None]:
# Scrape a single batch. teachers_batch_list must already be loaded
# (see the previous cell) because the function reads it as a global.
scrape_batch_id = 85
scrape_teachers_one_batch(scrape_batch_id)

In [None]:
# Read back the CSV written for batch 85 and preview its first rows.
fresh_batch = pd.read_csv('../data/teacher_batch_files/teacher_batch_00085.csv', index_col=0)
fresh_batch.head()

### Real Use

In [5]:
# range(40, 70) covers batch ids 40 through 69 (the end value is exclusive).
batch_id_range = range(40, 70)
scrape_teachers_multiple_batches(batch_id_range)

Starting batch 1 out of 30 , batch_id = 40
100 rows in batch
Batch row 1 of 100 : teacher_id = birthdreams
Batch row 2 of 100 : teacher_id = drtarasleepnd
Batch row 3 of 100 : teacher_id = ayurvedawithdrew
Batch row 4 of 100 : teacher_id = dianac
Batch row 5 of 100 : teacher_id = deborahgreenmeditation
Batch row 6 of 100 : teacher_id = femininefrequencyrising
Batch row 7 of 100 : teacher_id = dorseystandish
Batch row 8 of 100 : teacher_id = guidedfromtheinside
Batch row 9 of 100 : teacher_id = soundtemple
Batch row 10 of 100 : teacher_id = donaldpincott
Batch row 11 of 100 : teacher_id = ganz.du.
Batch row 12 of 100 : teacher_id = frei.sein.
Batch row 13 of 100 : teacher_id = dillonsibilla
Batch row 14 of 100 : teacher_id = bliss_tigress
Batch row 15 of 100 : teacher_id = dennis
Batch row 16 of 100 : teacher_id = drniikee
Batch row 17 of 100 : teacher_id = dr.kim
Batch row 18 of 100 : teacher_id = danielpatka
Batch row 19 of 100 : teacher_id = doriskessel
Batch row 20 of 100 : teacher_

KeyboardInterrupt: 

### Example Use of scrape_teachers_multiple_batches

In [None]:
# Scrape batch ids 104, 105 and 106 (the range end value is exclusive).
batch_id_range = range(104, 107)
scrape_teachers_multiple_batches(batch_id_range)

In [None]:
# Read back the three batch files written above, in batch-id order.
batch_df_list = [
    pd.read_csv('../data/teacher_batch_files/teacher_batch_00104.csv', index_col=0),
    pd.read_csv('../data/teacher_batch_files/teacher_batch_00105.csv', index_col=0),
    pd.read_csv('../data/teacher_batch_files/teacher_batch_00106.csv', index_col=0),
]
batch104, batch105, batch106 = batch_df_list

# Stack the batches into one frame and preview the first 50 rows.
batches_range_result = pd.concat(batch_df_list)
batches_range_result.head(50)

In [None]:
# Example use of scrape_teachers_multiple_batches with an explicit list of
# batch ids; batches are scraped in the order given.
batch_id_list = [101, 102, 40]
scrape_teachers_multiple_batches(batch_id_list)

In [None]:
# Read back the three batch files in the same order they were scraped.
batch_df_list = [
    pd.read_csv('../data/teacher_batch_files/teacher_batch_00101.csv', index_col=0),
    pd.read_csv('../data/teacher_batch_files/teacher_batch_00102.csv', index_col=0),
    pd.read_csv('../data/teacher_batch_files/teacher_batch_00040.csv', index_col=0),
]
batch101, batch102, batch40 = batch_df_list

# Stack the batches into one frame and preview the first 50 rows.
batches_list_result = pd.concat(batch_df_list)
batches_list_result.head(50)

In [None]:
# Scratch: reload the batch assignments from disk.
teachers_batch_list = pd.read_csv('../data/teachers_batch_list.csv', index_col=0)

In [None]:
# Scratch: display the loaded batch assignments.
teachers_batch_list

In [None]:
# Scratch: add (or reset) an empty scrape_date column on the in-memory frame.
teachers_batch_list['scrape_date'] = None

In [None]:
# Scratch: show the current local timestamp.
datetime.now()