# Teacher Details Scrape

In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup as BS
import pandas as pd
import string
from datetime import date
from datetime import datetime

In [None]:
# Insight Timer webpages are rendered with JavaScript, so Selenium and a ChromeDriver binary are needed.
# Relative path to the ChromeDriver executable that Selenium launches.
chrome_driver_path = '../../../../Tech/chrome_driver/chromedriver.exe'

### Read in teacher list data

In [None]:
# Load the teacher list CSV, using its first column as the DataFrame index.
teachers_list_df = pd.read_csv(
    '../data/teachers_list_df.csv',
    index_col=0,
)

In [None]:
# TODO: drop this rename once the teachers list scraping notebook and
# teachers_list_df.csv produce the 'directory_name' column themselves.
teachers_list_df = teachers_list_df.rename(
    columns={'teacher_name': 'directory_name'},
)

In [None]:
# Display the teacher list (notebook cell output).
teachers_list_df

### Assign Batches

In [None]:
# Number of teacher rows assigned to each batch_id.
batch_size = 10

In [None]:
# Report the batch size, the total row count, and the number of batches needed.
print('Batch size set to',batch_size,'rows')

num_rows = teachers_list_df.shape[0]
print('Number of total rows is',num_rows)

# Ceiling division via divmod: a partial final batch still counts as one batch.
# Staying in integer arithmetic keeps num_batches an int (true division with "/"
# produced a float, e.g. 3.0 instead of 3).
full_batches, leftover_rows = divmod(num_rows, batch_size)
num_batches = full_batches + (1 if leftover_rows else 0)
print('Number of batches is',num_batches)

In [None]:
# Assign every row a 1-based batch_id derived from its index label:
# labels 0..batch_size-1 -> batch 1, the next batch_size labels -> batch 2, and so on.
# A single vectorised floor division replaces the row-by-row iterrows()/.loc loop,
# which was slow and went through float division before casting back to int.
# NOTE(review): like the original arithmetic, this assumes numeric index labels.
teachers_list_df['batch_id'] = (teachers_list_df.index.values // batch_size) + 1

teachers_list_df.batch_id = teachers_list_df.batch_id.astype('int')

In [None]:
# Display the last 15 rows, including the newly assigned batch_id column.
teachers_list_df.tail(15)

### Scrape a Single Batch

In [None]:
# batch_id of the batch to scrape; also used in the output CSV filename.
scrape_batch_id = 578

In [None]:
# Select the rows of the batch being scraped.
# Filter first and copy only the selected rows: the result is the same independent
# DataFrame, without duplicating the entire teacher list just to discard most of it.
teachers_batch = teachers_list_df.loc[teachers_list_df.batch_id == scrape_batch_id].copy()
print(teachers_batch.shape[0],'rows in batch')

In [None]:
# Scrape one batch of teacher pages.
#
# For every row in teachers_batch: open the teacher's Insight Timer page in Chrome,
# parse the rendered HTML, and store the scraped fields back into teachers_batch.
# The finished batch is written to a CSV file and the elapsed time is printed.


def _tag_text(soup, tag_name, css_class):
    """Return the text of the first tag_name element with css_class, or None if absent."""
    tag = soup.find(tag_name, attrs={'class': css_class})
    return tag.text if tag is not None else None


# (output column, tag name, CSS class) for every field stored as plain tag text.
_text_fields = (
    ('teacher_name', 'h2', 'chakra-text css-nagewt'),
    ('location', 'p', 'chakra-text css-1n5ydt0'),
    ('followers', 'p', 'chakra-text css-brfdt9'),
    ('languages', 'p', 'chakra-text css-1gmivde'),
    ('date_joined', 'p', 'chakra-text css-d0wkpr'),
    ('about', 'div', 'css-17179af'),
)

#Batch Start
batch_start_time = datetime.now()

for index, row in teachers_batch.iterrows():
    print('Teacher =', row.teacher_href)

    teacher_url = 'https://insighttimer.com/' + row.teacher_href

    driver = webdriver.Chrome(executable_path=chrome_driver_path)
    try:
        driver.get(teacher_url)

        #Wait for page to fully load
        # NOTE(review): implicitly_wait only sets the timeout used by element lookups;
        # it does not pause here. Consider an explicit wait if fields come back empty.
        driver.implicitly_wait(3)  #Changing from 2 to 3 reduced errors.

        #Make soup; the parser is named explicitly so results do not depend on
        #which optional parser libraries happen to be installed.
        soup = BS(driver.page_source, 'html.parser')
    finally:
        # quit() ends the whole browser session (close() only closes the window),
        # and the finally clause guarantees it even when loading the page fails.
        driver.quit()

    # A teacher page may lack some fields (e.g. languages). Store None for those
    # instead of raising AttributeError and losing the whole batch.
    for column, tag_name, css_class in _text_fields:
        value = _tag_text(soup, tag_name, css_class)
        if value is None:
            print('  Missing', column, 'for', row.teacher_href)
        teachers_batch.loc[index, column] = value

    #Get image url
    img_image_tag = soup.find('img', attrs={'class': 'chakra-image css-1ssn357'})
    if img_image_tag is None:
        print('  Missing image_url for', row.teacher_href)
        image_url = None
    else:
        image_url = img_image_tag.get('src', default='/no src')
    teachers_batch.loc[index, 'image_url'] = image_url

    teachers_batch.loc[index, 'scrape_date'] = date.today()

#Save batch results to data file (batch id zero-padded to 5 digits)
batch_id_string = str(scrape_batch_id).zfill(5)
batch_filename = '../data/teacher_batch_files/teacher_batch_' + batch_id_string + '.csv'
teachers_batch.to_csv(batch_filename)

#Batch End
batch_end_time = datetime.now()

#Print Runtime
batch_runtime = batch_end_time - batch_start_time
# total_seconds() covers the full duration; timedelta.seconds drops whole days.
hours, remainder = divmod(int(batch_runtime.total_seconds()), 3600)
minutes, seconds = divmod(remainder, 60)

print('Batch_id',scrape_batch_id,'completed')
print('Batch runtime:')

if hours > 0:
    print(hours,'hours')
if minutes > 0:
    print(minutes,'minutes')
print(seconds,'seconds')

In [None]:
# Display the scraped batch (notebook cell output).
teachers_batch