# Meditation List Scrape -- COPY

In [None]:
import os
import string
import time
from datetime import date
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

In [None]:
# Insight Timer webpages are built with JavaScript, so they have to be loaded through
# Selenium and ChromeDriver before their HTML can be parsed.
# Path to the chromedriver executable; it is a relative path, so it is resolved
# against the notebook's working directory. Adjust it for your machine.
chrome_driver_path = '../../../../Tech/chrome_driver/chromedriver.exe'

### Function: Assign Batches

Assumptions/Requirements for function:
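- The Teachers List Scraping notebook has been run successfully, creating the teachers_list_df.csv file.
- teachers_list_df.csv has sequential numeric index values starting with 0 (0, 1, 2, ...).
- teachers_list_df.csv contains an alpha_index column (the function drops it from the output).
- The batch_size argument passed to the function is an integer between 1 and the number of rows in teachers_list_df (a batch size of 0 would cause a division by zero).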

***Note: Once you choose a batch size, you want to avoid reassigning batches with a different batch size. If you do reassign with a different batch size, you can redo it with the original batch size and it will be the same as it was before.***

In [None]:
def assign_med_list_batches(batch_size):
    """Assign every teacher to a fixed-size batch and save the assignment.

    Reads '../data/teachers_list_df.csv', adds a 1-based 'med_list_batch_id'
    column (the first batch_size rows get 1, the next batch_size rows get 2,
    and so on), drops the 'alpha_index' column and writes the result to
    '../data/med_list_batches.csv'.

    Parameters
    ----------
    batch_size : int
        Number of teachers per batch. Must be a whole number >= 1.

    Raises
    ------
    ValueError
        If batch_size is not a positive whole number.
    AssertionError
        If the index of teachers_list_df.csv is not 0, 1, 2, ...
    """
    # Validate up front: 0 would otherwise fail with an opaque ZeroDivisionError and
    # a fractional size would silently produce uneven batches.
    try:
        rows_per_batch = int(batch_size)
    except (TypeError, ValueError, OverflowError):
        rows_per_batch = None
    if rows_per_batch is None or rows_per_batch != batch_size or rows_per_batch < 1:
        raise ValueError('batch_size must be a positive whole number, got ' + repr(batch_size))

    med_teachers_list_df = pd.read_csv('../data/teachers_list_df.csv', index_col=0)
    num_rows = med_teachers_list_df.shape[0]

    #Verify that the index values are as expected (0, 1, 2, etc.)
    assert med_teachers_list_df.index.to_list() == list(range(num_rows)), \
        'teachers_list_df.csv index must be 0, 1, 2, ...'

    print(num_rows,'total rows')
    print(batch_size,'rows per batch')
    num_batches = -(-num_rows // rows_per_batch)  #ceiling division
    print(num_batches,'batches')

    # The index was just verified to equal the row position, so the batch id can be
    # computed for all rows at once instead of writing one cell at a time.
    batch_ids = [position // rows_per_batch + 1 for position in range(num_rows)]

    med_list_batches_df = (
        med_teachers_list_df
        .assign(med_list_batch_id=batch_ids)
        .astype({'med_list_batch_id': 'int'})
        .drop(['alpha_index'], axis=1)
    )

    #Save batch results to data file
    med_list_batches_df.to_csv('../data/med_list_batches.csv')

In [None]:
# Number of teachers_list_df rows (teachers) placed in each batch.
batch_size = 100
# Uncomment the call below to (re)generate ../data/med_list_batches.csv.
# NOTE(review): presumably left disabled because the batches have already been assigned - confirm before re-running.
#assign_med_list_batches(batch_size)

In [None]:
# Seconds to pause so the page's JavaScript can render before driver.page_source is read.
_PAGE_RENDER_WAIT_SECONDS = 3  #Two seconds is usually but not always long enough.
_PAGE_RENDER_RETRY_WAIT_SECONDS = 5
_LOAD_MORE_WAIT_SECONDS = 2

# Column order of the per-batch output file.
_MED_LIST_COLUMNS = ['teacher_id',
                     'meditation_id',
                     'med_list_batch_id',
                     'med_list_scrape_date',
                     'med_list_scrape_status']


def _is_page_not_found(soup):
    """Return True when the parsed page shows the site's "page doesn't exist" heading."""
    h1_page_not_found_tag = soup.find('h1', attrs = {'class':'text-lg font-ProxiBold mb-6 leading-tight'})
    return (h1_page_not_found_tag is not None
            and h1_page_not_found_tag.text == "The page you were looking for doesn't exist.")


def _find_meditation_links(soup):
    """Return the meditation <a> tags in the parsed page ([] if the list container is absent)."""
    div_med_tag = soup.find('div', attrs = {'class':'css-1anl95l'})
    if div_med_tag is None:
        return []
    return div_med_tag.find_all('a', attrs = {'class':'chakra-link css-16pe414'})


def _med_list_row(teacher_id, scrape_batch_id, scrape_status, meditation_id=None):
    """Build one output record; meditation_id stays empty for the 'not found' statuses."""
    return {'teacher_id': teacher_id,
            'meditation_id': meditation_id,
            'med_list_batch_id': scrape_batch_id,
            'med_list_scrape_date': datetime.now(),
            'med_list_scrape_status': scrape_status}


def _scrape_teacher_med_list(teacher_id, scrape_batch_id):
    """Load one teacher's guided-meditations page and return its output records."""
    teacher_meditations_url = 'https://insighttimer.com/' + teacher_id + '/guided-meditations'

    # Selenium 4 takes the driver path through a Service object; the old
    # executable_path keyword is no longer accepted by current releases.
    driver = webdriver.Chrome(service=Service(chrome_driver_path))
    try:
        driver.get(teacher_meditations_url)

        #Wait for page to fully load. implicitly_wait() only sets the element lookup
        #timeout and does not pause, so sleep explicitly before reading the HTML.
        time.sleep(_PAGE_RENDER_WAIT_SECONDS)
        soup = BS(driver.page_source)

        if _is_page_not_found(soup):
            print('PAGE NOT FOUND for teacher_id =',teacher_id)
            return [_med_list_row(teacher_id, scrape_batch_id, 'page not found')]

        #Get meditations. If not found, wait longer, remake soup, and look again.
        a_med_tags = _find_meditation_links(soup)
        if len(a_med_tags) == 0:
            time.sleep(_PAGE_RENDER_RETRY_WAIT_SECONDS)
            a_med_tags = _find_meditation_links(BS(driver.page_source))

        if len(a_med_tags) > 0:
            #If load more button is present, press button repeatedly until it is no longer there.
            #Then get information on all of the meditations at once.
            while True:
                load_more_buttons = driver.find_elements(By.CSS_SELECTOR, 'button.css-1qsjogi')
                if len(load_more_buttons) == 0:
                    break
                driver.execute_script("arguments[0].click();", load_more_buttons[0])
                time.sleep(_LOAD_MORE_WAIT_SECONDS)
                print('- meditations count (load more) =',
                      len(_find_meditation_links(BS(driver.page_source))))

            a_med_tags = _find_meditation_links(BS(driver.page_source))
            print('- meditations count (final) =',len(a_med_tags))

        if len(a_med_tags) == 0:
            print('Meditations not found for teacher_id =',
                  teacher_id,
                  'in batch_id = ',
                  scrape_batch_id)
            return [_med_list_row(teacher_id, scrape_batch_id, 'meditations not found')]

        #href has two different formats
        #Example 1: '/guided-meditations/z7h5n1q9m9r6h6u9e6r1q8r3u0t9x2a5y1a6r7h5'
        #Example 2: '/andreawachter/guided-meditations/decrease-anxiety-and-increase-peace'
        return [_med_list_row(teacher_id,
                              scrape_batch_id,
                              'meditation found',
                              a_med_tag.get('href', default = '/no href'))
                for a_med_tag in a_med_tags]
    finally:
        #Always shut the browser and the chromedriver process down, even on errors.
        driver.quit()


def scrape_med_list_one_batch(scrape_batch_id):
    """Scrape the meditation list of every teacher in one batch and save it as a CSV.

    Requires the notebook-level names `med_list_batches` (DataFrame with
    'teacher_id' and 'med_list_batch_id' columns, e.g. loaded from
    '../data/med_list_batches.csv') and `chrome_driver_path` to be defined.

    For each teacher in the batch a Chrome window is opened on
    https://insighttimer.com/<teacher_id>/guided-meditations. One row is recorded
    per meditation link found ('meditation found'), or a single row with status
    'page not found' / 'meditations not found'. The rows are written to
    '../data/med_list_batch_files/med_list_batch_<id zero-padded to 5>.csv' and
    the batch runtime is printed.

    Parameters
    ----------
    scrape_batch_id : int
        Value of med_list_batch_id selecting the teachers to scrape.
    """
    med_list_batch_df = med_list_batches.loc[med_list_batches.med_list_batch_id == scrape_batch_id]
    rows_in_batch = med_list_batch_df.shape[0]
    print(rows_in_batch,'rows in batch')

    #zfill adds leading zeros which allows filenames to be sorted correctly alphabetically.
    batch_id_string = str(scrape_batch_id).zfill(5)
    batch_filename = '../data/med_list_batch_files/med_list_batch_' + batch_id_string + '.csv'
    #Create the output folder before scraping so a missing folder cannot discard a finished batch.
    os.makedirs(os.path.dirname(batch_filename), exist_ok=True)

    #Batch Start
    batch_start_time = datetime.now()

    #Collect plain records and build the dataframe once (concat inside the loop is quadratic).
    med_detail_records = []
    for batch_row, teacher_id in enumerate(med_list_batch_df.teacher_id, start=1):
        print('Batch row',batch_row,'of',rows_in_batch,': teacher_id =', teacher_id)
        med_detail_records.extend(_scrape_teacher_med_list(teacher_id, scrape_batch_id))

    #Save batch results to data file
    med_detail_batch_df = pd.DataFrame(med_detail_records, columns=_MED_LIST_COLUMNS)
    med_detail_batch_df.to_csv(batch_filename)

    #Batch End
    batch_end_time = datetime.now()

    #Print Runtime (total_seconds() so that runs longer than a day are not truncated)
    batch_runtime = batch_end_time - batch_start_time
    hours, remainder = divmod(int(batch_runtime.total_seconds()), 3600)
    minutes, seconds = divmod(remainder, 60)

    print('Batch_id',scrape_batch_id,'completed')
    print('Batch runtime:')

    if hours > 0:
        print(hours,'hours')
    if minutes > 0:
        print(minutes,'minutes')
    print(seconds,'seconds')

### Scrape Multiple Teacher Batches

In [None]:
def scrape_med_list_multiple_batches(batch_id_range_or_list):
    """Scrape several batches in sequence and report the total runtime.

    Loads '../data/med_list_batches.csv' into the notebook-level variable
    `med_list_batches` (read by scrape_med_list_one_batch), checks that its index
    is 0, 1, 2, ..., then calls scrape_med_list_one_batch for every batch id.

    Parameters
    ----------
    batch_id_range_or_list : range, list or tuple
        The batch ids to scrape, in the order they should be processed.

    Raises
    ------
    TypeError
        If batch_id_range_or_list is not a range, list or tuple.
    AssertionError
        If the index of med_list_batches.csv is not 0, 1, 2, ...
    """
    global med_list_batches

    # Fail immediately on a wrong argument instead of printing a message and
    # crashing later on an undefined variable.
    if not isinstance(batch_id_range_or_list, (range, list, tuple)):
        raise TypeError('batch_id_range_or_list must be a range, list or tuple of batch ids, got '
                        + type(batch_id_range_or_list).__name__)
    med_list_batch_id_list = list(batch_id_range_or_list)

    med_list_batches = pd.read_csv('../data/med_list_batches.csv', index_col=0)

    #Verify that the index values are as expected (0, 1, 2, etc.)
    assert med_list_batches.index.to_list() == list(range(med_list_batches.shape[0])), \
        'med_list_batches.csv index must be 0, 1, 2, ...'

    #Start Batches
    batches_start_time = datetime.now()
    batch_num = 0

    for batch_num, batch_id in enumerate(med_list_batch_id_list, start=1):
        print('-----------------------------------------')
        print('Starting batch',
              batch_num,
              'out of',
              len(med_list_batch_id_list),
              ', batch_id =',
              batch_id)
        scrape_med_list_one_batch(batch_id)
        print('Completed batch_id = ',batch_id)
        print('')

    #End Batches
    batches_end_time = datetime.now()

    #Print Runtime (total_seconds() so that runs longer than a day are not truncated)
    batches_runtime = batches_end_time - batches_start_time
    hours, remainder = divmod(int(batches_runtime.total_seconds()), 3600)
    minutes, seconds = divmod(remainder, 60)

    print('-----------------------------------------')
    print(batch_num,'batches completed with runtime:')
    if hours > 0:
        print(hours,'hours')
    if minutes > 0:
        print(minutes,'minutes')
    print(seconds,'seconds')
    print('-----------------------------------------')
    print('-----------------------------------------')

### Real Use

In [None]:
# Batch ids to scrape in this run: 117 through 139 (range() excludes the stop value).
batch_id_range = range(117, 140)
scrape_med_list_multiple_batches(batch_id_range)

### Example use of scrape_med_list_one_batch function

In [None]:
#med_list_batches = pd.read_csv('../data/med_list_batches.csv', index_col=0)

#Verify that the index values are as expected (0, 1, 2, etc.)
#index_list = med_list_batches.index.to_list()
#for x in list(range(0,med_list_batches.shape[0])):
#    assert index_list[x] == x

In [None]:
#scrape_batch_id = 5
#scrape_med_list_one_batch(scrape_batch_id)

In [None]:
#fresh_batch = pd.read_csv('../data/med_list_batch_files/med_list_batch_00001.csv', index_col=0)
#fresh_batch.head()