# Meditation Detail Scrape

In [None]:
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as BS
import pandas as pd
import string
from datetime import date
from datetime import datetime
import json

In [None]:
# Insight Timer webpages use javascript, so need selenium and chrome driver.
# Relative path to the chromedriver executable that selenium launches.
chrome_driver_path = '../../../../Tech/chrome_driver/chromedriver.exe'

# Chrome options shared by every driver: run headless (no visible browser window).
chrome_options = Options()
chrome_options.add_argument("--headless")

### Create Meditation Detail Batch File
This file serves two purposes. It stores the collected batched results of the meditation list scrape process. It is also used to assign meditations to batches for the meditation detail scrape process.

Records with status 'page not found' or 'meditations not found' are kept in this file as documentation of the results of the meditation list scrape process. These records will not be assigned to batches for the meditation detail scrape process, however.

In [None]:
def create_meditation_detail_batch_file():
    """Consolidate the meditation list batch files into one csv file.

    Reads every csv file in ../data/med_list_batch_files/, keeps the ones whose
    columns match the expected columns (same names, same order), concatenates
    them in filename order, and writes the result to
    ../data/med_detail_batch.csv with a fresh 0, 1, 2, ... index.

    The consolidated dataframe is also stored in the global ``meditations_df``
    and a summary of the consolidation is printed.

    Raises:
        ValueError: if no batch file with the expected columns is found.
    """
    global meditations_df

    expected_columns = ['teacher_id',
                        'meditation_id',
                        'med_list_batch_id',
                        'med_list_scrape_date',
                        'med_list_scrape_status'
                       ]
    batch_dir = '../data/med_list_batch_files/'

    # os.listdir returns names in arbitrary order. Sorting makes the row order
    # (and therefore any later index-based batch assignment) reproducible.
    # Entries that are not csv files (e.g. notebook checkpoint folders) are ignored.
    batch_files_list = sorted(name for name in os.listdir(batch_dir)
                              if name.lower().endswith('.csv'))

    # Dataframes that will be concatenated into one dataframe.
    batch_df_list = []

    for batch_file in batch_files_list:
        batch_df = pd.read_csv(os.path.join(batch_dir, batch_file), index_col=0)

        # Only keep batches that have the correct columns in the correct order.
        if list(batch_df.columns) == expected_columns:
            batch_df_list.append(batch_df)
        else:
            print('ERROR COLUMNS NOT AS EXPECTED',batch_file)

    if not batch_df_list:
        # pd.concat would raise a less helpful ValueError on an empty list.
        raise ValueError('No batch files with the expected columns found in ' + batch_dir)

    meditations_df = pd.concat(batch_df_list, ignore_index=True)

    meditations_df.to_csv('../data/med_detail_batch.csv')

    status = meditations_df.med_list_scrape_status
    found = status == 'meditation found'

    print ('Total number of batch files =', len(batch_files_list))
    print ('Batch files consolidated =', len(batch_df_list))
    print ('Batch files with column errors =', len(batch_files_list) - len(batch_df_list))
    print ('-----------------')
    print ('Total number of rows =', meditations_df.shape[0])
    print ('Number of meditations = ', int(found.sum()))
    print ('Number of teachers with no meditations = ', int((status == 'meditations not found').sum()))
    print ('Number of teacher pages not found = ', int((status == 'page not found').sum()))
    print ('-----------------')
    print ('Number of teachers with meditations =', len(meditations_df.loc[found].teacher_id.unique()))

In [None]:
#create_meditation_detail_batch_file()

### Function: Assign Batches

Assumptions/Requirements for function:
- The Meditation List Scraping notebook has been run successfully, creating csv batch files in the med_list_batch_files subdirectory.
- med_detail_batch.csv has numeric sequential index values starting with 0.
- The batch_size argument passed to the function is an integer value greater than 0 and no larger than the number of rows in med_detail_batch.csv.
- The following data subdirectory for meditation detail batch files exists: ..\data\med_detail_batch_files

***Note: Once you choose a batch size, you want to avoid reassigning batches with a different batch size. If you do reassign with a different batch size, you can redo it with the original batch size and it will be the same as it was before.***

***Note: Meditations from the same teacher may be assigned to different batches.***

In [None]:
def assign_batches(batch_size):
    """Assign each found meditation in med_detail_batch.csv to a detail-scrape batch.

    Reads ../data/med_detail_batch.csv, drops any prior batch assignment, and
    adds an integer column ``med_detail_batch_id``:

    - rows with med_list_scrape_status == 'meditation found' are numbered in
      file order and grouped into consecutive batches of ``batch_size`` rows,
      with batch ids starting at 1;
    - every other row gets the placeholder batch id -999.

    The result is written back to the same csv file, stored in the global
    ``med_detail_batch_df``, and a summary is printed.

    Args:
        batch_size: number of meditations per batch; must be at least 1.

    Raises:
        ValueError: if batch_size is less than 1, or if the index of the csv
            file is not 0, 1, 2, ... in order.
    """
    global med_detail_batch_df

    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    med_detail_batch_df = pd.read_csv('../data/med_detail_batch.csv', index_col=0)

    #Take subset of columns to drop any prior med detail batch assignment.
    #The copy makes the subset an independent dataframe that is safe to add a column to.
    med_detail_batch_df = med_detail_batch_df[['teacher_id',
                                               'meditation_id',
                                               'med_list_batch_id',
                                               'med_list_scrape_date',
                                               'med_list_scrape_status']].copy()

    #Verify that the index values of ALL rows are as expected (0, 1, 2, etc.)
    if med_detail_batch_df.index.to_list() != list(range(med_detail_batch_df.shape[0])):
        raise ValueError('med_detail_batch.csv index is not sequential starting at 0')

    found = med_detail_batch_df.med_list_scrape_status == 'meditation found'
    num_rows = int(found.sum())
    skipped_rows = int((~found).sum())

    #The running count of found rows gives each found row its 0-based position
    #among the found rows; integer division turns that position into a batch id.
    found_position = found.cumsum() - 1
    batch_ids = found_position // batch_size + 1
    med_detail_batch_df['med_detail_batch_id'] = batch_ids.where(found, -999).astype('int')

    #Save batch results to data file
    med_detail_batch_df.to_csv('../data/med_detail_batch.csv')

    print(num_rows + skipped_rows,'total rows')
    print(num_rows, 'batched rows')
    print(skipped_rows, 'skipped rows')
    print(batch_size,'rows per batch')
    num_batches = -(-num_rows // batch_size)  #ceiling division
    print(int(num_batches),'batches')
    

In [None]:
#assign_batches(150)

### Function: Scrape Meditations One Batch

Assumptions/Requirements for function:
- The assign_batches function has been run to create the med_detail_batch.csv file.
- Dataframe med_detail_batch has been created by reading in med_detail_batch.csv file.
- med_detail_batch has numeric sequential index values starting with 0.
- The scrape_batch_id argument passed to the function is an integer that matches a med_detail_batch_id value in med_detail_batch.

In [None]:
def _is_page_not_found(soup):
    """Return True when the parsed page shows the 'page doesn't exist' heading."""
    heading = soup.find('h1', attrs = {'class':'text-lg font-ProxiBold mb-6 leading-tight'})
    return heading is not None and heading.text == "The page you were looking for doesn't exist."


def _find_container_and_title(soup):
    """Return (container div, title h1) for a meditation page; either may be None."""
    #The container div is the parent for a number of tags with meditation attributes.
    container = soup.find('div', attrs = {'class':'MuiGrid-root MuiGrid-container MuiGrid-spacing-xs-4'})
    if container is None:
        return None, None
    title = container.find('h1', attrs = {'class':'font-ProxiBold sm:text-3xl2 text-2xl2 w-full leading-tight mb-2'})
    return container, title


def _load_meditation_soup(meditation_url, timeout=8, poll_interval=0.5):
    """Load the url in a new Chrome driver and return the parsed page source.

    The page is rendered by javascript, so the page source is re-parsed every
    poll_interval seconds until either the 'page not found' heading or the
    meditation title is present, or until timeout seconds have passed.
    The driver is always shut down, even when loading fails.
    """
    import time

    driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
    try:
        driver.get(meditation_url)
        deadline = time.monotonic() + timeout
        while True:
            soup = BS(driver.page_source)
            page_resolved = (_is_page_not_found(soup)
                             or _find_container_and_title(soup)[1] is not None)
            if page_resolved or time.monotonic() >= deadline:
                return soup
            time.sleep(poll_interval)
    finally:
        #quit() ends the browser AND the chromedriver process; close() only closes the window.
        driver.quit()


def _extract_container_fields(container):
    """Return a dict of the meditation attributes found under the container div."""
    fields = {}

    #Get track type
    type_tag = container.find('div', attrs = {'class':'font-ProxiSemibold text-xl capitalize flex flex-row items-center'})
    if type_tag is not None:
        fields['track_type'] = type_tag.text

    #Get Activity, Suitable for, and Plays (only trusted when exactly three tags are present)
    stat_tags = container.find_all('div', attrs = {'class':'font-ProxiSemibold text-xl'})
    if len(stat_tags) == 3:
        fields['activity'] = stat_tags[0].text
        fields['suitable_for'] = stat_tags[1].text
        fields['plays'] = stat_tags[2].text

    #Get description
    description_tag = container.find('p', attrs = {'class':'text-base sm:text-lg tracking-wide leading-relaxed font-ProxiRegular'})
    if description_tag is not None:
        fields['description'] = description_tag.text

    #Get topics as one comma separated string
    topics_tag = container.find('div', attrs = {'class':'MuiGrid-root MuiGrid-container MuiGrid-spacing-xs-1'})
    if topics_tag is not None:
        topic_tags = topics_tag.find_all('span', attrs = {'class':'MuiButton-label'})
        fields['topics'] = ','.join(topic_tag.text for topic_tag in topic_tags)

    return fields


def _extract_json_fields(soup):
    """Return a dict of the meditation attributes found in the page's ld+json script tag.

    Attributes missing from the json are left out of the result, except
    'reviews', which defaults to 0 when the json has no review count.
    """
    script_json_tag = soup.find('script', attrs = {'type':'application/ld+json',
                                                   'data-react-helmet':'true'})
    if script_json_tag is None:
        return {}

    try:
        graph = json.loads(script_json_tag.text)['@graph']
    except (ValueError, KeyError, TypeError):
        #Malformed json should not stop the rest of the batch.
        print('meditation json could not be read')
        return {}

    def graph_entry(position):
        #Return the dict at the given position of the graph, or {} if it is missing.
        try:
            entry = graph[position]
        except (IndexError, KeyError, TypeError):
            return {}
        return entry if isinstance(entry, dict) else {}

    track = graph_entry(2)
    aggregate_rating = graph_entry(0).get('aggregateRating')
    if not isinstance(aggregate_rating, dict):
        aggregate_rating = {}

    fields = {}
    for column, key in (('upload_date', 'uploadDate'), ('duration', 'duration')):
        if key in track:
            fields[column] = track[key]

    if 'ratingValue' in aggregate_rating:
        fields['rating'] = aggregate_rating['ratingValue']
    else:
        print('meditation has no rating')
    fields['reviews'] = aggregate_rating.get('reviewCount', 0)

    for column, key in (('meditation_url', 'contentUrl'), ('image_url', 'thumbnailUrl')):
        if key in track:
            fields[column] = track[key]

    #NOTE: the original code had a "Get Category" step here that only re-assigned
    #upload_date. No category is collected until the correct json key is known.
    return fields


def scrape_meditations_one_batch(scrape_batch_id):
    """Scrape the detail page of every meditation in one batch and save the results.

    Uses the global dataframe ``med_detail_batch`` (rows whose
    med_detail_batch_id equals scrape_batch_id) and the globals
    ``chrome_driver_path`` and ``chrome_options``. For every row the meditation
    page is loaded in a new headless Chrome driver and the scrape date, the
    scrape status ('page not found', 'title not found' or 'title found') and
    any attributes found are recorded.

    The results are written to
    ../data/med_detail_batch_files/med_detail_batch_<zero padded id>.csv
    and progress and runtime are printed.

    Args:
        scrape_batch_id: the med_detail_batch_id value of the batch to scrape.
    """
    #Select first, then copy, so the batch is an independent dataframe.
    meditations_batch = med_detail_batch.loc[med_detail_batch.med_detail_batch_id == scrape_batch_id].copy()
    print(meditations_batch.shape[0],'rows in batch')

    #Batch Start
    batch_start_time = datetime.now()
    prior_teacher_id = '----------'
    teacher_meditation_row = 0

    for batch_row, (index, row) in enumerate(meditations_batch.iterrows(), start=1):

        #Count the meditations of the current teacher (used in progress messages only).
        if row.teacher_id == prior_teacher_id:
            teacher_meditation_row += 1
        else:
            teacher_meditation_row = 1
            prior_teacher_id = row.teacher_id

        #Record scrape date (Rows will have scrape_date even if page not found.)
        meditations_batch.loc[index, 'med_detail_scrape_date'] = datetime.now()

        #All meditation_id values start with '/guided-meditations/'
        #Would have removed '/guided-meditations/' from meditation_id, but during testing some meditation
        #urls had a different format, and the following code worked for both formats.
        meditation_url = 'https://insighttimer.com' + row.meditation_id

        soup = _load_meditation_soup(meditation_url)

        if _is_page_not_found(soup):
            meditations_batch.loc[index, 'med_detail_scrape_status'] = 'page not found'
            print('Batch row',batch_row,'of',
                  meditations_batch.shape[0],
                  ': teacher_id =',
                  row.teacher_id,
                  'meditation #',
                  teacher_meditation_row)
            print('PAGE NOT FOUND for teacher_id =',
                  row.teacher_id,
                  'meditation #',
                  teacher_meditation_row,
                  'meditation_id',
                  row.meditation_id)
            continue

        div_container_tag, h1_title_tag = _find_container_and_title(soup)

        if h1_title_tag is None:
            meditations_batch.loc[index, 'med_detail_scrape_status'] = 'title not found'
            print('Batch row',batch_row,'of',
                  meditations_batch.shape[0],
                  ': teacher_id =',
                  row.teacher_id,
                  'meditation #',
                  teacher_meditation_row)
            print('MEDITATION TITLE NOT FOUND for teacher_id =',
                  row.teacher_id,
                  'meditation #',
                  teacher_meditation_row,
                  'meditation_id',
                  row.meditation_id)
        else:
            meditations_batch.loc[index, 'med_detail_scrape_status'] = 'title found'
            meditations_batch.loc[index, 'title'] = h1_title_tag.text

            print('Batch row',batch_row,'of',
                  meditations_batch.shape[0],
                  ': teacher',
                  row.teacher_id,
                  '-- med #',
                  teacher_meditation_row,
                  h1_title_tag.text)

        #The other attributes are collected even when the title was not found.
        scraped_fields = {}
        if div_container_tag is not None:
            scraped_fields.update(_extract_container_fields(div_container_tag))
        scraped_fields.update(_extract_json_fields(soup))

        for column, value in scraped_fields.items():
            meditations_batch.loc[index, column] = value

    #Save batch results to data file
    #zfill adds leading zeros which allows filenames to be sorted correctly alphabetically.
    batch_id_string = str(scrape_batch_id).zfill(5)
    batch_filename = '../data/med_detail_batch_files/med_detail_batch_' + batch_id_string + '.csv'
    meditations_batch.to_csv(batch_filename)

    #Batch End
    batch_end_time = datetime.now()

    #Print Runtime
    batch_runtime = batch_end_time - batch_start_time
    hours, remainder = divmod(int(batch_runtime.total_seconds()), 3600)
    minutes, seconds = divmod(remainder, 60)

    print('-----------------------------------------')
    print('BATCH #',scrape_batch_id,'COMPLETED')
    print('Batch runtime:')

    if hours > 0:
        print(hours,'hours')
    if minutes > 0:
        print(minutes,'minutes')
    print(seconds,'seconds')
    print('-----------------------------------------')
    print('')
    print('')

Meditation Columns
- title
- length
- plays
- rating
- num_reviews
- upload_date
- type
- activity
- suitable_for
- topics
- description
- image

### Example use of scrape_meditations_one_batch function

In [None]:
#med_detail_batch = pd.read_csv('../data/med_detail_batch.csv', index_col=0)

#Verify that the index values are as expected (0, 1, 2, etc.)
#index_list = med_detail_batch.index.to_list()
#for x in list(range(0,med_detail_batch.shape[0])):
#    assert index_list[x] == x

In [None]:
#scrape_batch_id = 40
#scrape_meditations_one_batch(scrape_batch_id)

### Scrape Multiple Meditation Batches

In [None]:
def scrape_meditations_multiple_batches(batch_id_range_or_list):
    """Scrape several meditation detail batches, one after the other.

    Reads ../data/med_detail_batch.csv into the global dataframe
    ``med_detail_batch``, adds the (empty) result columns in a standard order,
    and calls scrape_meditations_one_batch for every batch id given.
    Progress and total runtime are printed.

    Args:
        batch_id_range_or_list: a range, list or tuple of med_detail_batch_id
            values to scrape.

    Raises:
        TypeError: if batch_id_range_or_list is not a range, list or tuple.
        ValueError: if the index of med_detail_batch.csv is not 0, 1, 2, ...
    """
    global med_detail_batch

    #Fail before any work is done instead of printing and failing later.
    if not isinstance(batch_id_range_or_list, (range, list, tuple)):
        raise TypeError('ERROR Wrong datatype: expected a range or list of batch ids')
    batch_id_list = list(batch_id_range_or_list)

    med_detail_batch = pd.read_csv('../data/med_detail_batch.csv', index_col=0)

    #Verify that the index values are as expected (0, 1, 2, etc.)
    if med_detail_batch.index.to_list() != list(range(med_detail_batch.shape[0])):
        raise ValueError('med_detail_batch.csv index is not sequential starting at 0')

    #Add empty columns that will be filled by scraping meditation pages.
    #Adding them here to set a standard order of columns.
    result_columns = ['med_detail_scrape_date',
                      'med_detail_scrape_status',
                      'title',
                      'upload_date',
                      'duration',
                      'plays',
                      'rating',
                      'reviews',
                      'track_type',
                      'activity',
                      'suitable_for',
                      'topics',
                      'description',
                      'meditation_url',
                      'image_url']
    for column in result_columns:
        med_detail_batch[column] = None

    #Start Batches
    batches_start_time = datetime.now()
    batch_num = 0

    for batch_num, batch_id in enumerate(batch_id_list, start=1):
        print('Starting batch',
              batch_num,
              'out of',
              len(batch_id_list),
              '-- BATCH #',
              batch_id)
        scrape_meditations_one_batch(batch_id)

    #End Batches
    batches_end_time = datetime.now()

    #Print Runtime
    batches_runtime = batches_end_time - batches_start_time
    hours, remainder = divmod(int(batches_runtime.total_seconds()), 3600)
    minutes, seconds = divmod(remainder, 60)

    print(batch_num,'batches completed with runtime:')
    if hours > 0:
        print(hours,'hours')
    if minutes > 0:
        print(minutes,'minutes')
    print(seconds,'seconds')

In [None]:
# Scrape meditation detail batches 500 through 549 (the end of a range is exclusive).
batch_id_range = range(500, 550)
scrape_meditations_multiple_batches(batch_id_range)