# WebScraping
## Putting it all together
We're now ready to connect the work we put into gathering thread data with the text extraction.


### Imports and Functions

In [None]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import pandas as pd
from random import randint
from time import sleep
from datetime import datetime

In [None]:
from random import choice
# Load the pool of User-Agent strings (one per line in user_agent.txt).
with open('user_agent.txt', 'r') as f:
    # Strip the surrounding whitespace/newlines and skip blank lines, so a
    # stray empty line in the file can never end up as an empty User-Agent.
    agents = [line.strip() for line in f if line.strip()]

## Putting it all together

### Functions
#### Thread Collection

In [None]:
def row_info_extractor(row):  # We'll feed it the isolated html for a row and let it pull it apart.
    """Extract the metadata of one thread from its row in a forum listing page.

    Args:
        row: Parsed HTML element for a single thread row. It is expected to
            carry a ``data-author`` attribute, a ``class`` list whose last
            entry ends in ``-<thread id>``, a ``div`` with class
            ``structItem-title`` wrapping the thread link, and a ``time``
            element with a ``datetime`` attribute.

    Returns:
        dict: ``author`` (str), ``title`` (str), ``thread_id`` (int),
        ``date`` (timezone-aware ``datetime``) and ``url`` (absolute thread URL).
    """
    author = row['data-author']

    # The numeric thread id is the suffix of the row's last CSS class.
    id_item = row['class'][-1]
    thread_id = int(id_item.split('-')[-1])

    title_div = row.find('div', class_='structItem-title')
    title = title_div.a.text.strip()  # remember to .strip() off the useless spaces on the ends.

    date_format = '%Y-%m-%dT%H:%M:%S%z'
    date_string = row.find('time')['datetime']
    date = datetime.strptime(date_string, date_format)

    # Thread links in the listing are relative. Resolve them against the same
    # https origin the listing pages are fetched from, so the stored URLs do
    # not point at the plain-http site.
    relative_url = title_div.a['href']
    full_url = urllib.parse.urljoin('https://uberpeople.net', relative_url)

    return {
        'author': author,
        'title': title,
        'thread_id': thread_id,
        'date': date,
        'url': full_url,
    }

def page_info_extractor(response):
    """Collect the metadata of every thread listed on one forum page.

    Args:
        response: HTTP response whose ``.text`` is the HTML of a listing page.

    Returns:
        list: One ``row_info_extractor`` result per thread row, in page order.
    """
    listing = BeautifulSoup(response.text, 'lxml').find(
        'div', class_="structItemContainer-group js-threadList")
    # Only rows that carry a data-author attribute are treated as threads.
    thread_rows = listing.find_all(
        'div', {'class': 'structItem--thread', 'data-author': True})
    return [row_info_extractor(thread_row) for thread_row in thread_rows]

#### Post Collection

In [None]:
def text_extractor(post):
    """Return the text a post's author wrote, with quoted material removed.

    Args:
        post: Parsed HTML element for one post; its content is looked up in
            the ``article`` element with class ``message-body``. Quote blocks
            are removed from this element in place.

    Returns:
        str: The stripped text of the message body once every
        ``bbCodeBlock--quote`` blockquote has been taken out.
    """
    post_content = post.find('article', {'class': 'message-body'})

    quote_selector = {'class': 'bbCodeBlock--quote'}
    # A post may quote several earlier posts, so keep removing quote blocks
    # until none are left. Searching again after each removal (rather than
    # collecting them all up front) means a quote nested inside another quote
    # is never touched after its parent has already been destroyed.
    post_quote = post_content.find('blockquote', quote_selector)
    while post_quote is not None:
        post_quote.decompose()
        post_quote = post_content.find('blockquote', quote_selector)

    return post_content.text.strip()

def posts_extractor(response):
    """Extract the text of every post on one page of a thread.

    Args:
        response: HTTP response whose ``.text`` is the HTML of a thread page.

    Returns:
        list: One ``text_extractor`` result per post, in page order.
    """
    page = BeautifulSoup(response.text, 'lxml')
    container = page.find('div', {'class': 'block-body js-replyNewMessageContainer'})
    return [
        text_extractor(message)
        for message in container.find_all('article', {'class': 'message'})
    ]

def next_page(response):
    """Return the absolute URL of the following page, or None on the last page.

    Args:
        response: HTTP response for the current page; its ``.url`` is the
            base against which the "next" link is resolved.
    """
    soup = BeautifulSoup(response.text, 'lxml')
    next_link = soup.find('a', {'class': 'pageNav-jump--next'})
    if next_link is None:
        # No "next" link in the page navigation: nothing further to fetch.
        return None
    return urllib.parse.urljoin(response.url, next_link['href'])

def multi_page_post_extractor(url, timeout=30):
    """Download every page of a thread and return all of its post text.

    Starting from ``url``, each page is fetched, its posts are extracted with
    ``posts_extractor``, and the "next page" link is followed until there is
    none left.

    Args:
        url: URL of the first page of the thread.
        timeout: Seconds to wait on each request before giving up, so a
            stalled connection cannot hang the whole collection run.

    Returns:
        str: The text of all posts in the thread, separated by blank lines.

    Raises:
        requests.HTTPError: If any page answers with an HTTP error status.
        requests.Timeout: If a page does not respond within ``timeout``.
    """
    thread_text_data = []

    while True:
        # use the url variable currently in memory
        response = requests.get(
            url,
            headers={'user-agent': choice(agents)},
            timeout=timeout,
        )
        # Fail on 4xx/5xx here instead of trying to parse an error page as a thread.
        response.raise_for_status()
        print(response.url)

        thread_text_data.extend(posts_extractor(response))

        url = next_page(response)
        if url is None:
            break

    return '\n\n'.join(thread_text_data)

### Stage 1: Gathering Thread Urls
First we establish our list of threads. By establishing this before text extraction (which can be request heavy due to multiple pages) we can keep track of our data collection more easily.

In [None]:
max_page = 2  # number of listing pages to walk through
data = []     # one dict per thread, accumulated across all pages
for page_no in range(1, max_page+1):
    print(f'Now retrieving page {page_no}')

    url = f'https://uberpeople.net/forums/Tips/page-{page_no}'

    # A timeout keeps a stalled connection from hanging the cell forever, and
    # raise_for_status stops us from parsing an HTTP error page as a listing.
    response = requests.get(url, headers={'user-agent': choice(agents)}, timeout=30)
    response.raise_for_status()
    page_data = page_info_extractor(response)

    data.extend(page_data)

    # Pause between requests only; there is nothing to wait for after the last page.
    if page_no < max_page:
        wait_time = randint(2, 8)  # randomly select an integer between 2 and 8
        print(f'Waiting {wait_time} seconds...')

        sleep(wait_time)
print('Finished!')

In [None]:
# Turn the list of per-thread dicts gathered in Stage 1 into a table (one row per thread).
# NOTE: this writes 'my_scraping.csv' from scratch, so re-running this cell after
# Stage 2 replaces the file, including any 'text' column saved there.
df = pd.DataFrame(data)
df.to_csv('my_scraping.csv', index=False) # index false so it doesn't record the index as its own column in the csv

### Stage 2: Gathering Thread Text
In the cell below we do a number of things designed to allow you to gather data in chunks simply by re-running the cell.
1. Load in the data, check if there is a `text` column, if not, make one and fill it with nans! (Should only run once)
2. Set how many rows we'll collect in this run, and then select that many rows that don't have text, from the dataframe
3. The try/finally block means that it will `try` to do whatever is inside the block. `finally`, whether there is an error or not, it will save the updated `df` to disk to avoid data loss.
4. We iterate over the rows in our sample using `.iterrows()` which spits out an identifying `row_number` and the `row_data` itself.
5. As the sample was made from the original df, they have the same index numbers for each row. We take the `sample`'s `row_number`, find that same row in the original `df` using `.loc` and then target the `text` column. We then assign the result of our `multi_page_post_extractor` to that cell in the `df`.
6. We then wait a random amount of seconds before the loop starts again on the next row of the sample.
7. `finally` (see point 3) once the loop is completed we save the updated `df` to disk. This means when the cell is run again, it will load the updated df, skip rows that already have text, and only collect for rows with missing text.

In [None]:
#1
df = pd.read_csv('my_scraping.csv')
if 'text' not in df:
    from numpy import nan
    df['text'] = nan
# A 'text' column holding nothing but NaN is float64, both when it was just
# created above and when it is read back from a CSV in which no text has been
# saved yet. Writing a string into a float column relies on implicit
# upcasting, which recent pandas versions deprecate, so hold the column as
# generic Python objects. Missing cells stay NaN and .isna() still finds them.
df['text'] = df['text'].astype(object)

#2
chunk_size = 3

sample = df[df['text'].isna()].head(chunk_size) # our sample is set as the first rows where 'text' is empty

#3
try:
    #4
    for row_number, row_data in sample.iterrows():
        thread_url = row_data['url']

        #5
        # sample shares df's index labels, so row_number addresses the same row in df.
        df.loc[row_number, 'text'] = multi_page_post_extractor(thread_url)

        #6
        wait_time = randint(1, 3)
        print(f'Waiting {wait_time} seconds...')
        sleep(wait_time)
#7
finally:
    # Runs whether or not the loop raised, so text collected so far is never lost.
    df.to_csv('my_scraping.csv', index=False)
print('**DONE!!**')

We can check to see how many rows are complete in our df using the helpful readouts below.

In [None]:
# Count the rows whose 'text' cell has been filled in, then report progress
# both as a count and as a percentage of all rows.
num_text_filled_rows = int(df['text'].notna().sum())
print(f'We have collected text for {num_text_filled_rows} of {len(df)} rows in this DataFrame')
print(f'This accounts for {(num_text_filled_rows / len(df))*100}% of the rows.')

Run the above again to see how it adds to the data rather than repeats itself.

In [None]:
# Preview the first few rows of the DataFrame to eyeball the collected data.
df.head()