# WebScraping
## 6. Joining Thread Data and Text
We're now ready to connect together the work put in gathering thread data from last week, and the text extraction from this week.




### Imports and Functions

In [None]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import pandas as pd
from random import randint
from time import sleep

#### Thread functions

In [None]:
def row_info_extractor(row):
    """Pull the thread details out of the isolated html for one thread row.

    `row` is the parsed html element for a single thread listing.
    Returns a dict with the keys 'id', 'author', 'title', 'date', 'views'
    and 'url'. 'date' and 'views' are returned as the raw strings found in
    the html; 'id' is converted to an int.
    """
    author = row['data-author']

    # The last css class on the row ends in the numeric thread id,
    # so keep whatever follows the final '-'.
    thread_id = int(row['class'][-1].rsplit('-', 1)[-1])

    # The link inside the title div carries both the title text and the href.
    title_link = row.find('div', class_='structItem-title').a
    title = title_link.text.strip()  # drop the padding whitespace around the title

    posted = row.find('time')['datetime']

    view_count = row.find('dl', class_='pairs pairs--justified structItem-minor').dd.text

    # hrefs on the page are relative, so resolve them against the site root.
    thread_url = urllib.parse.urljoin('http://uberpeople.net', title_link['href'])

    return dict(
        id=thread_id,
        author=author,
        title=title,
        date=posted,
        views=view_count,
        url=thread_url,
    )

def page_info_extractor(response):
    """Return one data dict per thread row found on a forum listing page.

    `response` is the requests response for the listing page; each thread row
    inside the 'structItemContainer' div is handed to row_info_extractor().
    """
    parsed = BeautifulSoup(response.text, 'lxml')
    container = parsed.find('div', class_="structItemContainer")
    thread_rows = container.find_all('div', class_='structItem--thread')
    return [row_info_extractor(thread_row) for thread_row in thread_rows]

#### Post Functions

In [None]:
def text_extractor(post):
    """Return the text of one post with any quoted replies removed.

    `post` is the parsed html element for a single post. The quote blocks are
    removed from the parsed tree itself (via .decompose()), so `post` is
    modified by this call.

    Returns the stripped body text, or an empty string when the post has no
    'message-body' article to read.
    """
    post_content = post.find('article', class_='message-body')
    if post_content is None:
        # .find() gives None when nothing matches; without this guard a post
        # lacking a body would raise AttributeError and abort the whole thread.
        return ''

    # .find_all() always returns a list (empty when nothing matches), so we can
    # loop over it directly without checking for None first.
    for quote in post_content.find_all('blockquote', class_='bbCodeBlock--quote'):
        quote.decompose()
    return post_content.text.strip()

def page_posts_extractor(response):
    """Return a list with the extracted text of every post on one thread page.

    `response` is the requests response for the thread page; each 'message'
    article inside the 'p-body-content' div is passed to text_extractor().
    """
    parsed = BeautifulSoup(response.text, 'lxml')
    body = parsed.find('div', class_='p-body-content')
    messages = body.find_all('article', class_='message')
    return [text_extractor(message) for message in messages]

def get_next_url(response):
    """Return the href of the page's 'next' button, or None if there isn't one.

    `response` is the requests response for a thread page. The href is returned
    exactly as it appears in the html (it may be relative), so the caller is
    responsible for joining it onto a base url.
    """
    soup = BeautifulSoup(response.text, 'lxml')
    next_button = soup.find('a', class_='pageNav-jump--next')
    if next_button is None:  # identity check is the correct way to test for None
        return None
    return next_button['href']

def thread_post_extractor(url, timeout=30):
    """Collect the text of every post in a thread, following the 'next' links.

    `url` is the address of the first page of the thread.
    `timeout` is the number of seconds to wait on each request before giving up
    (passed straight to requests.get).

    Returns one string containing every post's text, with posts separated by a
    line of four asterisks.

    Raises requests.HTTPError if any page comes back with an error status, and
    whatever requests raises on a timeout or connection failure.
    """
    thread_text_data = []

    while True:
        # Without a timeout a stalled connection would hang the notebook forever.
        response = requests.get(url, timeout=timeout)
        # Stop on error pages (e.g. rate limiting) rather than scraping them and
        # recording the wrong text against this thread.
        response.raise_for_status()
        print(response.url)

        thread_text_data.extend(page_posts_extractor(response))

        next_url = get_next_url(response)
        if next_url is None:  # no next button means this was the last page
            break

        # Resolve the (possibly relative) href against the page we were actually
        # served, so any redirect the site applied is respected on later pages.
        url = urllib.parse.urljoin(response.url, next_url)

    return '\n\n****\n\n'.join(thread_text_data)

#### DF Functions


In [None]:
# FIXED MISSING VALUES!!
def view_fixer(view_string):
    """Convert a view-count string into an integer.

    Handles plain numbers ('123'), thousands separators ('1,234'), the 'K' and
    'M' shorthand with or without a decimal part ('2K', '1.5K', '3M') and the
    en dash placeholder ('–'), which is treated as zero views.

    Raises ValueError for anything else that cannot be read as a number.
    """
    cleaned = view_string.strip().replace(',', '')

    if cleaned == '–':  # placeholder shown when there is no view count
        return 0

    multipliers = {'K': 1_000, 'M': 1_000_000}
    suffix = cleaned[-1:].upper()  # slice (not index) so '' falls through to int()
    if suffix in multipliers:
        # round() rather than int() so float error (2.3 * 1000 = 2299.99...)
        # doesn't knock one off the result.
        return round(float(cleaned[:-1]) * multipliers[suffix])

    return int(cleaned)

### 1. Loading in our Thread Data

### 1a. If we don't have much time!
Load in the DataFrame we created last week. (We'll sample three random rows from it using `.sample()` a little further down.)

In [None]:
# Reload the threads dataframe from the pickle file saved on disk.
# NOTE: only unpickle files you created yourself - loading a pickle can run arbitrary code.
df = pd.read_pickle('my_uber_df.pkl')

### ...now skip to part 2

### 1b. If we have time - Let's recap from the beginning

In [None]:

# Gather the thread listings from the first `max_page` pages of the Tips forum
# and turn them into a dataframe with proper date and integer view columns.
max_page = 3
data = []
for page_no in range(1, max_page + 1):
    print(f'Now retrieving page {page_no}')

    url = f'https://uberpeople.net/forums/Tips/page-{page_no}'

    # A timeout stops a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on an error page instead of trying to parse it as a listing.
    response.raise_for_status()
    page_data = page_info_extractor(response)

    data.extend(page_data)

    # Pause between requests, but there is nothing to wait for after the final page.
    if page_no < max_page:
        wait_time = randint(2, 8)  # randomly select an integer between 2 and 8
        print(f'Waiting {wait_time} seconds...')

        sleep(wait_time)
print('Finished!')

df = pd.DataFrame(data)
df['date'] = pd.to_datetime(df['date'])  # datetime strings -> pandas datetimes
df['views'] = df['views'].apply(view_fixer)  # view-count strings -> integers

In [None]:
# Preview the first few rows to check the thread data looks right.
df.head()

In [None]:
# Save the threads dataframe to disk so it can be reloaded later
# with pd.read_pickle('my_uber_df.pkl').

df.to_pickle('my_uber_df.pkl')

# 2. Gathering Post Text and Linking it

In [None]:
# Let's take one row of our dataframe to work with.
# .loc looks the row up by its index label (here the label 0).

first_row_index = 0
first_row = df.loc[first_row_index]
first_row

The basic functionality of our script will be...

- Take a row from our df of threads
- Get the url from that row
- Feed that url to our `thread_post_extractor()` function to get the whole text from the thread.
- Assign that set of text as the value in a 'text' column, at that particular row position in our dataframe
- Repeat

In [None]:
# To demonstrate the process on a single row

# get the thread's url out of the row
thread_url = first_row['url']

# download every page of the thread and get all of its post text back as one string
thread_text = thread_post_extractor(thread_url)

In [None]:
# Assign thread_text to the 'text' column at that row position.
# We don't have a 'text' column yet, but assigning a value to it creates the column.
# Note that we assign into df (our original dataframe), not into first_row.

df.loc[first_row_index,'text'] = thread_text

In [None]:
# Preview the first few rows again to see the new 'text' column.
df.head()

### Pandas .iterrows()
`iterrows` meaning 'iterate over rows'.

Although it is generally faster to use broadcasting than to loop over dataframe rows, looping is sometimes useful. We do this with `.iterrows()`, which on every pass of the loop returns two things:
- the value of the index i.e. the row number 
- the rest of the row data

So rather than having...

```
for item in our_iterable:
    do something with item.
```

We have

```
for index_value, rest_of_row_data in our_iterable:
    do something with our index_value
    do something else with rest_of_row_data
```

In [None]:
# To demonstrate, let's create a random sample of 3 rows.
# The sampled rows keep the index labels they have in df.

sample = df.sample(3)
sample

In [None]:


# Each pass of the loop hands us the row's index label and the row's data;
# print a few fields from each, followed by a blank line.
for row_index, row_data in sample.iterrows():
    print(
        f"This is the row index: {row_index}",
        f"This is the url: {row_data['url']}",
        f"This is the title: {row_data['title']}",
        f"This is the author: {row_data['author']}",
        "",
        sep="\n",
    )

In [None]:
# We can use the same iteration to automate our text collection, one sampled row at a time.



for row_number, row_data in sample.iterrows():
    thread_url = row_data['url']
    
    # Remember we want to assign the text back to the original dataframe (df), not the sample.
    # row_number is the row's index label, so it points at the same row in df.
    
    df.loc[row_number,'text'] = thread_post_extractor(thread_url)
    # Pause for a random 1 to 3 seconds before moving on to the next thread.
    wait_time = randint(1,3)
    print(f'Waiting {wait_time} seconds...')
    sleep(wait_time)

In [None]:
# The summary printed by .info() lets us check how many rows of our dataframe have text data.

df.info()

In [None]:
# .notna() (the reverse of .isna()) gives True for every row whose 'text' is filled in;
# using that as a filter shows only the rows that already have text.

has_text_filter = df['text'].notna()
df[has_text_filter]


In [None]:
# .isna() gives True for every row whose 'text' is still missing;
# using that as a filter shows the rows that still need text.

without_text_filter = df['text'].isna()
df[without_text_filter]

# 3. Fully Automated Luxury Webscraping

We can use these filters, alongside saving and loading to disk, to build ourselves a semi-automated controlled system. First, above, you would gather your threads data to give yourself a list of threads which then need filling in with text. We chose to gather 3 pages worth, which gave us a total of 60 threads.

Our semi-automated system should go like this....

1. Load our threads dataframe from disk
2. Define how many rows we're going to gather text for in this session
3.
  - `If` our dataframe doesn't yet have a column called 'text' then just grab the top however many rows we're collecting as a sample.
  - `Else` we want to filter out rows that already have text, and then take the first however many rows we're collecting as a sample.
 
4. We iterate over the sample rows collecting the text, and then assigning it to the 'text' column  at that row position in our original dataframe (not the sample).
5. When we are done we save the original dataframe back to disk, ready to repeat at our leisure.

In [None]:
# Step 1: reload whatever progress has been saved to disk
df = pd.read_pickle('my_uber_df.pkl')

# Step 2: how many threads will we collect text for this session?
chunk_size = 3

# Step 3: choose the rows to work on.
# If no text has been collected yet there is nothing to filter, so just take the top rows.
# Otherwise only consider rows whose text is still missing
# (otherwise we'd gather the same text over and over).
if 'text' not in df:
    sample = df.head(chunk_size)
else:
    filtered = df.loc[df['text'].isna()]
    sample = filtered.head(chunk_size)

# Step 4: collect the text row by row, always writing into the original df
# (not the sample) so the results are kept.
# Step 5: the finally block saves df to disk even if a request fails part-way,
# so text collected before the failure is not lost.
try:
    for row_number, row_data in sample.iterrows():
        thread_url = row_data['url']
        df.loc[row_number, 'text'] = thread_post_extractor(thread_url)

        wait_time = randint(1, 3)
        print(f'Waiting {wait_time} seconds...')
        sleep(wait_time)
finally:
    df.to_pickle('my_uber_df.pkl')
print('**DONE!!**')

In [None]:
# Count the rows whose 'text' is filled in (True counts as 1 when summed)
# and report progress against the total number of rows.
has_text = df['text'].notna()
num_text_filled_rows = int(has_text.sum())
print(f'We have collected text for {num_text_filled_rows} of {len(df)} rows in this DataFrame')

Run the above again to see how it adds to the data rather than repeats itself.

# Exporting
If you want to use the text information in a different software package such as NVivo or MaxQDA then you just need to export as a CSV or an excel file.

In [None]:
# Write the whole dataframe, text included, out to a CSV file.
df.to_csv('my_uber.csv')

Clear timezone data so you can export to excel for nvivo


In [None]:
# Excel doesn't like timezones! Drop the timezone from every date, one value at a time.
# .replace(tzinfo=None) keeps the clock time as written and only removes the timezone.
df['date'] = df['date'].apply(lambda x: x.replace(tzinfo=None))

In [None]:
# Write the dataframe out to an Excel workbook (run the timezone-clearing cell above first).
df.to_excel('my_uber.xlsx')