In [1]:
import time
import pandas as pd
from bs4 import BeautifulSoup
import requests
from concurrent.futures import ThreadPoolExecutor
import itertools

In [2]:
def get_urls():
    """ Historic list of urls we're scraping from as well as the current one. Needs to be changed when a new thread
    is made"""

    url_list = [
        'https://www.mumsnet.com/talk/am_i_being_unreasonable/4676538-if-you-like-wordle-plusword-is-even-better-thread-4?page=',
        #'https://www.mumsnet.com/talk/_chat/4714295-plusword-new-thread-1?page=',
        #'https://www.mumsnet.com/talk/_chat/4765702-plusword-new-thread-2?page='
    ]

    return url_list

In [3]:
def url_generator():
    
    url_list = []

    thread_list = [
        #'https://www.mumsnet.com/talk/am_i_being_unreasonable/4676538-if-you-like-wordle-plusword-is-even-better-thread-4?page=',
        #'https://www.mumsnet.com/talk/_chat/4714295-plusword-new-thread-1?page=',
        'https://www.mumsnet.com/talk/_chat/4765702-plusword-new-thread-2?page=1']

    for thread in thread_list:
        for page_number in range(1,41):
            page_url = thread + str(page_number)
            url_list.append(page_url)
    
    return url_list

In [4]:
def post_to_text_converter(post,
                           #whole_post_list
                          ):
    
    # converts to list and removes whitespace
    post_text = post.getText().split()
    
    # separates out meta data and post body
    meta_data = post_text[:4]
    post_body = post_text[4:]
    
    # removes fullstop from meta data
    meta_data.pop(1)
    
    # converts whole of post body to one string
    post_body = ' '.join(post_body)
    meta_data.append(post_body)
    whole_post = meta_data
    #whole_post_list.append(whole_post)
    
    return whole_post

In [5]:
def original_scraper(url_list):
    """ Loops over all the different types of posts on the mumsnet website, accessing the text values. Then appends
    them to a list which is finally converted to a dataframe and returned"""

    whole_post_list = []

    # maximum number of pages in thread
    max_pages = 41

    # html class of original post from the thread
    first_post_class = 'p-4 pb-1 pt-2.5 lg:py-2.5 mt-2.5 lg:mt-1.5 border-t border-b sm:border sm:rounded ' \
                       'border-mumsnet-forest-border bg-mumsnet-forest dark:bg-mumsnet-forest-dark'

    # html class of a normal post from the thread
    normal_reply_class = 'lg:py-2.5 pt-2.5 pb-1 p-4 border-t border-b sm:border sm:rounded mt-1.5 overflow-x-hidden ' \
                         'bg-white dark:bg-gray-800 border-gray-200'

    # html class of a post from the thread creator
    original_poster_reply_class = 'lg:py-2.5 pt-2.5 pb-1 p-4 border-t border-b sm:border sm:rounded mt-1.5 ' \
                                  'overflow-x-hidden bg-mumsnet-forest dark:bg-mumsnet-forest-dark ' \
                                  'border-mumsnet-forest-border'

    for url in url_list:

        # Increments through every page on website until it runs out for hits max_pages
        for page_number in range(max_pages):

            try:

                # gets request via bs4
                r = requests.get(url + str(page_number))
                soup = BeautifulSoup(r.content, features="html5lib")
                
                # Finds original post on first page and splits it into metadata and post text
                original_post = soup.find_all('div', class_=first_post_class)
                original_post_paragraphs = original_post[0].find_all('p')
                original_post = soup.find_all('div', class_=original_post_class)
                original_post = original_post[0].find_all('div', class_='')

                # converts to list
                meta_data = original_post_paragraphs[0].getText().split()

                # removes fullstops in position 1
                meta_data.pop(1)

                # converts text to list and then joins items together
                post_text = original_post_paragraphs[1].getText().split()
                post_text = ' '.join(post_text)

                # Adds OP metadata and text together and adds together for OP on every page
                meta_data.append(post_text)
                whole_post = meta_data
                whole_post_list.append(whole_post)

                # finds all non-OP post on page and gets data
                posts = soup.find_all('div', class_=[normal_reply_class, original_poster_reply_class])

                for post in posts:
                    post_info = post.getText().split()

                    # first 4 items are meta data
                    meta_data = post_info[:4]

                    # removes unneeded full stop
                    meta_data.pop(1)

                    # joins post text together
                    post_text = post_info[4:]
                    post_text = ' '.join(post_text)

                    # appends metadata and text together and adds to list
                    meta_data.append(post_text)
                    whole_post = meta_data
                    whole_post_list.append(whole_post)

            except Exception as e:
                print(e)
            pass

    df = pd.DataFrame(whole_post_list, columns=['user', 'date', 'time', 'text'])

    return df

In [6]:
def modified_scraper(url):

    whole_post_list = []

    # maximum number of pages in thread
    max_pages = 41

    # html class of original post from the thread
    original_post_class = 'p-4 pb-1 pt-2.5 lg:py-2.5 mt-2.5 lg:mt-1.5 border-t border-b sm:border sm:rounded ' \
                       'border-mumsnet-forest-border bg-mumsnet-forest dark:bg-mumsnet-forest-dark'

    # html class of a normal post from the thread
    normal_post_class = 'lg:py-2.5 pt-2.5 pb-1 p-4 border-t border-b sm:border sm:rounded mt-1.5 overflow-x-hidden ' \
                         'bg-white dark:bg-gray-800 border-gray-200'

    # html class of a post from the thread creator
    original_poster_reply_class = 'lg:py-2.5 pt-2.5 pb-1 p-4 border-t border-b sm:border sm:rounded mt-1.5 ' \
                                  'overflow-x-hidden bg-mumsnet-forest dark:bg-mumsnet-forest-dark ' \
                                  'border-mumsnet-forest-border'

    # gets request via bs4
    
    r = requests.get(url)
    soup = BeautifulSoup(r.content, features="html5lib")

    try:
        if url[-1] == str(1) :

            # Finds original post on first page and splits it into metadata and post text
            original_post = soup.find_all('div', class_=original_post_class)
            original_post = original_post[0].find_all('div', class_='')

            whole_post=post_to_text_converter(original_post[2], whole_post_list)
    except:
        pass
    
    # finds all non-OP post on page and gets data
    posts = soup.find_all('div', class_=[normal_post_class, original_poster_reply_class])

    for post in posts:
        whole_post = post_to_text_converter(post)

    return whole_post

In [7]:
def multithread_wrapper(url_list):
    
    #url_list = url_generator()
    #url = 'https://www.mumsnet.com/talk/_chat/4765702-plusword-new-thread-2?page=1'
    
    result_list = []

    # maximum number of pages in thread
    max_pages = 41

    # html class of original post from the thread
    original_post_class = 'p-4 pb-1 pt-2.5 lg:py-2.5 mt-2.5 lg:mt-1.5 border-t border-b sm:border sm:rounded ' \
                       'border-mumsnet-forest-border bg-mumsnet-forest dark:bg-mumsnet-forest-dark'

    # html class of a normal post from the thread
    normal_post_class = 'lg:py-2.5 pt-2.5 pb-1 p-4 border-t border-b sm:border sm:rounded mt-1.5 overflow-x-hidden ' \
                         'bg-white dark:bg-gray-800 border-gray-200'

    # html class of a post from the thread creator
    original_poster_reply_class = 'lg:py-2.5 pt-2.5 pb-1 p-4 border-t border-b sm:border sm:rounded mt-1.5 ' \
                                  'overflow-x-hidden bg-mumsnet-forest dark:bg-mumsnet-forest-dark ' \
                                  'border-mumsnet-forest-border'

    # gets request via bs4
    
    for url in url_list:
    
        r = requests.get(url)
        soup = BeautifulSoup(r.content, features="html5lib")

    #     if url[-1] == str(1) :

    #         # Finds original post on first page and splits it into metadata and post text
    #         original_post = soup.find_all('div', class_=original_post_class)
    #         original_post = original_post[0].find_all('div', class_='')

    #         whole_post=post_to_text_converter(original_post[2], whole_post_list)


        # finds all non-OP post on page and gets data
        posts = soup.find_all('div', class_=[normal_post_class, original_poster_reply_class])

        # for post in posts:
        #     whole_post = post_to_text_converter(post)

        with ThreadPoolExecutor() as executor:
            results = executor.map(post_to_text_converter, posts)
            for result in results:
                result_list.append(result)
    
    df = pd.DataFrame(result_list, columns=['user', 'date', 'time', 'text'])

    
    return df

In [8]:
original_start = time.time()
url_list = get_urls()
original_df = original_scraper(url_list)
original_end = time.time()
original_time = original_end - original_start

In [9]:
original_time

34.094024419784546

url_list = get_urls()
modified_start = time.time()
modified_df = modified_scraper(url_list)
modified_end = time.time()
modified_time = modified_end - modified_start

In [10]:
modified_start = time.time()
url_list = url_generator()
modified_df = multithread_wrapper(url_list)
modified_end = time.time()
modified_time = modified_end - modified_start

In [11]:
modified_time

34.3880181312561

In [12]:
original_df

Unnamed: 0,user,date,time,text
0,ILoveAllRainbowsx,13/11/2022,14:18,Previous thread: www.mumsnet.com/talk/am_i_bei...
1,Floralnomad,13/11/2022,15:57,Thanks for this @ILoveAllRainbowsx Add message...
2,Drywhitefruitycidergin,13/11/2022,16:08,Thanks @ILoveAllRainbowsx - hopefully thread 4...
3,Readytostartagain,13/11/2022,16:25,Thanks for this @ILoveAllRainbowsx . Did CW in...
4,Hepzibar,13/11/2022,16:25,Thanks for the new thread and reminder @ILoveA...
...,...,...,...,...
986,Readytostartagain,14/11/2022,08:07,1:42 today much easier than yesterday 😃 Add me...
987,UmbilicusProfundus,14/11/2022,08:09,⏱️ I just completed PlusWord in 01:16 slow sta...
988,MarmiteWine,14/11/2022,08:14,⏱️ I just completed PlusWord in 00:52 www.tele...
989,BrilliantGreenFlamingo,14/11/2022,11:57,6:19 today. Got a word wrong which threw me fo...


In [13]:
modified_df

Unnamed: 0,user,date,time,text
0,IGoWalkingAfterMidnight,10/04/2023,10:24,⏱️ I just completed PlusWord in 01:39 Add mess...
1,hoochycrone,10/04/2023,16:38,⏱️ I just completed PlusWord in 01:45 www.tele...
2,MarmiteWine,10/04/2023,18:48,00:44 here. Misspelled 7A at first attempt and...
3,Drywhitefruitycidergin,11/04/2023,00:15,⏱️ I just completed PlusWord in 02:04 Grrrrr t...
4,Sunbird24,11/04/2023,00:44,⏱️ I just completed PlusWord in 00:48 OP posts...
...,...,...,...,...
975,sanityisamyth,21/03/2023,05:44,⏱️ I just completed PlusWord in 01:03 www.tele...
976,Drywhitefruitycidergin,21/03/2023,06:24,⏱️ I just completed PlusWord in 04:04 www.tele...
977,DadDadDad,21/03/2023,07:04,1:27 for me today. Add message Save Share Repo...
978,Madcats,21/03/2023,09:40,It took me a while to understand the answer to...
