## Proof of concept parsing

In [58]:
from bs4 import BeautifulSoup
from requests import get 

In [59]:
# Topic-questions page used to prototype the scraper (sub-topic 1.5 of "1-marketing-and-people").
page_url = 'https://www.savemyexams.com/a-level/business/edexcel/17/topic-questions/1-marketing-and-people/1-5-entrepreneurs-and-leaders/exam-questions/'

In [63]:
# Download the page. Despite its name, `request` holds whatever `get` returns
# (the HTTP response), whose `.text` is parsed in a later cell.
request = get(page_url)
# Show the HTTP status code as the cell output.
request.status_code

200

In [64]:
# Offline alternative (disabled): parse a saved copy of a page instead of the live response.
# soup = BeautifulSoup(open('../html/meeting_customer_needs_easy.txt').read(), "html.parser")
# Parse the HTML of the response fetched above.
soup = BeautifulSoup(request.text, "html.parser")

In [72]:
# Take the first <div class="tab-content"> on the page; the questions are read from it below.
# Indexing with [0] raises IndexError when the page has no such element.
block = soup.find_all('div', class_='tab-content')[0]

In [73]:
# Each <article> is treated as one question: its text is every <p> that comes
# before the paragraph containing the "How did you do?" prompt.
for index, article in enumerate(block.find_all('article'), 1):
    lines = []
    for paragraph in article.find_all('p'):
        if "How did you do?" in paragraph.text:
            break
        lines.append(paragraph.text + '\n')
    question = ''.join(lines)

    # Question number on its own line, then the question text.
    print(index, question, sep='\n')


1
Read the following extracts (A to C) before answering 
Explain one business objective Tesco might be aiming to achieve by launching its Jack’s stores

2
Read the following extracts (A to D) before answering 
Explain one non-financial reward for Lord Somerleyton and his business partner Toby Marchant of setting up and running Hot Chip

3
Read the following extracts (A to C) before answering
Explain one potential trade-off of Mark Rowntree's decision to continue to focus on selling Bob Bon's products in its niche market

4
Read the following extracts (A to D) before answering
Explain one implication for Mumtaz's owners of remaining as a private limited company



5
Read the following extracts (A to D) before answering
Explain one benefit to Warby Parker of establishing the business with a socially responsible business objective

6
Read the following extracts (E to G) before answering
Explain one barrier to entrepreneurship Hardy Punglia is likely to have faced as he set up his business

## Building the parser

In [74]:
def parse(url, timeout=30):
    """Scrape the question texts from one topic-questions page.

    Downloads ``url``, takes the first ``<div class="tab-content">`` and, for
    every ``<article>`` inside it, concatenates the text of the ``<p>``
    elements that come before the "How did you do?" prompt.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one string per ``<article>``; within each string every
        paragraph's text is followed by a newline.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the page contains no ``<div class="tab-content">``.
    """
    response = get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    block = soup.find('div', class_='tab-content')
    if block is None:
        # IndexError is what indexing an empty find_all() result raises, so
        # the failure type is unchanged -- it just carries a useful message.
        raise IndexError(f"no <div class='tab-content'> found at {url}")

    questions = []
    for article in block.find_all('article'):
        paragraphs = []
        for paragraph in article.find_all('p'):
            # Everything from this prompt onwards is not question text.
            if "How did you do?" in paragraph.text:
                break
            paragraphs.append(paragraph.text + '\n')
        questions.append(''.join(paragraphs))
    return questions

In [76]:
# (page URL, topic label) pairs, one per sub-topic under "1-marketing-and-people".
# NOTE(review): 'entreprenuers' is misspelled, but the label is written to the
# exported CSV further down -- confirm nothing depends on it before correcting.
urls = [
    (
        'https://www.savemyexams.com/a-level/business/edexcel/17/topic-questions/1-marketing-and-people/1-1-meeting-customer-needs/exam-questions/',
        'meeting customer needs',
    ),
    (
        'https://www.savemyexams.com/a-level/business/edexcel/17/topic-questions/1-marketing-and-people/1-2-market/exam-questions/',
        'market',
    ),
    (
        'https://www.savemyexams.com/a-level/business/edexcel/17/topic-questions/1-marketing-and-people/1-3-marketing-mix-and-strategy/exam-questions/',
        'marketing mix and strategy',
    ),
    (
        'https://www.savemyexams.com/a-level/business/edexcel/17/topic-questions/1-marketing-and-people/1-4-managing-people/exam-questions/',
        'managing people',
    ),
    (
        'https://www.savemyexams.com/a-level/business/edexcel/17/topic-questions/1-marketing-and-people/1-5-entrepreneurs-and-leaders/exam-questions/',
        'entreprenuers and leaders',
    ),
]

# Scrape every page once and key the resulting question lists by topic label.
parsed_data = dict((topic, parse(url)) for url, topic in urls)

In [77]:
# Sanity check: how many questions were scraped for each topic.
for topic, questions in parsed_data.items():
    print(f'{topic}: {len(questions)}')

meeting customer needs: 15
market: 12
marketing mix and strategy: 14
managing people: 12
entreprenuers and leaders: 13


In [79]:
import re

In [88]:
def fix_punctuation(text):
    """Normalise whitespace and the spacing before sentence punctuation.

    Runs of whitespace (including the newlines that separate paragraphs) are
    collapsed into single spaces, leading and trailing whitespace is dropped,
    and any space directly before ``?``, ``.`` or ``!`` is removed.

    Args:
        text: Raw question text, possibly spanning several lines.

    Returns:
        The cleaned text on a single line.
    """
    # Collapse whitespace first: this way a newline sitting in front of a
    # punctuation mark is removed too, and joining paragraphs can never leave
    # double or trailing spaces behind.
    text = ' '.join(text.split())
    return re.sub(r' +([?.!])', r'\1', text)

In [89]:
# Flatten {topic: [question, ...]} into a list of (cleaned question, topic) rows.
cleaned_data = [
    (fix_punctuation(question), topic)
    for topic, questions in parsed_data.items()
    for question in questions
]

# Preview the first ten rows.
cleaned_data[:10]

[('Read the following extracts (A to D) before answering. Air passenger numbers were forecast to grow from 3.5bn people in 2015 to 3.75bn people in 2016 Using the data from Extract A and the information above, calculate the difference in percentage growth in air passenger numbers between 2015 and 2016. You are advised to show your working ',
  'meeting customer needs'),
 ('Read the following extracts (A to C) before answering Between 2010 and 2011, the number of music streaming subscribers grew by 62.34% Using the data from Extract A, calculate, to 2 decimal places, the difference between percentage growth in 2010 to 2011 and that in 2019 to 2020. You are advised to show your working ',
  'meeting customer needs'),
 ('Read the following extracts (A to C) before answering In 2018, the value of the UK market for sugar-free sweets was 3% of total sales revenueIn 2023, it is forecast to be 5% of total sales revenue Using the data in Extract A, calculate the change in value of sugar-free sw

In [90]:
import csv 

In [91]:
# Export the cleaned (question, topic) rows as a two-column CSV.
# newline='' is what the csv module asks for on files it writes to; the
# explicit UTF-8 encoding keeps non-ASCII characters in the scraped text
# (e.g. typographic apostrophes) intact whatever the platform default is.
with open('../save_my_exams_data_topic_questions.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Questions', 'Topic']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(
        {'Questions': question, 'Topic': topic}
        for question, topic in cleaned_data
    )

In [92]:
from notebooks.utils import DatasetReader

In [93]:
# Load the exported CSV back through the project's DatasetReader and display the result.
# NOTE(review): the displayed frame shows lower-cased questions and underscored
# topic labels -- presumably normalisation done by the reader; confirm in notebooks.utils.
DatasetReader().read_from_file('../save_my_exams_data_topic_questions.csv')

Unnamed: 0,Questions,Topic
0,read the following extracts (a to d) before an...,meeting_customer_needs
1,read the following extracts (a to c) before an...,meeting_customer_needs
2,read the following extracts (a to c) before an...,meeting_customer_needs
3,read the following extracts (a to c) before an...,meeting_customer_needs
4,read the following extracts (d to g) before an...,meeting_customer_needs
...,...,...
61,read the following extracts (e to h) before an...,entreprenuers_and_leaders
62,read the following extracts (a to d) before an...,entreprenuers_and_leaders
63,read the following extracts (e to g) before an...,entreprenuers_and_leaders
64,read the following extracts (a to c) before an...,entreprenuers_and_leaders
