In [1]:
!pip install beautifulsoup4
!pip install requests



## Prototyping the parser logic

In [87]:
from bs4 import BeautifulSoup
from requests import get

In [88]:
# Single flashcard page used to prototype the parsing logic.
page_url = 'https://www.savemyexams.com/a-level/business/edexcel/17/flashcards/1-marketing-and-people/1-1-meeting-customer-needs/'

In [89]:
# Download the prototype page. requests applies no timeout by default, so one
# is passed explicitly to keep this cell from hanging on an unresponsive server.
request = get(page_url, timeout=30)
request.status_code

200

In [90]:
# Parse the fetched HTML using the 'html.parser' backend.
soup = BeautifulSoup(request.text, 'html.parser')

In [91]:
# The flashcards sit in the first <ul> carrying this class string. find()
# returns None instead of raising when no such list is present.
questions_container = soup.find('ul', class_='list-unstyled mb-0 d-flex flex-column gap-3')
questions = questions_container.find_all('li') if questions_container is not None else []

for index, question in enumerate(questions, 1):
    # The question text is the aria-label of the <div> nested two levels
    # inside each <li>. Items without that structure (or without the
    # attribute) are skipped explicitly rather than via a bare except.
    outer_div = question.div
    inner_div = outer_div.div if outer_div is not None else None
    question_text = inner_div.get('aria-label') if inner_div is not None else None
    if question_text is None:
        continue

    print(index)
    print(question_text)
    print()

1
Define the term market .

2
What is meant by the term market share ?

3
What is the aim of marketing?

4
True or False? Wants are considered to be essential.

5
Define the term mass market .

6
Define the term brand .

7
State the formula for calculating market share .

8
Define the term niche market .

9
What is the purpose of branding ?

10
True or False? A dynamic market is one that is subject to rapid or continuous changes.

11
What is meant by the term primary research ?

12
What is meant by the term market segmentation ?

13
Define the term product orientation .

14
What is meant by the term secondary research ?

15
True or False? Primary research is cheaper and quicker than secondary research.

16
State how researcher bias can affect survey findings.

17
What is the purpose of market segmentation?

18
True or False? Primary market research may include purchasing market reports from specialist companies or accessing government statistics.

19
Define the term market orientation 

## Building the parser function

In [92]:
def get_questions(url, timeout=30):
    """Scrape the flashcard question texts from a single page.

    Args:
        url: Address of the flashcards page to download.
        timeout: Seconds to wait for the server before requests gives up.
            requests has no default timeout, so without one the call could
            block indefinitely.

    Returns:
        list[str]: The ``aria-label`` text of every flashcard found, in page
        order. An empty list is returned (after printing a message) when the
        response status is not 200 or the question list cannot be located,
        so callers can always iterate over the result or take its ``len()``.
    """
    request = get(url, timeout=timeout)

    if request.status_code != 200:
        print('Error getting questions')
        return []

    soup = BeautifulSoup(request.text, 'html.parser')

    # find() returns None when the list is absent, where indexing the result
    # of find_all() with [0] would raise IndexError.
    questions_container = soup.find('ul', class_='list-unstyled mb-0 d-flex flex-column gap-3')
    if questions_container is None:
        print('Error getting questions')
        return []

    questions_parsed = []
    for question in questions_container.find_all('li'):
        # The text lives on the <div> nested two levels inside the <li>.
        # Items that lack that structure are skipped explicitly instead of
        # being hidden behind a bare ``except``.
        outer_div = question.div
        inner_div = outer_div.div if outer_div is not None else None
        if inner_div is None:
            continue

        question_text = inner_div.get('aria-label')
        if question_text is not None:
            questions_parsed.append(question_text)

    return questions_parsed

In [93]:
# (flashcard page URL, topic label) pairs for the pages to scrape.
# NOTE(review): the label 'entreprenuers and leaders' looks like a misspelling
# of "entrepreneurs". It is kept unchanged here because the label is carried
# into the saved data; confirm with consumers before renaming.
urls = [
    (
        'https://www.savemyexams.com/a-level/business/edexcel/17/flashcards/1-marketing-and-people/1-1-meeting-customer-needs/',
        'meeting customer needs',
    ),
    (
        'https://www.savemyexams.com/a-level/business/edexcel/17/flashcards/1-marketing-and-people/1-2-market/',
        'market',
    ),
    (
        'https://www.savemyexams.com/a-level/business/edexcel/17/flashcards/1-marketing-and-people/1-3-marketing-mix-and-strategy/',
        'marketing mix and strategy',
    ),
    (
        'https://www.savemyexams.com/a-level/business/edexcel/17/flashcards/1-marketing-and-people/1-4-managing-people/',
        'managing people',
    ),
    (
        'https://www.savemyexams.com/a-level/business/edexcel/17/flashcards/1-marketing-and-people/1-5-entrepreneurs-and-leaders/',
        'entreprenuers and leaders',
    ),
]

# Scrape each page once and key the resulting question list by topic label.
parsed_data = dict(
    (topic, get_questions(url)) for url, topic in urls
)

In [94]:
# Report how many questions were scraped for each topic.
for topic, questions in parsed_data.items():
    print(f'{topic}: {len(questions)}')

meeting customer needs: 30
market: 50
marketing mix and strategy: 50
managing people: 50
entreprenuers and leaders: 53


## Data cleaning

In [95]:
import re
import csv

In [96]:
def fix_punctuation(text):
    """Drop the spaces that directly precede '?', '.' and '!' in ``text``.

    Only literal space characters in front of those three marks are removed;
    the rest of the string is returned unchanged.
    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r' *\?', '?'),
        (r' *\.', '.'),
        (r' *\!', '!'),
    )

    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

In [97]:
# Flatten the {topic: [questions]} mapping into (cleaned question, topic) rows.
cleaned_data = [
    (fix_punctuation(question), topic)
    for topic, questions in parsed_data.items()
    for question in questions
]

cleaned_data[:10]

[('Define the term market.', 'meeting customer needs'),
 ('What is meant by the term market share?', 'meeting customer needs'),
 ('What is the aim of marketing?', 'meeting customer needs'),
 ('True or False? Wants are considered to be essential.',
  'meeting customer needs'),
 ('Define the term mass market.', 'meeting customer needs'),
 ('Define the term brand.', 'meeting customer needs'),
 ('State the formula for calculating market share.', 'meeting customer needs'),
 ('Define the term niche market.', 'meeting customer needs'),
 ('What is the purpose of branding?', 'meeting customer needs'),
 ('True or False? A dynamic market is one that is subject to rapid or continuous changes.',
  'meeting customer needs')]

## Saving the data

In [98]:
import csv

In [99]:
# Write one CSV row per question. The encoding is pinned to UTF-8 so the file
# does not depend on the platform's locale encoding (scraped text may contain
# non-ASCII characters); newline='' is what the csv module expects for files
# it writes.
with open('../save_my_exams_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Questions', 'Topic']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(
        {'Questions': question, 'Topic': topic} for question, topic in cleaned_data
    )

In [100]:
from notebooks.utils import DatasetReader

In [101]:
# Load the CSV that was just written back through the project's DatasetReader as a check.
DatasetReader().read_from_file('../save_my_exams_data.csv')

Unnamed: 0,Questions,Topic
0,define the term market.,meeting_customer_needs
1,what is meant by the term market share?,meeting_customer_needs
2,what is the aim of marketing?,meeting_customer_needs
3,true or false? wants are considered to be esse...,meeting_customer_needs
4,define the term mass market.,meeting_customer_needs
...,...,...
228,what is meant by the term emotional intelligence?,entreprenuers_and_leaders
229,true or false? entrepreneurs are often driven ...,entreprenuers_and_leaders
230,what is meant by the term figurehead?,entreprenuers_and_leaders
231,true or false? many entrepreneurs are driven m...,entreprenuers_and_leaders
