# Part 1 (1-6)

In [2]:
##### -- Imports -- #####
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd

##### -- Variables -- #####
newsFront = 'https://www.bbc.com/news'

##### -- Functions -- #####
def getData(data):
    response = requests.get(data)
    contents = response.text
    return contents

def matches(data):
    regexHeder = re.compile(r'<h\d(?:.*?)>(.*?)<\/h\d>')
    matches = regexHeder.findall(getData(data))
    return matches

def headerList(matchLst):
    lst = []
    for elements in matchLst:
        elements = elements.replace("&#x27;", '')
        elements = re.sub(r'<span(?:.*?)>', '', elements)
        elements = re.sub(r'<\/span>', ' ', elements)
        lst.append(elements)
    return lst

def cleaner(input):
    lst = []
    for elements in input:
        elements = str(elements)
        elements = re.sub(r'<h\d(?:.*?)>', '', elements)
        elements = re.sub(r'<\/h\d>', '', elements)
        elements = re.sub(r'<span(?:.*?)>', '', elements)
        elements = re.sub(r'<\/span>', ' ', elements)
        lst.append(elements)
    return lst

def soupHeders(data):
    soup = BeautifulSoup(getData(data), 'html.parser')
    soupList =  soup.find_all('h1') + soup.find_all('h2') + soup.find_all('h3') +  soup.find_all('h4')
    return cleaner(soupList)

def topStories(data):
    soup = BeautifulSoup(getData(data), 'html.parser')
    def find_all(tag):
        return soup.find_all(tag, class_='gs-c-promo-heading__title gel-pica-bold nw-o-link-split__text') 
    soupList = find_all('h1') + find_all('h2') + find_all('h3') + find_all('h4') 
    return cleaner(soupList)


##### -- Calls -- #####
print("")
print("RequestList")
print(headerList(matches(newsFront)))

print("")
print("SoupList")
print(soupHeders(newsFront))

print("")
print("soupTopStories")
print(topStories(newsFront))

print(len(topStories(newsFront)))



RequestList
['Accessibility links', 'News Navigation', 'BBC News Home', 'Breaking Breaking news ', 'Top Stories', 'Victory is inevitable if allies keep promises - Zelensky', 'Related content', 'WATCH: One year of war in Ukraine in 87 seconds', 'Fighting to stay Ukrainian in a frontline mining town', 'BBC correspondents on a year of witnessing war', 'Has Putins war failed?', 'Two friends changed by a year of war', 'How Putins fate is tied to his war in Ukraine', 'Related content', 'Why China launched a charm offensive over Ukraine', 'US marks war anniversary with new Russia sanctions', 'Moldova warns of Russian psy-ops as tensions rise', 'Swimmers ruined by fat-shaming and bullying', 'Rebellious Andean bear sneaks out of US zoo - twice', 'Brothers leave Guantanamo Bay after almost 20 years', 'Kenyan man freed over Britons murder-kidnap', 'Nigerian politician arrested with $500,000 in cash', 'Netflix cuts prices in more than 30 countries', 'US billionaire financier Thomas Lee found dead

# Part 1 (7-8)

In [22]:
##### -- Functions -- #####
def cleaner2(input):
    lst = []
    for elements in input:
        elements = str(elements)
        elements = re.sub(r'<p(?:.*?)>', '', elements)
        elements = re.sub(r'<\/p>', '', elements)
        lst.append(elements)
    return lst

def soupSummary(data):
    soup = BeautifulSoup(getData(data), 'html.parser')
    soupList = soup.find_all('p', class_='gs-c-promo-summary gel-long-primer gs-u-mt nw-c-promo-summary') 
    out = cleaner2(soupList)
    return out

def divider(input):
    def getHeader(string):
        string = re.search(r'<h\d(?:.*?)>(.*?)<\/h\d>', string)
        string = string.group(0)
        string = cleaner([string])
        return string[0]
    def getSummary(string):
        string = re.search(r'<p(?:.*?)>.*<\/p>', string)
        if string == None: return "NoSummery"
        string = string.group(0)
        string = cleaner2([string])
        return string[0]
    def getCategory(string):
        string = re.search(r'<span aria-hidden="true">(.*?)<\/span>', string)
        if string == None: return "NoCategory"
        string = string.group(0)
        string = re.sub(r'<span aria-hidden="true">', '', string)
        string = re.sub(r'<\/span>', '', string)
        string = re.sub(r'&amp;', 'and', string)
        return string
    lst = [getHeader(input), getSummary(input), getCategory(input)]
    return lst

def Div(data):
    soup = BeautifulSoup(getData(data), 'html.parser')
    soupList = soup.find_all('div', class_='gs-c-promo-body gel-1/2@xs gel-1/1@m gs-u-mt@m') 
    lst = []
    for elements in soupList:
        elements = str(elements)
        elements = divider(elements)
        lst.append(elements)
    return lst

def lstToPandasCsv(input):
    df = pd.DataFrame(input)
    df.columns = ['header', 'summary', 'category']
    df.to_csv('csvOut')
    return df
    
##### -- Calls -- #####
lstToPandas(Div(newsFront))

Unnamed: 0,header,summary,category
0,Why China launched a charm offensive over Ukraine,The West may come away unimpressed - but convi...,China
1,US marks war anniversary with new Russia sanct...,President Biden also announced over $2bn in mi...,US and Canada
2,Moldova warns of Russian 'psy-ops' as tensions...,Moldova's pro-EU leaders reject Russian claims...,Europe
3,Swimmers 'ruined' by fat-shaming and bullying,Former athletes tell of mistreatment at clubs ...,England
4,Rebellious Andean bear sneaks out of US zoo - ...,The South American species escaped his habitat...,US and Canada
5,Space capsule launched to fetch stranded astro...,NoSummery,NoCategory
6,Rihanna to perform Lift Me Up at the Oscars,NoSummery,NoCategory
7,Quiz of the week: What raised eyebrows at the ...,NoSummery,NoCategory
8,Outlaw-themed parade triumphs at Rio carnival,NoSummery,NoCategory
9,"Chukkas, cats and candles: Africa's top shots",NoSummery,NoCategory
