# Part 1 (1-6)

In [95]:
##### -- Imports -- #####
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd

##### -- Variables -- #####
newsFront = 'https://www.bbc.com/news'

##### -- Functions -- #####
def getData(data):
    response = requests.get(data)
    contents = response.text
    return contents

def matches(data):
    regexHeder = re.compile(r'<h\d(?:.*?)>(.*?)<\/h\d>')
    matches = regexHeder.findall(getData(data))
    return matches

def headerList(matchLst):
    lst = []
    for elements in matchLst:
        elements = elements.replace("&#x27;", '')
        elements = re.sub(r'<span(?:.*?)>', '', elements)
        elements = re.sub(r'<\/span>', ' ', elements)
        lst.append(elements)
    return lst

def cleaner(input):
    lst = []
    for elements in input:
        elements = str(elements)
        elements = re.sub(r'<h\d(?:.*?)>', '', elements)
        elements = re.sub(r'<\/h\d>', '', elements)
        elements = re.sub(r'<span(?:.*?)>', '', elements)
        elements = re.sub(r'<\/span>', ' ', elements)
        lst.append(elements)
    return lst

def soupHeders(data):
    soup = BeautifulSoup(getData(data), 'html.parser')
    soupList =  soup.find_all('h1') + soup.find_all('h2') + soup.find_all('h3') +  soup.find_all('h4')
    return cleaner(soupList)

def topStories(data):
    soup = BeautifulSoup(getData(data), 'html.parser')
    def find_all(tag):
        return soup.find_all(tag, class_='gs-c-promo-heading__title gel-pica-bold nw-o-link-split__text') 
    soupList = find_all('h1') + find_all('h2') + find_all('h3') + find_all('h4') 
    return cleaner(soupList)


##### -- Calls -- #####
print("")
print("RequestList")
print(headerList(matches(newsFront)))

print("")
print("SoupList")
print(soupHeders(newsFront))

print("")
print("soupTopStories")
print(topStories(newsFront))

print(len(topStories(newsFront)))



RequestList
['Accessibility links', 'News Navigation', 'BBC News Home', 'Breaking Breaking news ', 'Top Stories', 'At least 59 migrants killed in shipwreck off Italy', 'At least 59 migrants killed in shipwreck off Italy', 'Related content', 'British and EU leaders to meet over NI Brexit deal', 'Israel and Palestinians pledge to reduce violence', 'Early results arriving from tight Nigeria election', 'Power, privilege and murder - the downfall of a dynasty', 'Madonnas brother Anthony Ciccone dies aged 66', 'Olivia Newton-John honoured at Australia memorial', 'India opposition official held in corruption probe', 'US media drop Dilbert comics over creators racist tirade', 'Ukraines Banksy stamps depict Putin being thrown', 'Archaeologists freed in Papua New Guinea', 'Ukraines Banksy stamps depict Putin being thrown', 'Archaeologists freed in Papua New Guinea', 'Turkey arrests 180 over quake-collapsed buildings', 'Rare winter storm wreaks havoc in California', 'Motorhome falls into river a

# Part 1 (7-8)

In [321]:
##### -- Functions -- #####
def cleaner2(input):
    lst = []
    for elements in input:
        elements = str(elements)
        elements = re.sub(r'<p(?:.*?)>', '', elements)
        elements = re.sub(r'<\/p>', '', elements)
        lst.append(elements)
    return lst

def soupSummary(data):
    soup = BeautifulSoup(getData(data), 'html.parser')
    soupList = soup.find_all('p', class_='gs-c-promo-summary gel-long-primer gs-u-mt nw-c-promo-summary') 
    out = cleaner2(soupList)
    return out

def divider(input):
    def getHeader(string):
        string = re.search(r'<h\d(?:.*?)>(.*?)<\/h\d>', string)
        string = string.group(0)
        string = cleaner([string])
        return string[0]
    def getSummary(string):
        string = re.search(r'<p(?:.*?)>.*<\/p>', string)
        if string == None: return "NoSummery"
        string = string.group(0)
        string = cleaner2([string])
        return string[0]
    def getCategory(string):
        string = re.search(r'<span aria-hidden="true">(.*?)<\/span>', string)
        if string == None: return "NoCategory"
        string = string.group(0)
        string = re.sub(r'<span aria-hidden="true">', '', string)
        string = re.sub(r'<\/span>', '', string)
        string = re.sub(r'&amp;', 'and', string)
        return string
    lst = [getHeader(input), getSummary(input), getCategory(input)]
    return lst

def Div(data):
    soup = BeautifulSoup(getData(data), 'html.parser')
    soupList = soup.find_all('div', class_='gs-c-promo-body gel-1/2@xs gel-1/1@m gs-u-mt@m') 
    lst = []
    for elements in soupList:
        elements = str(elements)
        elements = divider(elements)
        lst.append(elements)
    return lst

def lstToPandasCsv(input):
    df = pd.DataFrame(input)
    df.columns = ['header', 'summary', 'category']
    df.to_csv('csvOut')
    return df
    
##### -- Calls -- #####
lstToPandas(Div(newsFront))

Unnamed: 0,header,summary,category
0,EU chief and Sunak to hold 'final talks' on NI...,Ursula von der Leyen will be in the UK on Mond...,UK Politics
1,On board with an airman fighting Ukraine's war...,The BBC gets rare access to a renowned airman ...,Europe
2,Early results arriving from tight Nigeria elec...,The first two of Nigeria's 36 states announce ...,Africa
3,Fury beats Jake Paul in highly anticipated box...,Tommy Fury beats Jake Paul by split decision i...,BBC Sport
4,Israel and Palestinians pledge to reduce violence,"As rare talks were held in Jordan, a Palestini...",Middle East
5,Your pictures on the theme of 'shelter',NoSummery,NoCategory
6,Thousands moved to El Salvador mega-prison,NoSummery,NoCategory
7,"Red balloons, oranges, and carnival dogs: Phot...",NoSummery,NoCategory
8,Vigils and flags: Ukraine war anniversary in p...,NoSummery,NoCategory
9,Space capsule launched to fetch stranded astro...,NoSummery,NoCategory


# Part 2

In [362]:
def cleanAll(input):
    tag = ['p', 'span', 'a', 'i']
    text = input
    text = re.sub(r'(<strong(?:.*?)>).*(<\/strong>)', '', text)
    text = re.sub(r'\s+', ' ', text)
    for element in tag:
        x = '<' + element + '(?:.*?)>'
        y = '<\/' + element + '>'
        text = re.sub(x, '', text)
        text = re.sub(y, ' ', text) 
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\s,', '', text)
    text = re.sub('\<br(.*)', '', text)
    text = re.sub('\<img(.*)', '', text)
    text = re.sub('\<b(.*)', '', text)
    return text

def getHeader(input):
    text = input.find('span', class_='mw-page-title-main')
    text = str(text)
    text = cleanAll(text)
    return text

def getDate(input):
    text = input.find('strong', class_='published')
    text = str(text)
    text = re.sub(r'<strong(?:.*?)>', '', text)
    text = re.sub(r'<\/strong>', ' ', text)
    text = re.sub(r'<span(?:.*?)>', '', text)
    text = re.sub(r'<\/span>', ' ', text)
    return text

def getContent(input):
    text = input.find_all('p')
    lst = []
    for elm in text:
        elm = str(elm)
        lst.append(elm)

    string = ' '.join(lst)
    string = cleanAll(string)
    return string

def articleList():
    def temp(input):
        page = 'https://en.wikinews.org/w/index.php?title=Category:Politics_and_conflicts&from=' + input
        divGroup = BeautifulSoup(getData(page), 'html.parser')
        divGroup = divGroup.find_all('div', id='mw-pages')
        divGroup = divGroup[0].find_all('div', class_='mw-category-group')
        divGroup = divGroup[0].find_all('a')
        lst = []
        for element in divGroup: 
            element = str(element)
            href_regex = r'href="([^"]+)"'
            element = re.search(href_regex, element)
            element = element.group(1)
            element = 'https://en.wikinews.org/' + element
            lst.append(element)
        return lst
    letters = "ABCDEFGHIJKLMNOPRSTUVWZABCDEFGHIJKLMNOPRSTUVWZ"[10%23:10%23+10]
    letters = [*letters]
    lst = []
    for elemement in letters:
        elemement = temp(elemement)
        lst = lst + elemement
    return lst

def createTable():
    links = (articleList())[:10]
    lst = []
    for elm in links: 
        response = requests.get(elm)
        contents = response.text
        x = BeautifulSoup(contents, 'html.parser')
        x = [getHeader(x), getDate(x), getContent(x)]
        lst.append(x)

    df = pd.DataFrame(lst)
    df.columns = ['header', 'summary', 'content']
    return df
print(createTable())

                                              header  \
0  K'nesset Member Natan Sharansky resigns from c...   
1  Kaczynski elected as the new president of Pola...   
2    Kaczyński takes the office of Polish president    
3  Kansas Professor assaulted by angry intelligen...   
4             Karachi, Pakistan shut down by strike    
5  Karen Handel wins runoff for seat in United St...   
6         Karl Rove named as a source of Plame leak    
7  Karl Rove refuses to testify before House Judi...   
8  Karl Rove, senior political advisor to Preside...   
9  Karzai blames U.S. and Britain for increased o...   

                       summary  \
0        Tuesday, May 3, 2005    
1    Sunday, October 23, 2005    
2   Friday, December 23, 2005    
3   Tuesday, December 6, 2005    
4        Monday, May 14, 2007    
5       Friday, June 23, 2017    
6        Sunday, July 3, 2005    
7     Thursday, July 10, 2008    
8     Monday, August 13, 2007    
9        Monday, May 23, 2005    

        