# Part 1 (1-6)

<strong>What difficulties or limitations are there when using regular expressions?</strong> <br>
I found that the capabilities are the same when using either BeautifulSoup or re, but the implementation gets significantly more manageable when using BeautifulSoup. <br> <br>

In [2]:
##### -- Imports -- #####
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd

##### -- Variables -- #####
newsFront = 'https://www.bbc.com/news'

##### -- Functions -- #####
def getData(data):
    response = requests.get(data)
    contents = response.text
    return contents

def matches(data):
    regexHeder = re.compile(r'<h\d(?:.*?)>(.*?)<\/h\d>')
    matches = regexHeder.findall(getData(data))
    return matches

def headerList(matchLst):
    lst = []
    for elements in matchLst:
        elements = elements.replace("&#x27;", '')
        elements = re.sub(r'<span(?:.*?)>', '', elements)
        elements = re.sub(r'<\/span>', ' ', elements)
        lst.append(elements)
    return lst

def cleaner(input):
    lst = []
    for elements in input:
        elements = str(elements)
        elements = re.sub(r'<h\d(?:.*?)>', '', elements)
        elements = re.sub(r'<\/h\d>', '', elements)
        elements = re.sub(r'<span(?:.*?)>', '', elements)
        elements = re.sub(r'<\/span>', ' ', elements)
        lst.append(elements)
    return lst

def soupHeders(data):
    soup = BeautifulSoup(getData(data), 'html.parser')
    soupList =  soup.find_all('h1') + soup.find_all('h2') + soup.find_all('h3') +  soup.find_all('h4')
    return cleaner(soupList)

def topStories(data):
    soup = BeautifulSoup(getData(data), 'html.parser')
    def find_all(tag):
        return soup.find_all(tag, class_='gs-c-promo-heading__title gel-pica-bold nw-o-link-split__text') 
    soupList = find_all('h1') + find_all('h2') + find_all('h3') + find_all('h4') 
    return cleaner(soupList)


##### -- Calls -- #####
print("")
print("RequestList")
print(headerList(matches(newsFront)))

print("")
print("SoupList")
print(soupHeders(newsFront))

print("")
print("soupTopStories")
print(topStories(newsFront))



RequestList
['Accessibility links', 'News Navigation', 'BBC News Home', 'Breaking Breaking news ', 'Top Stories', 'Drone crash near Moscow was failed attack - governor', 'Drone crash near Moscow was failed attack - governor', 'Related content', 'Italian police arrest three over migrant boat wreck', 'Ruling party takes strong lead in Nigeria election', 'Iran probes poisoning of almost 700 schoolgirls', 'French football president resigns after scandal', 'Ancient mummy found in delivery mans bag', 'China hits out as US bans TikTok on federal devices', 'UKs Sunak in Belfast after reaching new Brexit deal', 'US Supreme Court weighs fate of student loan relief', 'Indian man extradited to Australia over beach murder', 'Fox hosts endorsed false election claims - Murdoch', 'Indian man extradited to Australia over beach murder', 'Fox hosts endorsed false election claims - Murdoch', 'Tesla to build new factory in Mexico', 'Pro cyclist fined for shooting ministers cat', 'China should be honest on

# Part 1 (7-8)

In [5]:
##### -- Functions -- #####
def cleaner2(input):
    lst = []
    for elements in input:
        elements = str(elements)
        elements = re.sub(r'<p(?:.*?)>', '', elements)
        elements = re.sub(r'<\/p>', '', elements)
        lst.append(elements)
    return lst

def soupSummary(data):
    soup = BeautifulSoup(getData(data), 'html.parser')
    soupList = soup.find_all('p', class_='gs-c-promo-summary gel-long-primer gs-u-mt nw-c-promo-summary') 
    out = cleaner2(soupList)
    return out

def divider(input):
    def getHeader(string):
        string = re.search(r'<h\d(?:.*?)>(.*?)<\/h\d>', string)
        string = string.group(0)
        string = cleaner([string])
        return string[0]
    def getSummary(string):
        string = re.search(r'<p(?:.*?)>.*<\/p>', string)
        if string == None: return "NoSummery"
        string = string.group(0)
        string = cleaner2([string])
        return string[0]
    def getCategory(string):
        string = re.search(r'<span aria-hidden="true">(.*?)<\/span>', string)
        if string == None: return "NoCategory"
        string = string.group(0)
        string = re.sub(r'<span aria-hidden="true">', '', string)
        string = re.sub(r'<\/span>', '', string)
        string = re.sub(r'&amp;', 'and', string)
        return string
    lst = [getHeader(input), getSummary(input), getCategory(input)]
    return lst

def Div(data):
    soup = BeautifulSoup(getData(data), 'html.parser')
    soupList = soup.find_all('div', class_='gs-c-promo-body gel-1/2@xs gel-1/1@m gs-u-mt@m') 
    lst = []
    for elements in soupList:
        elements = str(elements)
        elements = divider(elements)
        lst.append(elements)
    return lst

def lstToPandasCsv(input):
    df = pd.DataFrame(input)
    df.columns = ['header', 'summary', 'category']
    df.to_csv('csvOut')
    return df
    
##### -- Calls -- #####
lstToPandasCsv(Div(newsFront))

Unnamed: 0,header,summary,category
0,Ruling party takes strong lead in Nigeria elec...,Opposition parties condemn the election as a s...,Africa
1,Italian police arrest three over migrant boat ...,"The disaster's death toll rises to 64, with th...",Europe
2,Iran probes poisoning of almost 700 schoolgirls,Many Iranians believe the toxic gas poisonings...,Middle East
3,French football president resigns after scandal,Noel Le Graet resigns following a damning repo...,BBC Sport
4,Ancient mummy found in delivery man's bag,The Peruvian man said he wanted to show off th...,Latin America and Caribbean
5,Indian temple replaces elephant with robot for...,NoSummery,NoCategory
6,Cold snap leaves Barcelona covered in snow,NoSummery,NoCategory
7,Snapper captures 'face' in breaking wave,NoSummery,NoCategory
8,The tiny diamond sphere that could unlock clea...,NoSummery,NoCategory
9,Who at UN backed Russia over Ukraine? Our quiz...,NoSummery,NoCategory


# Part 2

<b> Describe the tools used and the challenges faced when creating the dataframe. </b> <br>
When creating the articleList function, which returns a list of links referring to all the articles, the temp function uses the beautifulSoup html.parser and the find_all property to get a string with the content division element that contains all the needed links. Temp takes a letter as input and concatenates this letter to the Wikinews link. The following letters: K, L, M, N, O, P, R, S, T, and U are run on temp inside the articleList function by splitting the list from the assignment description and running it in a for-loop.

The Pandas dataframe is created in the createTable function. A for-loop goes through all the links and readies the text for cleaning using request and bs4; the string is then fed to a tuple of functions that extract the relevant data. This data is then appended to a list and pasted into a Pandas dataframe.

Of the three cleaning functions, getContent unraveled the most challenges. The first challenge was that the content consisted of multiple chunks of phrases. To deal with this defContent creates a list of all \<p\>'s and converts every list element into strings. The join function then unpacks the list to prepare a string for cleaning. This leads to the next challenge: cleaning the data in a helper function called cleanAll. cleanAll consists of regular expressions that remove the date and unwanted tags. When looking at the HTML code, I noticed that the content is always appearing before the tags \<br\>, \<img\>, and \<b\>. Therefore everything coming after one of these tags is removed.

<b>Assess whether it is a reasonable choice to trust the sources when they aren't labeled. </b> <br>
If it's assumed that whoever created the list of articles only included articles they believed were legit (or at least wanted the reader to think were legit), such labels would come from the same source and therefore have the same credibility. Because of this, the existence of labels doesn't matter.

When using the list to practice programming skills, it doesn't matter either. The code would be identical if the articles were written in lorem ipsum. 

In [4]:
def cleanAll(input):
    tag = ['p', 'span', 'a', 'i']
    text = input
    text = re.sub(r'(<strong(?:.*?)>).*(<\/strong>)', '', text)
    text = re.sub(r'\s+', ' ', text)
    for element in tag:
        x = '<' + element + '(?:.*?)>'
        y = '<\/' + element + '>'
        text = re.sub(x, '', text)
        text = re.sub(y, ' ', text) 
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\s,', '', text)
    text = re.sub('\<br(.*)', '', text)    text = re.sub('\<img(.*)', '', text)
    text = re.sub('\<b(.*)', '', text)
    return text

def getHeader(input):
    text = input.find('span', class_='mw-page-title-main')
    text = str(text)
    text = cleanAll(text)
    return text

def getDate(input):
    text = input.find('strong', class_='published')
    text = str(text)
    text = re.sub(r'<strong(?:.*?)>', '', text)
    text = re.sub(r'<\/strong>', ' ', text)
    text = re.sub(r'<span(?:.*?)>', '', text)
    text = re.sub(r'<\/span>', ' ', text)
    return text

def getContent(input):
    text = input.find_all('p')
    lst = []
    for elm in text:
        elm = str(elm)
        lst.append(elm)
    string = ' '.join(lst)
    string = cleanAll(string)
    return string

def articleList():
    def temp(input):
        page = 'https://en.wikinews.org/w/index.php?title=Category:Politics_and_conflicts&from=' + input
        divGroup = BeautifulSoup(getData(page), 'html.parser')
        divGroup = divGroup.find_all('div', id='mw-pages')
        divGroup = divGroup[0].find_all('div', class_='mw-category-group')
        divGroup = divGroup[0].find_all('a')
        lst = []
        for element in divGroup: 
            element = str(element)
            href_regex = r'href="([^"]+)"'
            element = re.search(href_regex, element)
            element = element.group(1)
            element = 'https://en.wikinews.org/' + element
            lst.append(element)
        return lst
    letters = "ABCDEFGHIJKLMNOPRSTUVWZABCDEFGHIJKLMNOPRSTUVWZ"[10%23:10%23+10]
    letters = [*letters]
    lst = []
    for elemement in letters:
        elemement = temp(elemement)
        lst = lst + elemement
    return lst

def createTable():
    links = (articleList())
    lst = []
    for elm in links: 
        response = requests.get(elm)
        contents = response.text
        x = BeautifulSoup(contents, 'html.parser')
        x = [getHeader(x), getDate(x), getContent(x)]
        lst.append(x)

    df = pd.DataFrame(lst)
    df.columns = ['header', 'date', 'content']
    return df
print(createTable())

                                                 header  \
0     K'nesset Member Natan Sharansky resigns from c...   
1     Kaczynski elected as the new president of Pola...   
2       Kaczyński takes the office of Polish president    
3     Kansas Professor assaulted by angry intelligen...   
4                Karachi, Pakistan shut down by strike    
...                                                 ...   
1881  UK defers junk food deals, advertisement restr...   
1882  UK denies pressuring Scotland into Lockerbie r...   
1883  UK drugs policy petition reaches 100,000 signa...   
1884  UK economy shrinks by 0.3% in fourth quarter o...   
1885  UK elections: David Cameron becomes Prime Mini...   

                                date  \
0              Tuesday, May 3, 2005    
1          Sunday, October 23, 2005    
2         Friday, December 23, 2005    
3         Tuesday, December 6, 2005    
4              Monday, May 14, 2007    
...                              ...   
1881       

The Pandas dateframe may look odd compared to part 1. This happens only in Jupyter and works correctly when the script is run in the command prompt. Note that I only refer to the visualization, not the actual structure.

In [6]:
print(articleList())

