In [22]:
import pandas as pd
from cleantext import clean
import regex as re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer

# Task 2

Import the data as a DataFrame with Pandas

In [24]:
# The first CSV column holds the row labels, so it is used as the DataFrame index.
df = pd.read_csv(
    'news_sample.csv',
    sep=',',
    index_col=0,
)

We wrote a custom function for cleaning the text, because the clean-text library does not identify URLs that omit "www". Emails must be replaced before URLs (otherwise the URL pattern would also match them), so we include our own regular expressions. Only simple numeric date formats are recognized. We use the clean function from clean-text to clean up the rest: line breaks, punctuation, numbers, excess whitespace, and lowercasing the text. We apply the function to each content section.

In [25]:
def clean_text_help(text: str) -> str:
    """Replace emails, URLs and dates with placeholder tags, then normalize.

    Substitution order matters: emails are replaced first because the URL
    pattern's character class contains "@" and would otherwise match them.

    Args:
        text: Raw article text.

    Returns:
        The text with "<EMAIL>", "<URL>" and "<DATE>" placeholders inserted,
        passed through clean-text's ``clean`` with ``lower``,
        ``no_line_breaks``, ``no_numbers`` and ``no_punct`` enabled.
    """
    # The TLD accepts any letter case and any length >= 2. The previous
    # `[a-z]{2,4}` truncated long TLDs ("x@y.online" -> "<EMAIL>ne") and
    # missed upper-case addresses, which were then tagged as URLs instead.
    email = r"[a-zA-Z0-9.%+-]+@[a-zA-Z0-9.%+-]+\.[a-zA-Z]{2,}"
    # Numeric dates only, separated by "/" or "-" (e.g. 12/31/2020).
    date = r"[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{1,4}"
    # Scheme is optional so that URLs without "http(s)://" or "www" still match.
    url = r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*"

    text = re.sub(email, "<EMAIL>", text)
    text = re.sub(url, "<URL>", text)
    text = re.sub(date, "<DATE>", text)
    return clean(text,
                 lower=True,
                 no_line_breaks=True,
                 no_numbers=True,
                 no_punct=True)

# Cast every entry to str first (non-string values are stringified), then clean it.
contents = df['content'].apply(str).apply(clean_text_help)

We used RegexpTokenizer to create a tokenizer that treats tags such as \<URL\> as one token instead of the three tokens "<", "URL" and ">". RegexpTokenizer is from the nltk.tokenize module.

In [26]:
# The optional "<" and ">" around a word keep placeholder tags such as <URL> in one token.
tokenizer = RegexpTokenizer(r'<?\w+>?')
tokens = contents.apply(lambda text: tokenizer.tokenize(text))


We remove stopwords from each list of tokens (one list per content section), using the English stopword list from the nltk library.

In [27]:
# Build the stopword set once. Calling stopwords.words('english') inside the
# comprehension re-evaluated it for every single word, and membership tests
# against a list are linear; a set makes each lookup constant-time.
english_stopwords = set(stopwords.words('english'))
no_stopwords = [
    [word for word in token_list if word not in english_stopwords]
    for token_list in tokens
]

We stem each list of text using the PorterStemmer from the nltk library. This reduces the vocabulary substantially.

In [28]:
ps = PorterStemmer()
# One stemmed token list per article, in the same order as no_stopwords.
stemmed = [[ps.stem(word) for word in words] for words in no_stopwords]

# Task 3

In this dataset, all the fake news articles come from the domain beforeitsnews.com, and every article from that domain is labeled fake.

In [79]:
# enumerate() yields row *positions*, so rows must be selected with .iloc.
# Plain df["domain"][positions] selects by index *label*, which only gives the
# same rows while the index read from the CSV happens to be 0..n-1.
fake = [pos for pos, label in enumerate(df["type"]) if label == "fake"]
print("Distribution of fake news: ", df["domain"].iloc[fake].value_counts())

number = [pos for pos, domain in enumerate(df["domain"])
          if domain == "beforeitsnews.com"]
print("Articles in", df["domain"].iloc[number].value_counts())


Distribution of fake news:  beforeitsnews.com    155
Name: domain, dtype: int64
Articles in beforeitsnews.com    155
Name: domain, dtype: int64


If the content contains the word "trump", there is an 82.5 % chance that it is fake news. Knowing that all fake news comes from beforeitsnews.com and all news from beforeitsnews.com is fake, we see that 132 of the 160 articles containing the word "trump" are fake and the remaining 28, which come from other domains, are not.

In [82]:
# Positions (not index labels) of the articles whose stemmed tokens contain "trump".
trump_indexes = [pos for pos, words in enumerate(stemmed) if "trump" in words]

# The positions come from enumerate(), so select rows with .iloc; label-based
# df[col][positions] is only correct while the index is exactly 0..n-1.
print("The chance of fake news if the word \"trump\" is in the text: ",
      (df['type'].iloc[trump_indexes] == "fake").sum() / len(trump_indexes))
print("Sources of articles with the word \"trump\" in it:")
print(df["domain"].iloc[trump_indexes].value_counts())
print('Articles mentioning trump: ', len(trump_indexes))


The chance of fake news if the word "trump" is in the text:  0.825
Sources of articles with the word "trump" in it:
beforeitsnews.com           132
washingtonexaminer.com        8
canadafreepress.com           6
www.newsmax.com               5
strategic-culture.org         2
bipartisanreport.com          1
charismanews.com              1
willyloman.wordpress.com      1
21stcenturywire.com           1
alternet.org                  1
vdare.com                     1
americanlookout.com           1
Name: domain, dtype: int64
Articles mentioning trump:  160


Here we see that if the content contains the word "bitcoin", there is a 100 % chance that it is fake news. Again, all the fake news is from the domain beforeitsnews.com.

In [80]:
# Positions (not index labels) of the articles whose stemmed tokens contain "bitcoin".
bitcoin_index = [pos for pos, words in enumerate(stemmed) if "bitcoin" in words]

# The positions come from enumerate(), so select rows with .iloc; label-based
# df[col][positions] is only correct while the index is exactly 0..n-1.
print("The chance of fake news if the word \"bitcoin\" is in the text: ",
      (df["type"].iloc[bitcoin_index] == "fake").sum()/len(bitcoin_index))
print('Amount of articles mentioning bitcoins: ', len(bitcoin_index))


The chance of fake news if the word "bitcoin" is in the text:  1.0
Amount of articles mentioning bitcoins:  131


If both words "trump" and "bitcoin" are in an article, the article was published by beforeitsnews.com, and based on the previous tests we know it is fake news.

In [70]:
# Positions (not index labels) of the articles that contain both stems.
TrumpNBitcoin_index = [pos for pos, words in enumerate(stemmed)
                       if "trump" in words and "bitcoin" in words]

# The positions come from enumerate(), so select rows with .iloc; label-based
# df[col][positions] is only correct while the index is exactly 0..n-1.
print("Source and count of articles containing the word \"trump\" and \"bitcoin\":",
      df["domain"].iloc[TrumpNBitcoin_index].value_counts())


Source and count of articles containing the word "trump" and "bitcoin": beforeitsnews.com    130
Name: domain, dtype: int64


We see that about 86 % of the fake news articles mention either Trump or bitcoin.

In [90]:
# Share of fake articles that mention "trump" or "bitcoin" (or both).
# Counting the intersection directly avoids the hand-written inclusion-exclusion
# formula, which silently assumed that every bitcoin article is fake, and it
# divides by the number of fake articles rather than by a domain count.
fake_positions = {pos for pos, label in enumerate(df["type"]) if label == "fake"}
mentions_either = set(trump_indexes) | set(bitcoin_index)
total_fake_Trump_bitcoin = len(fake_positions & mentions_either)
print(total_fake_Trump_bitcoin / len(fake_positions))

0.8580645161290322


# Task 4

In this section we use BeautifulSoup to scrape news articles. We use json to store the data in a .json file.

In [29]:
import requests
from bs4 import BeautifulSoup
import json

Define the letters for our group, and the main link. Get the content of the page.

In [19]:
# The 23-letter alphabet (no Q, X, Y) is written twice so the 10-letter window
# can wrap around past "Z"; 16 % 23 is the window's start offset
# (presumably derived from the group number — TODO confirm).
letters = "ABCDEFGHIJKLMNOPRSTUVWZABCDEFGHIJKLMNOPRSTUVWZ"[16%23:16%23+10]
main_link = "https://en.wikinews.org/wiki/Category:Politics_and_conflicts"
# Without a timeout requests can block forever; fail loudly on an HTTP error
# instead of parsing an error page as if it were the category page.
main = requests.get(main_link, timeout=30)
main.raise_for_status()
content = main.text

Find the link to the section where articles starting with 'A' are listed. The idea is to get all article links from this page, then use the 'next page' link to also get the articles from the following pages. This way, we avoid skipping articles, as we would if we used the subsections for 'B', 'C' and so on, since they only display 200 articles per page. So 'next page' is the way to go.

In [39]:
soup1 = BeautifulSoup(content, 'html.parser')
# Compare with == rather than `in 'A'`: the substring test is also true for an
# empty link text, so an unrelated link could end up first in the list.
category = [
    obj['href']
    for obj in soup1.find_all('a', {"class": "external text"}, href=True)
    if obj.get_text() == 'A'
]

While there is a 'next page' link, get the page and extract all the articles that are in our letter group. We do this by finding the 'div' tag that encapsulates the section of the web page that we want to look at, then finding all the 'a' tags, because they contain links. We then extract the 'href' from these. The 'next page' link will always be one of the first two links in this list. We then put the resulting links into a list if they are not the 'next page' or 'previous page' links, and if their first letter is in our letter group. We end up with a list of links to all the articles we want.

In [15]:
links = []
next_page = category
while next_page:
    target = next_page[0]
    # Only a leading scheme marks an absolute URL; a relative href may contain
    # "http" somewhere inside its query string.
    url = target if target.startswith('http') else 'https://en.wikinews.org' + target
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    page = response.text

    soup = BeautifulSoup(page, 'html.parser')
    parent = soup.find('div', {'id': 'mw-pages'})
    if parent is None:
        # No article listing on this page: nothing to collect and no next page.
        break
    article_objects = parent.find_all('a', href=True)

    # The 'next page' link, when present, is among the first two links.
    next_page = [obj['href'] for obj in article_objects[0:2]
                 if obj.get_text() == 'next page']

    article_links = []
    for obj in article_objects:
        title = obj.get_text()
        if title in ('next page', 'previous page'):
            continue
        # Guard against empty link text before indexing the first letter.
        if title and title[0] in letters:
            article_links.append(obj['href'])

    links.extend(article_links)

We wish to extract some metadata from all the articles. We want the headlines, dates and contents. So for each link, the headline is the same 'h1' in all articles. The content is all the 'p', 'h2' and 'li' tags, and the dates are usually a 'strong' tag. The date is the most complicated one, since it is not consistently done the same way. We check if we get a date from the 'strong' tag, and if we don't it is usually in a 'b' tag. We find this 'b' tag, then append it to the dates list only if we find it, and it has the right format, which we check with regular expression search. If the conditions don't apply, we give a None value to the list instead, as we cannot find the correct date (some articles have no dates). The date has been a particular challenge, but we are convinced (without checking through the whole data) that we solved it correctly most, if not all, of the time. This cell will take multiple minutes to run. It took over 6 minutes for us.

In [None]:
headlines = []
dates = []
contents = []

for link in links:
    # A timeout keeps one stalled request from hanging the whole (long) run.
    response = requests.get('https://en.wikinews.org' + link, timeout=30)
    if not response.ok:
        # Skip pages that cannot be fetched instead of parsing an error page.
        continue
    page = response.text
    soup = BeautifulSoup(page, 'html.parser')

    heading = soup.find('h1', {"id": "firstHeading"})
    parent = soup.find('div', {"class": "mw-parser-output"})
    if heading is None or parent is None:
        # Page lacks the expected layout; skip it entirely so the three
        # lists stay aligned and minutes of scraping are not lost to a crash.
        continue

    headline = heading.get_text()
    date = parent.find('strong', {'class': 'published'})
    content = [obj.get_text().rstrip('\n')
               for obj in parent.find_all(['p', 'h2', 'li'])]

    if date is None:
        # Fallback: the first <b> tag, accepted only if its text contains a digit.
        fallback = parent.find('b')
        if fallback and re.search(r'\w*[0-9]+', fallback.get_text()):
            date_text = fallback.get_text()
        else:
            date_text = None
    else:
        date_text = date.get_text()

    headlines.append(headline)
    dates.append(date_text)
    # The first collected element is dropped, as in the original extraction.
    contents.append('\n'.join(content[1:]))


Zip the data together and export it to a JSON file. This file can be imported into new scripts when needed.

In [None]:
# Export to JSON: one record per article.
data = [{'Headline': x, 'Date': y, 'Content': z}
        for x, y, z in zip(headlines, dates, contents)]

# ensure_ascii=False writes non-ASCII characters as-is, so the file must be
# opened as UTF-8 explicitly; the platform default encoding may not be able
# to encode them.
with open('data.json', 'w', encoding='utf-8') as fp:
    json.dump(data, fp, indent=2, ensure_ascii=False)
