# Import

In [2]:
from bs4 import BeautifulSoup
import datetime
import json
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import pickle
import requests
import time
from html.parser import HTMLParser
import lxml
from lxml.html.clean import Cleaner
import re
from random import randint

# Methods/Global

In [None]:
def clean_me(url, timeout=30):
    """Download a web page and return its visible text as a single line.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block the whole
        scraping loop indefinitely.

    Returns
    -------
    str
        The page text with <script>/<style> content removed, whitespace
        runs collapsed into single spaces and every '|' replaced by a space.
        The literal string 'none' is returned when the download fails, so
        callers that write one article per line keep their row alignment.
    """
    # Random 0-3 second pause before every request.
    time.sleep(randint(0, 3))

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
    }

    try:
        html = requests.get(url, headers=headers, timeout=timeout).text
    except Exception:
        # Best effort: any failure for a single URL yields the sentinel.
        # `Exception` (rather than a bare `except:`) lets KeyboardInterrupt
        # and SystemExit propagate so a long scrape can still be stopped.
        return 'none'

    soup = BeautifulSoup(html, "html.parser")
    # Remove all javascript and stylesheet code before extracting the text.
    for tag in soup(["script", "style"]):
        tag.extract()

    raw_text = soup.get_text()
    # Strip each line, then split lines on double spaces so that several
    # headlines sharing one line become separate chunks.
    stripped_lines = (line.strip() for line in raw_text.splitlines())
    chunks = (phrase.strip() for line in stripped_lines for phrase in line.split("  "))
    # Drop empty chunks and join the rest with single spaces: the result
    # contains no line breaks, so one article fits on one line of a text file.
    text = ' '.join(chunk for chunk in chunks if chunk)
    return text.replace('|', ' ')

# Load Cleaned Data

In [10]:
# Load the cleaned Kaggle fake-news pickle. The `with` block closes the file
# handle as soon as the object has been read.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(os.path.join('data', 'clean_kaggle_fakenews_train.pkl'), 'rb') as pkl_file:
    kaggle_fakenews_df = pickle.load(pkl_file)

In [12]:
# Load the cleaned HuffPost pickle. The `with` block closes the file handle
# as soon as the object has been read.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(os.path.join('data', 'clean_huff.pkl'), 'rb') as pkl_file:
    huff_df = pickle.load(pkl_file)

In [3]:
# Load the cleaned BuzzFeed fake-news pickle. The `with` block closes the
# file handle as soon as the object has been read.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(os.path.join('data', 'clean_buzzfeed_fakenews.pkl'), 'rb') as pkl_file:
    buzz_feed_fakenews_df = pickle.load(pkl_file)

In [None]:
# Parse the New York Times dump: every line containing 'URL:' starts a new
# article, and the non-blank lines that follow it (up to the next 'URL:' line)
# are appended to that article's text.
file_path = os.path.join('data', 'nytimes_news_articles.txt')
articles = []
urls = []
with open(file_path, 'r', encoding="utf-8") as file:
    for line in file:
        if 'URL:' in line:
            # The second whitespace-separated token is taken as the address.
            urls.append(line.split()[1])
            articles.append('')
        elif line.strip() and articles:
            # `line.strip()` skips blank lines: lines read from a file keep
            # their trailing '\n', so comparing against '' never matches.
            # `articles` being non-empty guards against body text that
            # appears before the first 'URL:' line.
            articles[-1] += line + ' '

new_york_times_df = pd.DataFrame({'total': articles})

# Kaggle Fakenews

In [101]:
# Combine every non-label column into one big string per row.
# A previously computed 'total' is left out as well, so re-running this cell
# does not concatenate the old 'total' into the new one.
text_columns = (
    kaggle_fakenews_df
    .drop(columns=['label'])
    .drop(columns=['total'], errors='ignore')
)
kaggle_fakenews_df['total'] = text_columns.values.sum(axis=1)
kaggle_fakenews_df.to_pickle("data/prep_kaggle_fakenews_train.pkl")

# Huffington Post

In [4]:
HUFF_SCRAPE_LIMIT = 6000  # number of leading links that get scraped
HUFF_KEEP_ROWS = 2167     # leading rows written to the prepared pickle
                          # NOTE(review): presumably the rows with usable scraped text; confirm

# errors='ignore' keeps the cell re-runnable once the columns are already gone.
huff_df.drop(columns=['date', 'category'], inplace=True, errors='ignore')

# Scrape articles from all the urls, one article per line.
# NOTE: the file is opened in append mode, so a second run adds its lines
# after those of any earlier run.
file_path = os.path.join('data', 'huff_articles.txt')
with open(file_path, "a", encoding="utf-8") as outF:
    for index, value in enumerate(huff_df['link'][:HUFF_SCRAPE_LIMIT]):
        outF.write(clean_me(value))
        outF.write("\n")
        if index % 100 == 0:
            print(datetime.datetime.now(), 'index', index)

# Read the scraped articles back; line i is matched to the row labelled i.
with open(file_path, 'r', encoding="utf-8") as file:
    scraped_lines = file.readlines()
huff_df['text'] = pd.Series(scraped_lines)
huff_df.fillna(' ', inplace=True)

# Concatenate every column except the link into one string per row.
# 'total' and 'label' are excluded too so that a re-run neither doubles the
# text nor tries to add the integer label to strings.
text_columns = (
    huff_df
    .drop(columns=['link'])
    .drop(columns=['total', 'label'], errors='ignore')
)
huff_df['total'] = text_columns.values.sum(axis=1)
huff_df['label'] = 0
huff_df.iloc[:HUFF_KEEP_ROWS].to_pickle("data/prep_huff.pkl")

# BuzzFeed Fakenews

In [None]:
# Scrape articles from all the urls, one article per line.
# NOTE: the file is opened in append mode, so a second run adds its lines
# after those of any earlier run.
file_path = os.path.join('data', 'buzzfeed_fake_articles.txt')
with open(file_path, "a", encoding="utf-8") as outF:
    for index, value in enumerate(buzz_feed_fakenews_df['url']):
        outF.write(clean_me(value))
        outF.write("\n")
        if index % 100 == 0:
            print(datetime.datetime.now(), 'index', index)

# Make a new author column and set it equal to the domain of the url
# (third '/'-separated piece, i.e. the host in 'scheme://host/path').
buzz_feed_fakenews_df['author'] = buzz_feed_fakenews_df['url'].map(lambda x: x.split('/')[2])

# Make a new text column from the scraped articles; line i is matched to the
# row labelled i.
with open(file_path, 'r', encoding="utf-8") as file:
    scraped_lines = file.readlines()
buzz_feed_fakenews_df['text'] = pd.Series(scraped_lines)

# Fill any null values with a blank string; every row is fake news, so the
# row is kept rather than dropped.
buzz_feed_fakenews_df.fillna(' ', inplace=True)

# Remove columns to prepare for merging with the kaggle dataset.
buzz_feed_fakenews_df.drop(columns=['url', 'fb_engagement', 'published_date', 'category'], inplace=True)

# Create a total column (a concatenation of every remaining column) and a label.
buzz_feed_fakenews_df['total'] = buzz_feed_fakenews_df.values.sum(axis=1)
buzz_feed_fakenews_df['label'] = 1

# Conform to the kaggle dataset column ordering.
kaggle_columns = kaggle_fakenews_df.columns
buzz_feed_fakenews_df = buzz_feed_fakenews_df.reindex(columns=kaggle_columns)

buzz_feed_fakenews_df.to_pickle("data/prep_buzzfeed_fakenews.pkl")

- The article text isn't in the dataset, but the URLs are.
- I scrape each article automatically from its URL and save it as one long string on its own line of a txt file, so each line is one article. I then read all the lines into a list and use that list as a new column called `text`.
- Every observation in this dataset is a fake news article, so I fill any nulls with a blank string to preserve the row.
- A new column called `label` is set to 1.
- `author` is set to the website's host name, because scraping the real author name from each website was too difficult.

# New York Times Articles

In [183]:
# Strip the line breaks embedded in every article body.
new_york_times_df['total'] = new_york_times_df['total'].str.replace('\n', '', regex=False)

# Every New York Times row gets label 0.
new_york_times_df['label'] = 0

# Persist the prepared frame.
new_york_times_df.to_pickle("data/prep_newyork_times.pkl")

The New York Times appeared on two lists of reliable news sources. I downloaded a separate dataset containing its articles, and extracting them wasn't tricky.

# Final Data Set

In [22]:
# Load every prepared frame and stack their 'total' and 'label' columns.
# pd.read_pickle opens and closes each file itself, so no handles are leaked.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
prep_files = [
    'prep_kaggle_fakenews_train.pkl',
    'prep_huff.pkl',
    'prep_buzzfeed_fakenews.pkl',
    'prep_newyork_times.pkl',
]
tmp = [pd.read_pickle(os.path.join('data', name)) for name in prep_files]

# The source frames keep their own row labels (no ignore_index), so the index
# of final_df is not guaranteed to be unique.
final_df = pd.concat([frame[['total', 'label']] for frame in tmp])
final_df.to_pickle("data/prep_final_df.pkl")

In [None]:
# Text normalisation for the Kaggle 'total' column: lower-case, strip
# punctuation, drop stop words. Python strings have no built-in methods for
# the last two steps, so small helpers are defined here.
# NOTE(review): this is a deliberately small English stop-word list; replace
# it with a fuller one if a library providing it becomes a dependency.
STOP_WORDS = frozenset({
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'for',
    'from', 'had', 'has', 'have', 'he', 'her', 'his', 'i', 'if', 'in', 'is',
    'it', 'its', 'me', 'my', 'not', 'of', 'on', 'or', 'our', 'she', 'so',
    'that', 'the', 'their', 'them', 'then', 'there', 'these', 'they', 'this',
    'to', 'was', 'we', 'were', 'which', 'who', 'will', 'with', 'you', 'your',
})


def remove_punctuation(text):
    """Return `text` without characters that are neither word characters nor whitespace."""
    return re.sub(r'[^\w\s]', '', text)


def remove_stop_words(text):
    """Return `text` with STOP_WORDS removed; words are re-joined with single spaces."""
    return ' '.join(word for word in text.split() if word not in STOP_WORDS)


# Lower-casing and punctuation removal run first so that stop words are
# matched regardless of case or attached punctuation ("The", "it.").
# The result is a new Series; kaggle_fakenews_df itself is left untouched.
kaggle_total_clean = (
    kaggle_fakenews_df['total']
    .str.lower()
    .map(remove_punctuation)
    .map(remove_stop_words)
)
kaggle_total_clean.head()






6. The New Yorker
1. The New York Times
2. The Wall Street Journal
5. The Economist
4. BBC
- huffpost
3. The Washington Post
9. The Atlantic

In [None]:
# Smoke test: send one request with the browser-like User-Agent and show the
# response. `requests` is already imported in the notebook's import cell.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
}
# A timeout keeps the cell from hanging if the server never answers.
requests.get('https://www.google.com/', headers=headers, timeout=30)