In [4]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
from newspaper import Article
import string
import re

ARTICLE_LIMIT = 2000
LEFT_LEANING = 0
RIGHT_LEANING = 1

def get_number_rating(vote):
    switcher = {
    "Positive": 1,
    "SomewhatPositive": 0.5,
    "Neutral": 0,
    "SomewhatNegative": -0.5,
    "Negative": -1
    }
    return switcher.get(vote, "N/A")

def clean(article):
    cleaned_article = re.sub('[\n\t,]', '', article)
    return cleaned_article

def get_text(url):
    article = Article(url)
    article.download()
    article.parse()
    article_text = article.text
    if article_text == '':
        raise Exception("Could not locate article body")
    cleaned_article_text = clean(article_text)
    return cleaned_article_text

def add_to_df(article, bias, dataframe):
    article_and_bias = [article, bias]
    row = pd.Series(article_and_bias, index=dataframe.columns)
    new_dataframe = dataframe.append(row, ignore_index=True)
    return new_dataframe

all_articles = pd.read_csv('newsArticlesWithLabels.tsv', sep='\t')

democrat_ratings = all_articles.loc[:ARTICLE_LIMIT, 'democrat.vote']
republican_ratings = all_articles.loc[:ARTICLE_LIMIT, 'republican.vote']
urls = all_articles.loc[:ARTICLE_LIMIT, 'url']

errors = []
article_bias = pd.DataFrame(columns=['article', 'bias'])

for i in range(len(urls)):
    try:
        diff = get_number_rating(democrat_ratings[i])-get_number_rating(republican_ratings[i])
        if  diff > 0:
            article_bias = add_to_df(get_text(urls[i]), LEFT_LEANING, article_bias)
            print(f"SUCCESS ({i})")
        elif diff < 0:
            article_bias = add_to_df(get_text(urls[i]), RIGHT_LEANING, article_bias)
            print(f"SUCCESS ({i})")
    except Exception as e: 
        print(f"FAILURE ({i})")
        errors.append(e)

article_bias.to_csv("binary_bias.csv")
article_bias



['The following irresponsible police action reported by KHOU’s Rucks Russell should scare many. Here is the story as reported.A young 13 year old dancer Landry Thompson came to Houston from Oklahoma. She travelled with her two dance instructors over the weekend. The intent of their visit was to train all weekend with some of the best in dance industry.When they left the studio Saturday night they stopped at a gas station very exhausted as they searched for their lodging. They were trying to locate their hotel on the GPS. Out of nowhere they were surrounded by the Houston police.The Houston police dragged them out of the car and handcuffed them all.“I was kind of freaked out and surprised by it” said Landry Thompson.“They just pulled us out of the car. Put our hands behind out backs like we were criminals” said Landry’s dance instructor Emmanuel Hurd. “He asked me; who is the girl? She is my student. I said I have a notarized letter from her parents stating that I have full guardianship

In [2]:
binary_bias = pd.read_csv('binary_bias.csv')
binary_bias


Unnamed: 0.1,Unnamed: 0,article,bias
0,0,The following irresponsible police action repo...,0
1,1,SACRAMENTO — “Living in parallel universes” is...,1
2,2,"Co-host of MSNBC's ""The Cycle"" Touré joined Hu...",1
3,3,US Representative Michele Bachmann speaks duri...,1
4,4,The agency missed a Feb. 15 deadline to comple...,1
...,...,...,...
806,806,Olga Rudenko and Jesse SingalSpecial to USA TO...,0
807,807,Susan Page USA TODAYAmericans overwhelmingly s...,1
808,808,Story highlights Top diplomatic economic offic...,0
809,809,next Image 1 of 2prev Image 2 of 2A Republican...,1


In [80]:
len(article_bias)

823

In [19]:
from newspaper import Config

url = "http://online.wsj.com/news/articles/SB10001424052702304527504579169853503880212"
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.75'
config = Config()
config.browser_user_agent = user_agent

test = Article(url.strip())
test.download()
test.parse()
test.text

ArticleException: Article `download()` failed with 404 Client Error: Not Found for url: http://online.wsj.com/news/articles/SB10001424052702304527504579169853503880212 on URL http://online.wsj.com/news/articles/SB10001424052702304527504579169853503880212

In [34]:
url = "http://www.nytimes.com/2013/03/07/us/politics/cias-harsh-interrogations-pose-hurdles-for-john-brennan.html"

article = Article(url)
article.download()
article.parse()
article_text = article.text
cleaned_article_text = clean(article_text)
cleaned_article_text

'The agency missed a Feb. 15 deadline to complete a review of the report which has 35000 footnotes referring to 6 million documents from C.I.A. files. It now appears likely that the response offering the committee any factual corrections or broader judgments will be delayed until Mr. Brennan’s arrival.Because Mr. Obama famously said he preferred to look forward not back at his predecessor’s counterterrorism programs the Senate report is by far the most thorough examination of how the United States came to use nudity cold sleep deprivation stress positions wall-slamming and waterboarding methods it had long condemned as abuse or torture.Mr. Brennan will have to decide whether to support making a redacted version of the interrogation report public as the committee is likely to support after the C.I.A. completes its review and as a United Nations human rights adviser urged this week. Several Democratic senators and at least one Republican Senator John McCain of Arizona who was tortured as

In [36]:

URLs = []
for column in all_articles:
    URLs.append(all_articles[column][23])
URLs

['http://video.foxnews.com/v/2800623370001/are-media-casting-obama-as-disengaged/',
 'Opinion',
 0,
 'Democrat Scandals',
 'Civil Rights',
 'Negative',
 'SomewhatNegative']