In [49]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
from newspaper import Article
from newspaper import Config
import string
import re

# Row cutoff applied when slicing the labelled-articles table before scraping.
ARTICLE_LIMIT = 10000
# Integer class labels stored in the 'bias' column of the output dataset:
# LEFT_LEANING when the democrat score exceeds the republican score,
# RIGHT_LEANING when it is lower.
LEFT_LEANING = 0
RIGHT_LEANING = 1

def get_number_rating(vote):
    """Translate a vote label into its numeric score.

    Args:
        vote: One of "Positive", "SomewhatPositive", "Neutral",
            "SomewhatNegative" or "Negative".

    Returns:
        1, 0.5, 0, -0.5 or -1 for the labels above (in that order), or the
        string "N/A" for any other value. Note that "N/A" is a string, so
        doing arithmetic on it raises TypeError.
    """
    scores = dict(
        Positive=1,
        SomewhatPositive=0.5,
        Neutral=0,
        SomewhatNegative=-0.5,
        Negative=-1,
    )
    if vote in scores:
        return scores[vote]
    return "N/A"

def clean(article):
    """Normalise raw article text for storage.

    Every newline, tab and comma is replaced by a single space, and each
    occurrence of the literal word 'Advertisement' is replaced by a space.

    Args:
        article: The article text as a string.

    Returns:
        The cleaned string.
    """
    return re.sub('[\n\t,]', ' ', article).replace('Advertisement', ' ')

def get_text(url):
    """Download the article at `url` and return its cleaned body text.

    The request is made with a desktop-browser user agent set on a
    newspaper `Config`.

    Args:
        url: Address of the article to fetch.

    Returns:
        The parsed article text after it has been passed through `clean`.

    Raises:
        Exception: If parsing yields an empty article body.
        Errors raised by `Article.download()` / `Article.parse()` are not
        caught here and propagate to the caller.
    """
    browser_config = Config()
    browser_config.browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36 Edg/89.0.774.77'

    page = Article(url, config=browser_config)
    page.download()
    page.parse()

    body = page.text
    if body != '':
        return clean(body)
    raise Exception("Could not locate article body")

def add_to_df(article, bias, dataframe):
    """Return a new DataFrame equal to `dataframe` plus one (article, bias) row.

    `DataFrame.append` was removed in pandas 2.0, so the row is added with
    `pd.concat` instead. The input frame is never modified.

    Args:
        article: Article text for the new row.
        bias: Bias label for the new row.
        dataframe: Frame with exactly two columns; the new values are
            assigned to them in order (article first, bias second).

    Returns:
        A new DataFrame with the row added last and a fresh 0..n-1 index.

    Raises:
        ValueError: If `dataframe` does not have exactly two columns.
    """
    row = pd.DataFrame([[article, bias]], columns=dataframe.columns)
    if dataframe.empty:
        # Concatenating with an empty frame triggers a dtype FutureWarning on
        # recent pandas; the one-row frame already is the desired result.
        return row
    return pd.concat([dataframe, row], ignore_index=True)

# Load the labelled article table (tab-separated).
all_articles = pd.read_csv('newsArticlesWithLabels.tsv', sep='\t')

# Use positional slicing: `.loc[:ARTICLE_LIMIT]` is end-inclusive and would
# select ARTICLE_LIMIT + 1 rows, whereas `.iloc` stops before ARTICLE_LIMIT.
limited_articles = all_articles.iloc[:ARTICLE_LIMIT]
democrat_ratings = limited_articles['democrat.vote']
republican_ratings = limited_articles['republican.vote']
urls = limited_articles['url']

errors = []    # exceptions raised while rating or scraping, in encounter order
records = []   # one {'article', 'bias'} dict per successfully scraped article

for i, (url, democrat_vote, republican_vote) in enumerate(
        zip(urls, democrat_ratings, republican_ratings)):
    try:
        # An unrecognised vote maps to the string "N/A"; the subtraction then
        # raises TypeError and the row is logged as a failure below.
        diff = get_number_rating(democrat_vote) - get_number_rating(republican_vote)
        if diff > 0:
            bias = LEFT_LEANING
        elif diff < 0:
            bias = RIGHT_LEANING
        else:
            # Equal scores carry no binary label; skip without downloading.
            continue
        records.append({'article': get_text(url), 'bias': bias})
        print(f"({len(records)-1}) SUCCESS ({i})")
    except Exception as e:
        print(f"FAILURE ({i})")
        errors.append(e)

# Build the frame once from the collected records instead of growing it
# row by row, which copies the whole frame on every insert.
article_bias = pd.DataFrame(records, columns=['article', 'bias'])
article_bias.to_csv("binary_bias.csv")
article_bias

FAILURE (6)
(0) SUCCESS (12)
(1) SUCCESS (17)
(2) SUCCESS (18)
(3) SUCCESS (20)
(4) SUCCESS (21)
FAILURE (23)
(5) SUCCESS (24)
(6) SUCCESS (25)
(7) SUCCESS (26)
(8) SUCCESS (27)
FAILURE (29)
FAILURE (30)
(9) SUCCESS (31)
FAILURE (32)
FAILURE (33)
(10) SUCCESS (34)
(11) SUCCESS (36)
(12) SUCCESS (37)
(13) SUCCESS (38)
(14) SUCCESS (39)
(15) SUCCESS (40)
(16) SUCCESS (42)
(17) SUCCESS (43)
FAILURE (44)
(18) SUCCESS (45)
(19) SUCCESS (47)
(20) SUCCESS (48)
(21) SUCCESS (50)
FAILURE (51)
(22) SUCCESS (52)
(23) SUCCESS (53)
(24) SUCCESS (55)
(25) SUCCESS (56)
(26) SUCCESS (57)
(27) SUCCESS (60)
(28) SUCCESS (62)
(29) SUCCESS (64)
FAILURE (65)
FAILURE (69)
(30) SUCCESS (72)
(31) SUCCESS (73)
(32) SUCCESS (75)
FAILURE (77)
FAILURE (78)
(33) SUCCESS (79)
(34) SUCCESS (82)
FAILURE (83)
FAILURE (87)
FAILURE (89)
(35) SUCCESS (90)
(36) SUCCESS (92)
(37) SUCCESS (96)
(38) SUCCESS (97)
(39) SUCCESS (99)
(40) SUCCESS (102)
FAILURE (103)
(41) SUCCESS (104)
FAILURE (108)
(42) SUCCESS (111)
(43) SUCCES



(321) SUCCESS (831)
(322) SUCCESS (832)
FAILURE (833)
(323) SUCCESS (834)
(324) SUCCESS (835)
(325) SUCCESS (837)
(326) SUCCESS (838)
(327) SUCCESS (841)
FAILURE (842)
(328) SUCCESS (844)
FAILURE (846)
(329) SUCCESS (847)
FAILURE (848)
(330) SUCCESS (849)
(331) SUCCESS (850)
(332) SUCCESS (851)
(333) SUCCESS (852)
(334) SUCCESS (853)
(335) SUCCESS (854)
(336) SUCCESS (855)
(337) SUCCESS (856)
(338) SUCCESS (857)
(339) SUCCESS (858)
(340) SUCCESS (859)
FAILURE (860)
(341) SUCCESS (861)
(342) SUCCESS (862)
(343) SUCCESS (864)
(344) SUCCESS (865)
(345) SUCCESS (866)
FAILURE (867)
(346) SUCCESS (868)
(347) SUCCESS (869)
FAILURE (870)
(348) SUCCESS (871)
FAILURE (872)
(349) SUCCESS (874)
(350) SUCCESS (875)
(351) SUCCESS (876)
FAILURE (877)
(352) SUCCESS (878)
(353) SUCCESS (880)
FAILURE (881)
(354) SUCCESS (882)
(355) SUCCESS (883)
(356) SUCCESS (884)
(357) SUCCESS (885)
(358) SUCCESS (887)
(359) SUCCESS (888)
(360) SUCCESS (890)
(361) SUCCESS (891)
(362) SUCCESS (892)
(363) SUCCESS (893)




(579) SUCCESS (1214)
(580) SUCCESS (1216)
(581) SUCCESS (1217)
(582) SUCCESS (1218)
(583) SUCCESS (1219)
(584) SUCCESS (1220)
(585) SUCCESS (1221)
(586) SUCCESS (1224)
(587) SUCCESS (1225)
(588) SUCCESS (1226)
(589) SUCCESS (1227)
(590) SUCCESS (1231)
(591) SUCCESS (1232)
(592) SUCCESS (1233)
(593) SUCCESS (1237)
(594) SUCCESS (1238)
(595) SUCCESS (1240)
(596) SUCCESS (1241)
(597) SUCCESS (1242)
(598) SUCCESS (1255)
(599) SUCCESS (1256)
(600) SUCCESS (1257)
FAILURE (1258)
(601) SUCCESS (1259)
(602) SUCCESS (1260)
FAILURE (1261)
(603) SUCCESS (1265)
(604) SUCCESS (1269)
(605) SUCCESS (1273)
(606) SUCCESS (1274)
(607) SUCCESS (1276)
(608) SUCCESS (1286)
(609) SUCCESS (1292)
(610) SUCCESS (1300)
FAILURE (1303)
(611) SUCCESS (1304)
(612) SUCCESS (1305)
(613) SUCCESS (1307)
(614) SUCCESS (1314)
(615) SUCCESS (1315)
(616) SUCCESS (1320)
(617) SUCCESS (1327)
(618) SUCCESS (1328)
(619) SUCCESS (1333)
(620) SUCCESS (1337)
(621) SUCCESS (1347)
(622) SUCCESS (1348)
(623) SUCCESS (1349)
(624) SUCC



(763) SUCCESS (1676)
(764) SUCCESS (1680)
(765) SUCCESS (1683)
(766) SUCCESS (1684)
(767) SUCCESS (1685)
(768) SUCCESS (1688)
(769) SUCCESS (1693)
(770) SUCCESS (1694)
(771) SUCCESS (1696)
(772) SUCCESS (1697)
(773) SUCCESS (1698)
(774) SUCCESS (1699)
FAILURE (1700)
(775) SUCCESS (1701)
(776) SUCCESS (1702)
(777) SUCCESS (1704)
(778) SUCCESS (1705)
(779) SUCCESS (1706)
(780) SUCCESS (1707)
(781) SUCCESS (1709)
(782) SUCCESS (1711)
(783) SUCCESS (1712)
(784) SUCCESS (1713)
(785) SUCCESS (1717)
(786) SUCCESS (1718)
FAILURE (1719)
(787) SUCCESS (1720)
(788) SUCCESS (1723)
(789) SUCCESS (1724)
(790) SUCCESS (1725)
(791) SUCCESS (1727)
(792) SUCCESS (1733)
(793) SUCCESS (1735)
(794) SUCCESS (1739)
(795) SUCCESS (1740)
(796) SUCCESS (1741)
(797) SUCCESS (1742)
FAILURE (1743)
(798) SUCCESS (1746)
(799) SUCCESS (1747)
(800) SUCCESS (1750)
(801) SUCCESS (1752)
(802) SUCCESS (1753)
(803) SUCCESS (1754)
(804) SUCCESS (1759)
(805) SUCCESS (1760)
(806) SUCCESS (1762)
(807) SUCCESS (1767)
(808) SUCC

FAILURE (2469)
(1106) SUCCESS (2474)
(1107) SUCCESS (2475)
FAILURE (2479)
(1108) SUCCESS (2481)
(1109) SUCCESS (2490)
FAILURE (2491)
(1110) SUCCESS (2492)
(1111) SUCCESS (2494)
(1112) SUCCESS (2495)
(1113) SUCCESS (2496)
FAILURE (2498)
(1114) SUCCESS (2499)
(1115) SUCCESS (2502)
FAILURE (2504)
(1116) SUCCESS (2507)
(1117) SUCCESS (2511)
(1118) SUCCESS (2512)
(1119) SUCCESS (2515)
(1120) SUCCESS (2520)
(1121) SUCCESS (2523)
(1122) SUCCESS (2524)
(1123) SUCCESS (2526)
(1124) SUCCESS (2528)
(1125) SUCCESS (2530)
(1126) SUCCESS (2532)
(1127) SUCCESS (2533)
FAILURE (2534)
(1128) SUCCESS (2536)
(1129) SUCCESS (2538)
(1130) SUCCESS (2539)
(1131) SUCCESS (2540)
(1132) SUCCESS (2541)
(1133) SUCCESS (2542)
(1134) SUCCESS (2543)
(1135) SUCCESS (2544)
(1136) SUCCESS (2545)
(1137) SUCCESS (2547)
(1138) SUCCESS (2548)
(1139) SUCCESS (2549)
(1140) SUCCESS (2550)
(1141) SUCCESS (2551)
(1142) SUCCESS (2553)
(1143) SUCCESS (2555)
(1144) SUCCESS (2556)
FAILURE (2557)
(1145) SUCCESS (2558)
(1146) SUCCESS 

(1447) SUCCESS (3226)
FAILURE (3231)
(1448) SUCCESS (3236)
(1449) SUCCESS (3239)
FAILURE (3241)
(1450) SUCCESS (3247)
(1451) SUCCESS (3248)
(1452) SUCCESS (3249)
(1453) SUCCESS (3250)
(1454) SUCCESS (3251)
(1455) SUCCESS (3255)
FAILURE (3258)
FAILURE (3259)
(1456) SUCCESS (3260)
(1457) SUCCESS (3262)
(1458) SUCCESS (3269)
FAILURE (3270)
(1459) SUCCESS (3271)
(1460) SUCCESS (3273)
FAILURE (3275)
(1461) SUCCESS (3276)
(1462) SUCCESS (3277)
FAILURE (3279)
(1463) SUCCESS (3280)
(1464) SUCCESS (3282)
(1465) SUCCESS (3284)
(1466) SUCCESS (3290)
FAILURE (3292)
(1467) SUCCESS (3293)
(1468) SUCCESS (3294)
(1469) SUCCESS (3295)
FAILURE (3298)
(1470) SUCCESS (3299)
(1471) SUCCESS (3301)
FAILURE (3303)
FAILURE (3304)
(1472) SUCCESS (3305)
(1473) SUCCESS (3307)
FAILURE (3308)
FAILURE (3309)
(1474) SUCCESS (3310)
FAILURE (3311)
(1475) SUCCESS (3313)
(1476) SUCCESS (3314)
FAILURE (3316)
(1477) SUCCESS (3319)
FAILURE (3320)
FAILURE (3322)
(1478) SUCCESS (3323)
(1479) SUCCESS (3326)
(1480) SUCCESS (332



(1566) SUCCESS (3570)
(1567) SUCCESS (3572)
(1568) SUCCESS (3573)
(1569) SUCCESS (3574)
(1570) SUCCESS (3579)
(1571) SUCCESS (3583)
(1572) SUCCESS (3584)
(1573) SUCCESS (3585)
(1574) SUCCESS (3587)
(1575) SUCCESS (3588)
FAILURE (3590)
(1576) SUCCESS (3592)
(1577) SUCCESS (3593)
(1578) SUCCESS (3596)
(1579) SUCCESS (3597)
(1580) SUCCESS (3598)
(1581) SUCCESS (3599)
(1582) SUCCESS (3600)
FAILURE (3604)
(1583) SUCCESS (3607)
(1584) SUCCESS (3608)
FAILURE (3610)
(1585) SUCCESS (3612)
(1586) SUCCESS (3615)
(1587) SUCCESS (3616)
(1588) SUCCESS (3617)
(1589) SUCCESS (3618)
(1590) SUCCESS (3621)
(1591) SUCCESS (3629)
(1592) SUCCESS (3634)
(1593) SUCCESS (3637)
(1594) SUCCESS (3638)
(1595) SUCCESS (3640)
(1596) SUCCESS (3644)
(1597) SUCCESS (3645)
(1598) SUCCESS (3646)
(1599) SUCCESS (3648)
(1600) SUCCESS (3651)
(1601) SUCCESS (3653)
FAILURE (3655)
(1602) SUCCESS (3657)
(1603) SUCCESS (3658)
(1604) SUCCESS (3660)
(1605) SUCCESS (3662)
(1606) SUCCESS (3669)
(1607) SUCCESS (3670)
(1608) SUCCESS (

(1899) SUCCESS (4306)
(1900) SUCCESS (4307)
FAILURE (4308)
FAILURE (4317)
(1901) SUCCESS (4321)
FAILURE (4324)
(1902) SUCCESS (4327)
(1903) SUCCESS (4334)
(1904) SUCCESS (4337)
FAILURE (4338)
FAILURE (4339)
FAILURE (4355)
(1905) SUCCESS (4357)
(1906) SUCCESS (4358)
(1907) SUCCESS (4359)
(1908) SUCCESS (4362)
(1909) SUCCESS (4363)
(1910) SUCCESS (4364)
(1911) SUCCESS (4366)
(1912) SUCCESS (4372)
(1913) SUCCESS (4381)
(1914) SUCCESS (4382)
(1915) SUCCESS (4390)
(1916) SUCCESS (4392)
(1917) SUCCESS (4393)
FAILURE (4395)
(1918) SUCCESS (4399)
(1919) SUCCESS (4404)
(1920) SUCCESS (4408)
(1921) SUCCESS (4414)
(1922) SUCCESS (4417)
(1923) SUCCESS (4419)
FAILURE (4422)
(1924) SUCCESS (4425)
(1925) SUCCESS (4427)
(1926) SUCCESS (4437)
(1927) SUCCESS (4447)
(1928) SUCCESS (4452)
(1929) SUCCESS (4455)
(1930) SUCCESS (4456)
(1931) SUCCESS (4457)
(1932) SUCCESS (4458)
(1933) SUCCESS (4461)
FAILURE (4467)
(1934) SUCCESS (4468)
FAILURE (4483)
(1935) SUCCESS (4488)
(1936) SUCCESS (4499)
(1937) SUCCESS

(2219) SUCCESS (5505)
(2220) SUCCESS (5506)
(2221) SUCCESS (5512)
(2222) SUCCESS (5517)
(2223) SUCCESS (5518)
(2224) SUCCESS (5520)
FAILURE (5521)
(2225) SUCCESS (5530)
FAILURE (5534)
(2226) SUCCESS (5538)
FAILURE (5551)
(2227) SUCCESS (5557)
(2228) SUCCESS (5559)
(2229) SUCCESS (5563)
FAILURE (5570)
(2230) SUCCESS (5572)
(2231) SUCCESS (5574)
(2232) SUCCESS (5575)
(2233) SUCCESS (5582)
(2234) SUCCESS (5592)
FAILURE (5606)
(2235) SUCCESS (5612)
(2236) SUCCESS (5615)
(2237) SUCCESS (5616)
(2238) SUCCESS (5619)
(2239) SUCCESS (5621)
(2240) SUCCESS (5626)
(2241) SUCCESS (5628)
(2242) SUCCESS (5631)
FAILURE (5633)
(2243) SUCCESS (5636)
(2244) SUCCESS (5639)
(2245) SUCCESS (5644)
(2246) SUCCESS (5646)
(2247) SUCCESS (5649)
(2248) SUCCESS (5661)
(2249) SUCCESS (5666)
FAILURE (5678)
FAILURE (5679)
(2250) SUCCESS (5682)
(2251) SUCCESS (5686)
(2252) SUCCESS (5687)
(2253) SUCCESS (5690)
(2254) SUCCESS (5703)
(2255) SUCCESS (5704)
(2256) SUCCESS (5705)
FAILURE (5709)
FAILURE (5712)
(2257) SUCCESS

(2546) SUCCESS (6868)
(2547) SUCCESS (6869)
(2548) SUCCESS (6871)
FAILURE (6873)
FAILURE (6874)
FAILURE (6875)
(2549) SUCCESS (6876)
(2550) SUCCESS (6884)
(2551) SUCCESS (6889)
(2552) SUCCESS (6891)
(2553) SUCCESS (6892)
(2554) SUCCESS (6896)
(2555) SUCCESS (6898)
(2556) SUCCESS (6904)
(2557) SUCCESS (6906)
(2558) SUCCESS (6907)
(2559) SUCCESS (6911)
(2560) SUCCESS (6912)
(2561) SUCCESS (6913)
(2562) SUCCESS (6918)
(2563) SUCCESS (6923)
FAILURE (6926)
(2564) SUCCESS (6927)
(2565) SUCCESS (6929)
(2566) SUCCESS (6938)
(2567) SUCCESS (6943)
(2568) SUCCESS (6954)
(2569) SUCCESS (6956)
(2570) SUCCESS (6964)
(2571) SUCCESS (6967)
(2572) SUCCESS (6968)
(2573) SUCCESS (6993)
(2574) SUCCESS (6994)
(2575) SUCCESS (6998)
(2576) SUCCESS (7013)
(2577) SUCCESS (7015)
(2578) SUCCESS (7028)
(2579) SUCCESS (7051)
FAILURE (7052)
(2580) SUCCESS (7056)
(2581) SUCCESS (7058)
(2582) SUCCESS (7060)
(2583) SUCCESS (7062)
FAILURE (7063)
FAILURE (7064)
(2584) SUCCESS (7069)
(2585) SUCCESS (7072)
(2586) SUCCESS 



(2633) SUCCESS (7261)
(2634) SUCCESS (7263)
(2635) SUCCESS (7264)
(2636) SUCCESS (7265)
FAILURE (7266)
(2637) SUCCESS (7267)
FAILURE (7268)
(2638) SUCCESS (7269)
(2639) SUCCESS (7270)
FAILURE (7271)
FAILURE (7272)
(2640) SUCCESS (7273)
(2641) SUCCESS (7274)
(2642) SUCCESS (7276)
(2643) SUCCESS (7277)
(2644) SUCCESS (7278)
(2645) SUCCESS (7280)
FAILURE (7281)
FAILURE (7283)
(2646) SUCCESS (7284)
(2647) SUCCESS (7287)
FAILURE (7288)
(2648) SUCCESS (7289)
(2649) SUCCESS (7290)
(2650) SUCCESS (7291)
(2651) SUCCESS (7293)
(2652) SUCCESS (7294)
(2653) SUCCESS (7295)
(2654) SUCCESS (7296)
FAILURE (7297)
(2655) SUCCESS (7298)
(2656) SUCCESS (7299)
FAILURE (7300)
(2657) SUCCESS (7301)
(2658) SUCCESS (7302)
(2659) SUCCESS (7303)
(2660) SUCCESS (7304)
(2661) SUCCESS (7305)
(2662) SUCCESS (7306)
FAILURE (7307)
(2663) SUCCESS (7310)
(2664) SUCCESS (7312)
(2665) SUCCESS (7313)
(2666) SUCCESS (7315)
(2667) SUCCESS (7318)
FAILURE (7319)
(2668) SUCCESS (7321)
FAILURE (7322)
FAILURE (7324)
FAILURE (7326

(2945) SUCCESS (7742)
(2946) SUCCESS (7744)
(2947) SUCCESS (7745)
(2948) SUCCESS (7746)
FAILURE (7748)
(2949) SUCCESS (7749)
FAILURE (7750)
FAILURE (7751)
FAILURE (7752)
FAILURE (7753)
(2950) SUCCESS (7754)
(2951) SUCCESS (7755)
(2952) SUCCESS (7756)
(2953) SUCCESS (7757)
(2954) SUCCESS (7758)
(2955) SUCCESS (7759)
FAILURE (7761)
(2956) SUCCESS (7763)
(2957) SUCCESS (7764)
(2958) SUCCESS (7766)
(2959) SUCCESS (7767)
FAILURE (7768)
(2960) SUCCESS (7769)
(2961) SUCCESS (7770)
FAILURE (7771)
(2962) SUCCESS (7772)
(2963) SUCCESS (7773)
(2964) SUCCESS (7774)
(2965) SUCCESS (7775)
FAILURE (7776)
(2966) SUCCESS (7777)
FAILURE (7778)
(2967) SUCCESS (7779)
(2968) SUCCESS (7780)
(2969) SUCCESS (7781)
(2970) SUCCESS (7782)
(2971) SUCCESS (7783)
(2972) SUCCESS (7784)
(2973) SUCCESS (7787)
(2974) SUCCESS (7807)
(2975) SUCCESS (7833)
(2976) SUCCESS (7834)
FAILURE (7840)
(2977) SUCCESS (7841)
(2978) SUCCESS (7849)
(2979) SUCCESS (7861)
(2980) SUCCESS (7867)
(2981) SUCCESS (7886)
(2982) SUCCESS (7891)

(3263) SUCCESS (8882)
(3264) SUCCESS (8883)
FAILURE (8885)
(3265) SUCCESS (8893)
(3266) SUCCESS (8895)
(3267) SUCCESS (8900)
(3268) SUCCESS (8901)
(3269) SUCCESS (8903)
(3270) SUCCESS (8904)
(3271) SUCCESS (8905)
(3272) SUCCESS (8912)
FAILURE (8913)
(3273) SUCCESS (8915)
(3274) SUCCESS (8916)
FAILURE (8917)
FAILURE (8925)
FAILURE (8926)
(3275) SUCCESS (8927)
(3276) SUCCESS (8929)
FAILURE (8931)
(3277) SUCCESS (8938)
(3278) SUCCESS (8942)
FAILURE (8949)
FAILURE (8951)
(3279) SUCCESS (8954)
FAILURE (8959)
(3280) SUCCESS (8960)
(3281) SUCCESS (8962)
(3282) SUCCESS (8964)
(3283) SUCCESS (8969)
(3284) SUCCESS (8971)
(3285) SUCCESS (8972)
(3286) SUCCESS (8973)
FAILURE (8974)
(3287) SUCCESS (8980)
FAILURE (8981)
(3288) SUCCESS (8982)
(3289) SUCCESS (8986)
FAILURE (8988)
(3290) SUCCESS (8989)
(3291) SUCCESS (8991)
(3292) SUCCESS (8992)
(3293) SUCCESS (9003)
(3294) SUCCESS (9006)
(3295) SUCCESS (9007)
(3296) SUCCESS (9020)
(3297) SUCCESS (9021)
(3298) SUCCESS (9022)
(3299) SUCCESS (9023)
(3300)

(3591) SUCCESS (9937)
(3592) SUCCESS (9939)
(3593) SUCCESS (9940)
(3594) SUCCESS (9944)
FAILURE (9946)
(3595) SUCCESS (9947)
(3596) SUCCESS (9948)
(3597) SUCCESS (9949)
(3598) SUCCESS (9951)
(3599) SUCCESS (9952)
(3600) SUCCESS (9955)
(3601) SUCCESS (9956)
(3602) SUCCESS (9957)
(3603) SUCCESS (9958)
(3604) SUCCESS (9959)
(3605) SUCCESS (9963)
(3606) SUCCESS (9974)
(3607) SUCCESS (9976)
(3608) SUCCESS (9978)
(3609) SUCCESS (9979)
FAILURE (9980)
(3610) SUCCESS (9981)
(3611) SUCCESS (9984)
FAILURE (9994)
FAILURE (9995)


Unnamed: 0,article,bias
0,The following irresponsible police...,0
1,SACRAMENTO — “Living in parallel universes ” i...,1
2,"Co-host of MSNBC's ""The Cycle"" Touré joined Hu...",1
3,US Representative Michele Bachmann speaks duri...,1
4,The agency missed a Feb. 15 deadline to comple...,1
...,...,...
3607,Rep. Allyson Schwartz (D-PA.) honorary co-cha...,0
3608,Back to previous page Divided government requ...,1
3609,And Republican Florida Governor Rick Scott has...,0
3610,In less than two weeks a cleaver known as the...,0


In [50]:
# Reload the saved dataset from disk and show the article text stored in
# the row labelled 6.
binary_bias = pd.read_csv('binary_bias.csv')
binary_bias.loc[6, 'article']


'From the Media Research Center  ABC  CBS and NBC have so far refused to report the latest bombshell in the IRS scandal - a newly released list from the agency that showed it flagged political groups for "anti-Obama rhetoric." On September 18 USA Today  in a front page story  reported the following: "Newly uncovered IRS documents show the agency flagged political groups based on the content of their literature  raising concerns specifically about \'anti-Obama rhetoric \' inflammatory language and \'emotional\' statements made by non-profits seeking tax-exempt status."  Not only have ABC  CBS and NBC not reported this story they\'ve flat out stopped covering the IRS scandal on their evening and morning shows. It\'s been 85 days since ABC last touched the story on June 26. NBC hasn\'t done a report for 84 days and CBS last mentioned the IRS scandal 56 days ago on July 24.  The article by Gregory Korte went on to report: "The internal 2011 documents  obtained by USA TODAY  list 162 groups

In [42]:
# Number of rows currently held in article_bias.
# NOTE(review): the output shown for this cell carries an earlier execution
# count than the scraping cell, so it likely reflects a smaller run — re-run to confirm.
len(article_bias)

40

In [19]:
from newspaper import Config

# Single-URL check: try to download and parse one WSJ article using a
# desktop-browser user agent.
url = "http://online.wsj.com/news/articles/SB10001424052702304527504579169853503880212"
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.75'
config = Config()
config.browser_user_agent = user_agent

# The config must be handed to Article; without it the custom user agent
# configured above is never used for the request.
test = Article(url.strip(), config=config)
test.download()
test.parse()
test.text

ArticleException: Article `download()` failed with 404 Client Error: Not Found for url: http://online.wsj.com/news/articles/SB10001424052702304527504579169853503880212 on URL http://online.wsj.com/news/articles/SB10001424052702304527504579169853503880212

In [34]:
# Manual spot check of the extraction steps on a single NYT article.
# Unlike get_text, this builds the Article without a custom Config and does
# not raise when the parsed text is empty.
url = "http://www.nytimes.com/2013/03/07/us/politics/cias-harsh-interrogations-pose-hurdles-for-john-brennan.html"

article = Article(url)
article.download()  # performs the HTTP request for the page
article.parse()     # must run after download() and before reading .text
article_text = article.text
cleaned_article_text = clean(article_text)  # strip newlines/tabs/commas and 'Advertisement'
cleaned_article_text

'The agency missed a Feb. 15 deadline to complete a review of the report which has 35000 footnotes referring to 6 million documents from C.I.A. files. It now appears likely that the response offering the committee any factual corrections or broader judgments will be delayed until Mr. Brennan’s arrival.Because Mr. Obama famously said he preferred to look forward not back at his predecessor’s counterterrorism programs the Senate report is by far the most thorough examination of how the United States came to use nudity cold sleep deprivation stress positions wall-slamming and waterboarding methods it had long condemned as abuse or torture.Mr. Brennan will have to decide whether to support making a redacted version of the interrogation report public as the committee is likely to support after the C.I.A. completes its review and as a United Nations human rights adviser urged this week. Several Democratic senators and at least one Republican Senator John McCain of Arizona who was tortured as

In [36]:

# Collect the value of every column for the row labelled 23 so the whole
# record can be inspected at once (despite the name, this is not only URLs).
URLs = [all_articles[column][23] for column in all_articles]
URLs

['http://video.foxnews.com/v/2800623370001/are-media-casting-obama-as-disengaged/',
 'Opinion',
 0,
 'Democrat Scandals',
 'Civil Rights',
 'Negative',
 'SomewhatNegative']

In [36]:
# Show the exceptions collected by the scraping loop; one entry is appended
# for every "FAILURE (i)" line it prints.
errors

[newspaper.article.ArticleException('Article `download()` failed with 404 Client Error: Not Found for url: https://www.foxnews.com/politics/2013/10/14/amid-cuts-to-federal-courts-judge-suggests-congress-go-to-hell/ on URL http://www.foxnews.com/politics/2013/10/14/amid-cuts-to-federal-courts-judge-suggests-congress-go-to-hell/'),
 Exception('Could not locate article body'),
 Exception('Could not locate article body'),
 newspaper.article.ArticleException('Article `download()` failed with 404 Client Error: Not Found for url: https://www.breitbart.com/politics/2013/05/21/sarah-palin-going-rogue-in-wrong-direction on URL http://www.breitbart.com/Big-Government/2013/05/21/sarah-palin-going-rogue-in-wrong-direction'),
 newspaper.article.ArticleException('Article `download()` failed with 404 Client Error: Not Found for url: https://www.reuters.com/article/2013/06/11/us-usa-security-snowden-idUSBRE95A0TH20130611 on URL http://www.reuters.com/article/2013/06/11/us-usa-security-snowden-idUSBRE95