In [9]:
class Content:
    '''
    글/페이지 전체에 사용할 기반 클래스
    '''
    
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body
    
    def print(self):
        '''
        출력 결과를 원하는 대로 바꿀 수 있는 함수
        '''
        print("URL: {}".format(self.url))
        print("TITLE: {}".format(self.title))
        print("BODY:\n{}".format(self.body))

In [10]:
class Website:
    '''
    웹사이트 구조에 관한 정보를 저장할 클래스
    '''
    
    def __init__(self, name, url, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag

In [11]:
import requests
from bs4 import BeautifulSoup

In [12]:
class Crawler:
    
    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')
    
    def safeGet(self, pageObj, selector):
        '''
        beautifulsoup 객체와 선택자를 받아 콘텐츠 문자열을 추출하는 함수
        주어진 선택자로 검색된 결과가 없다면 빈 문자열 반환
        '''
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        return ''
    
    def parse(self, site, url):
        '''
        url을 받아 콘텐츠를 추출
        '''
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

In [13]:
crawler = Crawler()

siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'h1', 'section#product-description'],
    ['Reuters', 'http://reuters.com', 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'h1', 'div.post-body'],
    ['New York Times', 'http://nytimes.com', 'h1', 'div.StoryBodyCompanionColumn div p']
]

websites = []
for row in siteData:
    websites.append(Website(row[0], row[1], row[2], row[3]))

crawler.parse(websites[0], 'http://shop.oreilly.com/product/0636920028154.do')
crawler.parse(
    websites[1], 'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0')
crawler.parse(
    websites[2],
    'https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/')
crawler.parse(
    websites[3], 
    'https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html')


URL: https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/
TITLE: Idea to Retire: Old methods of policy education
Idea to Retire: Old methods of policy education
BODY:

Public policy and public affairs schools aim to train competent creators and implementers of government policy. While drawing on the principles that gird our economic and political systems to provide a well-rounded education, like law schools and business schools, policy schools provide professional training. They are quite distinct from graduate programs in political science or economics which aim to train the next generation of academics. As professional training programs, they add value by imparting both the skills which are relevant to current employers, and skills which we know will be relevant as organizations and societies evolve. 
The relevance of the skills that policy programs impart to address problems of today and tomorrow bears further discussion. We are living t

URL: https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html
TITLE: Oil Boom Gives the U.S. a New Edge in Energy and Diplomacy
BODY:
HOUSTON — A substantial rise in oil prices in recent months has led to a resurgence in American oil production, enabling the country to challenge the dominance of Saudi Arabia and dampen price pressures at the pump.
The success has come in the face of efforts by Saudi Arabia and its oil allies to undercut the shale drilling spree in the United States. Those strategies backfired and ultimately ended up benefiting the oil industry.
Overcoming three years of slumping prices proved the resiliency of the shale boom. Energy companies and their financial backers were able to weather market turmoil — and the maneuvers of the global oil cartel — by adjusting exploration and extraction techniques.
After a painful shakeout in the industry that included scores of bankruptcies and a significant loss of jobs, a steadier shale-drilling industry is a

## 검색을 통한 사이트 크롤링

In [10]:
class Content:
    
    def __init__(self, topic, url, title, body):
        self.topic = topic
        self.title = title
        self.body = body
        self.url = url
    
    def print(self):
        print("New article found for topic: {}".format(self.topic))
        print("URL: {}".format(self.url))
        print("TITLE: {}".format(self.title))
        print("BODY: {}".format(self.body))

In [11]:
class Website:
    
    def __init__(self, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.searchUrl = searchUrl  # url에 검색어를 추가한 경우 검색 결과를 어디에서 얻는지 정의
        self.resultListing = resultListing   # 각 결과에 관한 정보를 담고 있는 박스
        self.resultUrl = resultUrl  # 결과에서 정확한 url을 추출할 때 사용할 태그 정보
        self.absoluteUrl = absoluteUrl  # 절대 url인지 상대 url인지 알려주는 불리언
        self.titleTag = titleTag
        self.bodyTag = bodyTag

In [12]:
import requests
from bs4 import BeautifulSoup

class Crawler:
    
    def getPage(self, url):
        try:
            req = requests.get(url)
        except:
            return None
        return BeautifulSoup(req.text, 'html.parser')
    
    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            return childObj[0].get_text()
        return ''
    
    def search(self, topic, site):
        '''
        주어진 검색어로 주어진 웹사이트를 검색해 결과 페이지를 모두 기록합니다.
        '''
        
        bs = self.getPage(site.searchUrl + topic)
        searchResults = bs.select(site.resultListing)
        
        for result in searchResults:
            url = result.select(site.resultUrl)[0].attrs['href']
            
            if (site.absoluteUrl):
                bs = self.getPage(url)
            else:
                bs = self.getPage(site.url + url)
            
            if bs is None:
                print('Something was wrong with that page or URL. Skipping!')
                return
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                content = Content(topic, title, body, url)
                content.print()

In [13]:
crawler = Crawler()

siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'https://ssearch.oreilly.com/?q=',
        'article.product-result', 'p.title a', True, 'h1', 'section#product-description'],
    ['Reuters', 'http://reuters.com', 'http://www.reuters.com/search/news?blob=', 'div.search-result-content',
        'h3.search-result-title a', False, 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=',
        'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body']
]

sites = []
for row in siteData:
    sites.append(Website(row[0],row[1],row[2],row[3],row[4],row[5],row[6],row[7]))

topics = ['python', 'data science']
for topic in topics:
    # 먼저 검색어를 정하고 해당 검색어에 대해 모든 사이트를 반복함: 서버 부하를 줄이기 위해 
    print('GETTING INFO ABOUT: ' + topic )
    for targetSite in sites:
        crawler.search(topic, targetSite)

GETTING INFO ABOUT: python
New article found for topic: python
URL: Leveraging the disruptive power of artificial intelligence for fairer opportunities
TITLE: 
According to President Obama’s Council of Economic Advisers (CEA), approximately 3.1 million jobs will be rendered obsolete or permanently altered as a consequence of artificial intelligence technologies. Artificial intelligence (AI) will, for the foreseeable future, have a significant disruptive impact on jobs. That said, this disruption can create new opportunities if policymakers choose to harness them—including some with the potential to help address long-standing social inequities. Investing in quality training programs that deliver premium skills, such as computational analysis and cognitive thinking, provides a real opportunity to leverage AI’s disruptive power.







Makada Henry-Nickie

					Fellow - Governance Studies 

 Twitter
mhnickie





AI’s disruption presents a clear challenge: competition to traditional skill

New article found for topic: python
URL: The Hutchins Center Explains: Budgeting for aging America
TITLE: 


For decades, we have been hearing that the baby-boom generation was like a pig moving through a python–bigger than the generations before and after. 
That’s true. But that’s also a very misleading metaphor for understanding the demographic forces that are driving up federal spending: They aren’t temporary. The generation born between 1946 and 1964 is the beginning of a demographic transition that will persist for decades after the baby boomers die, the consequence of lengthening lifespans and declining fertility. Putting the federal budget on a sustainable course requires long-lasting fixes, not short-lived tweaks.  
First, a few demographic facts.
As the chart below illustrates, there was a surge in births in the U.S. at the end of World War II, a subsequent decline, and then an uptick as baby boomers began having children.




Although the population has been rising, the numbe

New article found for topic: python
URL: Skills, success, and why your choice of college matters
TITLE: 


Amidst growing frustration with the cost of higher education, complaints also abound about its quality. One critique, launched in the book Academically Adrift by two sociologists, finds little evidence that college students score better on measures of critical thinking, writing, and reasoning after attending college. This is something of a paradox, since strong evidence shows that attending college tends to raise earnings power, even for students who start with mediocre preparation. 
Our recent study uses a different approach to assess the value of a college education. We find that the particular skills listed by a college’s alumni on their resumes predict how well graduates from those schools perform in terms of earning a living, meeting debt obligations, and working for high-paying or innovative companies. Since jobs requiring more valuable skills typically require at least some

New article found for topic: python
URL: An Atlanta organization’s mission to bring racial equity to the tech ecosystem
TITLE: 

Summary
Between the COVID-19 pandemic and the tragic death of George Floyd, the country’s ongoing crisis of racism has come into stark relief. Black Americans are disproportionately diagnosed with or dying from COVID-19 due to structural conditions, while also facing major economic risks as the racial unemployment gap between white and Black populations is the widest it’s been in five years. At the same time, Black people are still vulnerable to police violence that too often occurs without consequences. While there is a great deal of work to be done to dismantle structural racism, it is imperative to use this moment to remove racial barriers and invest in long-term prosperity for Black people, enterprises, and communities.







Reniya Dinkins

					Former Senior Research Assistant - Metropolitan Policy Program 

 Twitter
reniyasdinkins








Sifan Liu

	

New article found for topic: python
URL: Inside the Pentagon’s Secret Afghan Spy Machine
TITLE: 
The Pentagon’s top researchers have rushed a classified and controversial intelligence program into Afghanistan. Known as “Nexus 7,” and previously undisclosed as a war-zone surveillance effort, it ties together everything from spy radars to fruit prices in order to glean clues about Afghan instability.
The program has been pushed hard by the leadership of the Defense Advanced Research Projects Agency (DARPA). They see Nexus 7 as both a breakthrough data-analysis tool and an opportunity to move beyond its traditional, long-range research role and into a more active wartime mission. 
But those efforts are drawing fire from some frontline intel operators who see Nexus 7 as little more than a glorified grad-school project, wasting tens of millions on duplicative technology that has nothing to do with stopping the Taliban. 
“There are no models and there are no algorithms,” says one person fami

New article found for topic: python
URL: Appointments Apocalypse
TITLE: 
Anyone who doubts that the presidential appointments process is on the verge of collapse need only look at three recent events.
On April 30, President Bush’s 101st afternoon in office, the White House dumped 61 names into the Senate confirmation process in a desperate effort to beat the Clinton administration’s dismal mark after its 100th day. Despite smashing the single-day nomination record, Bush had nominated less than 30 percent of the candidates for sub-Cabinet posts by the end of that week.
On May 2, Senate Democrats announced that they were delaying a vote on two Justice Department nominees to express their anger over a change in the process that gives home-state Senators a say about federal judicial nominees. Not to be outdone, Republicans followed suit by placing holds on four Defense nominees to remind Secretary Donald Rumsfeld that he should communicate more frequently with the chamber.
On May 4, the ad

New article found for topic: python
URL: Think Bigger on North Korea
TITLE: 
While the world is fixated on Iraq and the Middle East, North Korea continues to pose at least as great a threat to Western security interests. Six-party talks with the North Koreans in Beijing have just showed that the Bush administration hasn’t yet found a way out of the nuclear crisis. Although negotiations appear likely to resume in a couple of months, their prospects for success seem poor.
The basic dilemma is easy to understand. North Korea will not surrender its nuclear capabilities, which are among its only valuable national assets, unless offered a very good deal for giving them up. President Bush refuses to offer such a deal because he sees the North Korean demand as blackmail. He insists that before any talks about better diplomatic relations or economic interaction occur, North Korea first relinquish—with verification—a nuclear program it had pledged nine years ago to abandon completely. At most, B

New article found for topic: data science
URL: What all policy analysts need to know about data science
TITLE: 







Alex Engler

					Rubenstein Fellow - Governance Studies 

 Twitter
@AlexCEngler





Conversations around data science typically contain a lot of buzzwords and broad generalizations that make it difficult to understand its pertinence to governance and policy. Even when well-articulated, the private sector applications of data science can sound quite alien to public servants. This is understandable, as the problems that Netflix and Google strive to solve are very different than those government agencies, think tanks, and nonprofit service providers are focused on. This does not mean, however, that there is no public sector value in the modern field of data science. With qualifications, data science offers a powerful framework to expand our evidence-based understanding of policy choices, as well as directly improve service delivery.
To better understand its importance t

New article found for topic: data science
URL: Measuring racism and discrimination in economic data
TITLE: 
Although researchers in economics are increasingly cognizant that race and ethnicity are key determinants of economic outcomes, credibly assessing potential causes and identifying solutions is often complicated by the lack of high-quality data. The typical economist’s work primarily focuses on proposing relationships and testing for causal mechanisms across a broad set of economic phenomena. The study of race and the consequences of race in market interactions have long been hampered by the relative lack of longitudinal data collected on relevant markers of discrimination, racism, and related long-term outcomes.







Randall Akee

					Nonresident Fellow - Economic Studies, Center on Children and Families, Future of the Middle Class Initiative 

					Former Brookings Rubenstein Fellow										

 Twitter
indigenalysis








Marcus Casey

					Nonresident Fellow - Economic Stu

New article found for topic: data science
URL: Bridging the gender data gap
TITLE: 
More men than women are killed in car crashes each year, partly because men drive more and engage in riskier driving behavior. On the other hand, women are 17% more likely to be killed and 47% more likely to be injured in crashes than men are. Women are at increased risk simply because they are women: cars are primarily designed, built, and tested by male engineers using male data, so they are built with men in mind. Scaled-down versions of male crash test dummies, meant to represent women, were not used until 2003—and are primarily tested in the passenger seat. In car design, development, and testing, male bodies are the standard and female bodies the outlier. This creates a gender data gap with very real impacts on the lives of Americans.





J



Jeanette Gaudry Haynie

					Founder and Executive Director - Athena Leadership Project 

					Lieutenant Colonel - U.S. Marine Corps Reserve 




The gend

IndexError: list index out of range

## 링크를 통한 사이트 크롤링

In [49]:
class Website:
    def __init__(self, name, rootUrl, url, targetPattern, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.rootUrl = rootUrl
        self.url = url
        self.targetPattern = targetPattern
        self.absoluteUrl = absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag

class Content:
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body
    
    def print(self):
        print('URL: {}'.format(self.url))
        print('TITLE: {}'.format(self.title))
        print('BODY: {}'.format(self.body))

In [57]:
import re

class Crawler:
    def __init__(self, site):
        self.site = site
        self.visited = []
    
    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')
    
    def safeGet(self, pageObj, selector):
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        return ''
    
    def parse(self, url):
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, self.site.titleTag)
            body = self.safeGet(bs, self.site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()
            else:
                print("Cannot title or body")
    
    def crawl(self):
        '''
        사이트 홈페이지에서 페이지를 가져옵니다.
        '''
        bs = self.getPage(self.site.url)
        targetPages = bs.findAll('a', href=re.compile(self.site.targetPattern))
        
        for targetPage in targetPages:
            targetPage = targetPage.attrs['href']
            if targetPage not in self.visited:
                self.visited.append(targetPage)
                if not self.site.absoluteUrl:
                    targetPage = '{}{}'.format(self.site.rootUrl, targetPage)
                self.parse(targetPage)

In [58]:
reuters = Website('Reuters', 'https://www.reuters.com', 'https://www.reuters.com/world', '^(/world/)', False, 'h1',
                 'div.paywall-article>p')
crawler = Crawler(reuters)
crawler.crawl()

Cannot title or body
URL: https://www.reuters.com/world/asia-pacific/taliban-claims-control-key-afghan-border-crossing-with-pakistan-2021-07-14/
TITLE: Taliban claims to control key Afghan border crossing with Pakistan
BODY: KABUL, July 14 (Reuters) - Taliban fighters on Wednesday said they have gained control over an important trade route linking Pakistan with southern Afghanistan, hours after Afghan forces surrendered the critical transit point to the Islamist hardline insurgent group.
Afghan government officials were not immediately available to confirm the fall of a town in Spin Boldak district in Kandahar province situated next to the Durand Line border with Pakistan.
"The Mujahideen have captured an important border town called Wesh in Kandahar, " said Zabihullah Mujahid, a Taliban spokesperson in a statement.
"With this, the important road between (Spin) Boldak and Chaman and Kandahar customs have come under Mujahideen control," he said.
Afghan government data shows about 900 tr

URL: https://www.reuters.com/world/americas/cuba-curbs-access-facebook-messaging-apps-amid-protests-internet-watchdog-2021-07-13/
TITLE: Faced with rare protests, Cuba curbs social media access, watchdog says
BODY: HAVANA, July 13 (Reuters) - Cuba has restricted access to social media and messaging platforms including Facebook and WhatsApp, global internet monitoring firm NetBlocks said on Tuesday, in the wake of the biggest anti-government protests in decades.
Thousands of Cubans joined demonstrations throughout the Communist-run country on Sunday to protest against a deep economic crisis that has seen shortages of basic goods and power outages. They were also protesting against the government's handling of the coronavirus pandemic and curbs on civil liberties.
Cuba's government has said the demonstrations were orchestrated by counter-revolutionaries financed by the United States, manipulating frustration with an economic crisis largely caused by the decades-old U.S. trade embargo.
Th

URL: https://www.reuters.com/world/hamas-radio-reports-israeli-air-strike-gaza-2021-06-15/
TITLE: In first for Gulf, UAE opens embassy in Israel, hails trade ties
BODY: TEL AVIV, July 14 (Reuters) - The United Arab Emirates on Wednesday become the first Gulf state to open an embassy in Israel, as its envoy hailed the trade and investment opportunities that closer ties would bring at a flag-raising ceremony also attended by Israel's president.
Brought together by shared unease about Iran, the UAE and Bahrain normalised relations with Israel last year under the "Abraham Accords" crafted by the administration of then-U.S. President Donald Trump
Sudan and Morocco have since also moved to establish ties with Israel.
The opening of the UAE embassy, which is situated in the Tel Aviv Stock Exchange, followed the inauguration of Israel's embassy in the UAE last month.
"Since the normalisation of ties ...we have seen for the first time discussions on trade and investment opportunities," UAE amba

URL: https://www.reuters.com/world/china/death-toll-rises-17-partial-collapse-china-hotel-2021-07-14/
TITLE: Death toll rises to 17 in partial collapse of China hotel
BODY: BEIJING, July 14 (Reuters) - The death toll in the partial collapse of a budget hotel in China's eastern city of Suzhou on Monday has risen to 17, state media said on Wednesday.
Of the 23 people trapped in the rubble of the Siji Kaiyuan Hotel in Suzhou only six survived, according to state media.
More than 650 people were involved in the search and rescue operation, which concluded on Wednesday morning.
The provincial government of Jiangsu has set up a team to conduct an in-depth investigation on the specific causes of the collapse.
The Siji Kaiyuan opened in 2018 and had a total of 54 rooms, according to online booking sites. The part of the hotel that collapsed was three storeys tall.
According to some state media reports, the collapse was initially determined to be caused by the property owner's alteration of the

URL: https://www.reuters.com/world/asia-pacific/responding-sos-afghan-commandos-caught-fierce-taliban-attack-2021-07-13/
TITLE: Responding to SOS, Afghan commandos caught in fierce Taliban attack
BODY: KANDAHAR, Afghanistan, July 13 (Reuters) - Minutes after returning from a mission on Tuesday before dawn, a convoy of exhausted Afghan commandos were speeding back out of their base to try to extract a wounded policeman trapped by Taliban insurgents on the outskirts of Kandahar.
The previous outing had been tense but quiet. This operation in the southern city, a Taliban stronghold before the movement was ousted from power in 2001, was anything but.
As they approached the checkpoint where policeman Ahmad Shah had been holed up alone for 18 hours, some 30-40 special forces soldiers in a line of Humvees came under automatic weapons fire, according to a Reuters reporter travelling with them.
A gun battle erupted as the convoy forced its way to Shah's position, and he was hurriedly loaded int

URL: https://www.reuters.com/world/asia-pacific/vietnam-operations-footwear-giant-pou-chen-hit-by-covid-19-curbs-2021-07-14/
TITLE: Vietnam operations of footwear giant Pou Chen hit by COVID-19 curbs
BODY: HANOI, July 14 (Reuters) - Taiwan's Pou Chen Corp (9904.TW), the world's largest manufacturer of branded athletic and casual footwear, suspended on Wednesday operations at its plant in Ho Chi Minh City as COVID-19 curbs hit factories in the country's business hub.
Production at its Pouyuen Vietnam business in Ho Chi Minh City, the epicentre of the country's worst coronavirus outbreak, will be suspended for 10 days, the health ministry said in a statement.
After successfully containing the disease for much of the pandemic, Vietnam has faced a more stubborn outbreak since late April, with daily infections climbing to record levels.
Companies in Vietnam's business hub and its neighbouring industrial provinces have been struggling to keep running, after strict movement curbs were imposed