# Scraping News articles for Floods and Droughts in Brazil

## Install Libraries

In [1]:
!pip install pygooglenews
!pip install newspaper3k



## Import libraries

In [2]:
from pygooglenews import GoogleNews
from newspaper import Article
import pandas as pd

## Inputs to Google News

These are the details about flood in Brazil corresponding to the Glide id at: 

`Official WEO(...)/task7(...)/Data/glide_historical_events.csv`

For Brazil, the following IDs where found:
- FL-2017-000067-BRA
- FL-2015-000092-BRA

Also available:
- FL-2014-000083-BRA
- FL-2013-000157-BRA
- DR-2012-000074-BRA
- FL-2012-000003-BRA
- FL-2011-000102-BRA

We will be searching for articles in Portuguese and English languages.

In [3]:
language = 'en'
# language = 'pt'
country = 'Brazil'

search_terms = 'Flood Brazil'
start_date = '2015-01-01'
end_date = '2021-05-22'

In [4]:
gn = GoogleNews(lang = language, country = country)

In [5]:
# function to get the article content and other details from article url
def get_article_content(article_url):
    # get the date of posting the article
    try:
        news_article = Article(article_url,language='en')
        news_article.download()
        news_article.parse()
        news_article.nlp()
    except:
        pass

    # get the author
    #print(news_article.authors)

    # get the publish date
    #print(news_article.publish_date)

    #get top image
    #print(news_article.top_image)

    #get a summary of article
    #print(news_article.summary)

    # get article keywords
    #print(news_article.keywords)

    #get the article text
    return news_article.text,news_article.top_image,news_article.keywords

In [6]:
# function to get top 50 new links
def get_news_links(search_terms,start_date,end_date,event_id):
    article_num =[]
    article_title = []
    article_link = []
    article_content = []
    publishing_date = []
    article_image = []
    article_keywords =[]
    article_relevance = []
    search = gn.search(query=search_terms, helper = True, when = None, from_ = start_date, to_ = end_date , proxies=None, scraping_bee=None)
    count = 0
    for item in search['entries']:
        print(count)
        article_num.append(count)
        count += 1
        article_title.append(item['title'])
        article_link.append(item['link'])
        article_content.append(get_article_content(item['link'])[0])
        publishing_date.append(item['published'])
        article_image.append(get_article_content(item['link'])[1])
        article_keywords.append(get_article_content(item['link'])[2])
        
    event_id_list = [event_id]*count
    article_relevance_list = [' ']*count
    articles_dict = {'ID': article_num,'event_id': event_id_list,'article_title':article_title,'article_link':article_link,'article_content':article_content,'publishing_date':publishing_date,'article_image':article_image,'article_keywords':article_keywords,'article_relevance':article_relevance_list}
    print(len(article_num))
    print(len(article_title))
    print(len(article_link))
    print(len(article_content))
    print(len(publishing_date))
    print(len(article_image))
    print(len(article_keywords))
    print(len(article_relevance))
    df = pd.DataFrame(articles_dict)
    #news_results_list = zip(article_num,article_title,article_link,article_content,publishing_date,article_image,article_keywords,article_relevance)
    return df

In [7]:
# searching for disaster events in news articles
# event_id is the glide id for which we are scraping the news_articles

ids_list = ['FL-2017-000067-BRA', 'FL-2015-000092-BRA']
#, 'FL-2014-000083-BRA', 'FL-2013-000157-BRA', 'DR-2012-000074-BRA', 'FL-2012-000003-BRA', 'FL-2011-000102-BRA']

news_results_list = []
for item in ids_list:
    news_results_list.append(get_news_links(search_terms,start_date,end_date, item))
    
 

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66




67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
100
100
100
100
100
100
0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
100
100
100
100
100
100
0


In [16]:
df_news_results = pd.concat(news_results_list)

df_news_results.to_csv('FL-DR-Brazil-EN.csv')

In [17]:
df_news_results

Unnamed: 0,ID,event_id,article_title,article_link,article_content,publishing_date,article_image,article_keywords,article_relevance
0,0,FL-2017-000067-BRA,Brazil's pandemic-weary Manaus flooded by risi...,https://www.reuters.com/world/americas/brazils...,Heavy rains in the Amazon rainforest have caus...,"Tue, 18 May 2021 08:00:00 GMT",https://www.reuters.com/resizer/6DCOlzR1Ux_O8W...,"[amazon, wooden, rivers, water, levels, negro,...",
1,1,FL-2017-000067-BRA,'Amazon Venice' struggles to stay above water ...,https://www.dailysabah.com/gallery/amazon-veni...,The rivers have been swelling for weeks in Bra...,"Sat, 15 May 2021 10:44:00 GMT",https://idsb.tmgrup.com.tr/ly/uploads/images/2...,"[amazon, rivers, water, brazil, region, floods...",
2,2,FL-2017-000067-BRA,Drone captures devastating floods in Brazil - ...,https://news.yahoo.com/drone-captures-devastat...,The Telegraph\n\nSome teenagers and young adul...,"Tue, 18 May 2021 03:20:37 GMT",https://s.yimg.com/hd/cp-video-transcode/prod/...,"[vaccine, captures, devastating, brazil, cdc, ...",
3,3,FL-2017-000067-BRA,In pictures: Rising Amazon rivers flood Covid-...,https://news.yahoo.com/pictures-rising-amazon-...,A girl paddles her canoe through a street floo...,"Tue, 18 May 2021 12:19:19 GMT",https://s.yimg.com/uu/api/res/1.2/EcxuXdh6TY8z...,"[amazon, wooden, rivers, negro, brazil, floode...",
4,4,FL-2017-000067-BRA,Towns flood as Brazil's Amazon River rises to ...,https://newsus.cgtn.com/news/2021-05-20/Towns-...,"Explore the small Brazil town of Anama, now ha...","Wed, 19 May 2021 23:05:41 GMT",https://videous.cgtn.com/news/2021-05-20/Towns...,"[levels, amazon, threatens, water, rises, wors...",
...,...,...,...,...,...,...,...,...,...
95,95,FL-2015-000092-BRA,"01/21/2021 - Palm oil plantations, coal mines ...",https://news.mongabay.com/2021/01/palm-oil-pla...,Environmentalists have attributed recent heavy...,"Wed, 20 Jan 2021 08:00:00 GMT",https://imgs.mongabay.com/wp-content/uploads/s...,"[palm, south, watershed, hectares, mines, floo...",
96,96,FL-2015-000092-BRA,Why is Brazil’s president beaming? Putin prais...,https://www.scmp.com/news/world/russia-central...,Russia's President Vladimir Putin with Brazil'...,"Thu, 19 Nov 2020 08:00:00 GMT",https://cdn.i-scmp.com/sites/default/files/sty...,"[praises, bolsonaro, president, beams, brasili...",
97,97,FL-2015-000092-BRA,Container shortage delays shipments of Brazil'...,https://www.reuters.com/article/coffee-transpo...,NEW YORK (Reuters) - Coffee traders are strugg...,"Tue, 13 Oct 2020 07:00:00 GMT",https://static.reuters.com/resources/r/?m=02&d...,"[delays, shipment, coffee, brazil, maersk, cro...",
98,98,FL-2015-000092-BRA,Brazil mining flood could devastate environmen...,https://www.reuters.com/article/us-brazil-damb...,"RIO DOCE, Brazil (Reuters) - The collapse of t...","Sun, 15 Nov 2015 08:00:00 GMT",https://static.reuters.com/resources/r/?m=02&d...,"[water, brazil, flood, environment, river, mud...",
