# Scraping News articles for Floods in Nigeria

## Install Libraries

In [1]:
!pip install pygooglenews
!pip install newspaper3k



## Import libraries

In [2]:
from pygooglenews import GoogleNews
from newspaper import Article
import pandas as pd

## Inputs to Google News

These are the details about flood in **Nigeria** corresponding to the Glide id at: 

`Official WEO(...)/task7(...)/Data/glide_historical_events.csv`

For Brazil, the following IDs where found:
- FL-2020-000207-NGA
- FL-2020-000196-NGA
- FL-2019-000122-NGA
- FL-2019-000093-NGA
- FL-2018-000120-NGA
- FL-2017-000126-NGA

Also available: 
- FL-2015-000155-NGA
- FL-2013-000116-NGA
- FL-2012-000138-NGA
- FF-2011-000088-NGA

We will be searching for articles in English.

In [3]:
language = 'en'
country = 'Nigeria'

search_terms = 'Flood Nigeria'
start_date = '2017-01-01'
end_date = '2021-05-22'

In [4]:
gn = GoogleNews(lang = language, country = country)

In [5]:
# function to get the article content and other details from article url
def get_article_content(article_url):
    # get the date of posting the article
    try:
        news_article = Article(article_url,language='en')
        news_article.download()
        news_article.parse()
        news_article.nlp()
    except:
        pass

    # get the author
    #print(news_article.authors)

    # get the publish date
    #print(news_article.publish_date)

    #get top image
    #print(news_article.top_image)

    #get a summary of article
    #print(news_article.summary)

    # get article keywords
    #print(news_article.keywords)

    #get the article text
    return news_article.text,news_article.top_image,news_article.keywords

In [6]:
# function to get top 50 new links
def get_news_links(search_terms,start_date,end_date,event_id):
    article_num =[]
    article_title = []
    article_link = []
    article_content = []
    publishing_date = []
    article_image = []
    article_keywords =[]
    article_relevance = []
    search = gn.search(query=search_terms, helper = True, when = None, from_ = start_date, to_ = end_date , proxies=None, scraping_bee=None)
    count = 0
    for item in search['entries']:
        print(count)
        article_num.append(count)
        count += 1
        article_title.append(item['title'])
        article_link.append(item['link'])
        article_content.append(get_article_content(item['link'])[0])
        publishing_date.append(item['published'])
        article_image.append(get_article_content(item['link'])[1])
        article_keywords.append(get_article_content(item['link'])[2])
        
    event_id_list = [event_id]*count
    article_relevance_list = [' ']*count
    articles_dict = {'ID': article_num,'event_id': event_id_list,'article_title':article_title,'article_link':article_link,'article_content':article_content,'publishing_date':publishing_date,'article_image':article_image,'article_keywords':article_keywords,'article_relevance':article_relevance_list}
    print(len(article_num))
    print(len(article_title))
    print(len(article_link))
    print(len(article_content))
    print(len(publishing_date))
    print(len(article_image))
    print(len(article_keywords))
    print(len(article_relevance))
    df = pd.DataFrame(articles_dict)
    #news_results_list = zip(article_num,article_title,article_link,article_content,publishing_date,article_image,article_keywords,article_relevance)
    return df

In [7]:
# searching for disaster events in news articles
# event_id is the glide id for which we are scraping the news_articles

ids_list = ['FL-2020-000207-NGA', 'FL-2020-000196-NGA', 'FL-2019-000122-NGA', 
            'FL-2019-000093-NGA', 'FL-2018-000120-NGA', 'FL-2017-000126-NGA']

news_results_list = []
for item in ids_list:
    news_results_list.append(get_news_links(search_terms,start_date,end_date, item))
    
 

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
100
100
100
100
100
100
0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
100
100
100
100
100
100
0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
100
100
100
100
100
100
0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

In [9]:
df_news_results = pd.concat(news_results_list)

df_news_results.to_csv('FL-Nigeria-EN.csv')



In [10]:
df_news_results

Unnamed: 0,ID,event_id,article_title,article_link,article_content,publishing_date,article_image,article_keywords,article_relevance
0,0,FL-2020-000207-NGA,MATTERS ARISING: 28 states at risk of flooding...,https://www.thecable.ng/states-risk-flooding-n...,"Uche Igwe will not forget August 26, 2020 in a...","Thu, 20 May 2021 07:27:00 GMT",https://f5p3e9e4.stackpathcdn.com/wp-content/u...,"[information, water, process, flooding, climat...",
1,1,FL-2020-000207-NGA,"Expect greater floods this year, FG tells Nige...",https://guardian.ng/news/expect-greater-floods...,"Minister of Water Resources, Suleiman Adamu, h...","Fri, 07 May 2021 04:14:00 GMT",https://guardian.ng/wp-content/uploads/2020/06...,"[river, tells, expect, world, nigerians, flood...",
2,2,FL-2020-000207-NGA,10 LGAs prone to flood in Cross River – NEMA -...,https://www.premiumtimesng.com/regional/south-...,ADVERTISEMENT\n\nThe National Emergency Manage...,"Wed, 19 May 2021 18:31:55 GMT",https://i1.wp.com/media.premiumtimesng.com/wp-...,"[river, windstorm, nema, local, flood, ikom, l...",
3,3,FL-2020-000207-NGA,Why Lagosians should heed flood alert | The Gu...,https://guardian.ng/opinion/why-lagosians-shou...,"Sir: On April 14, 2021, the Lagos State Govern...","Mon, 26 Apr 2021 07:00:00 GMT",https://guardian.ng/wp-content/uploads/2020/03...,"[residents, water, heed, world, flood, lagosia...",
4,4,FL-2020-000207-NGA,Rainy season: Fears over impending flood in th...,https://guardian.ng/saturday-magazine/rainy-se...,"In fact, the Commissioner for Environment and ...","Sat, 24 Apr 2021 07:00:00 GMT",https://guardian.ng/wp-content/uploads/2021/04...,"[rainy, residents, water, world, flooding, wat...",
...,...,...,...,...,...,...,...,...,...
95,95,FL-2017-000126-NGA,Nearly 80% of Nigerian farmers affected by flo...,https://www.premiumtimesng.com/agriculture/agr...,About 79 per cent of Nigerian farmers were est...,"Tue, 12 Jan 2021 08:00:00 GMT",https://i0.wp.com/media.premiumtimesng.com/wp-...,"[farmers, flooding, survey, 2020, floods, cent...",
96,96,FL-2017-000126-NGA,"Israel and Hamas ceasefire, Japan approves AZN...",https://money.yahoo.com/video/israel-hamas-cea...,The Guardian\n\nThe comparison rankles support...,"Fri, 21 May 2021 16:09:32 GMT",https://s.yimg.com/ny/api/res/1.2/Fy9KtD59qBfL...,"[japan, south, african, israel, bbc, movement,...",
97,97,FL-2017-000126-NGA,"Climate Change in Nigeria: Floods in Lagos, Ab...",https://qz.com/africa/1054825/climate-change-i...,Earlier this year heavy rains and thunderstorm...,"Thu, 17 Aug 2017 07:00:00 GMT",https://cms.qz.com/wp-content/uploads/2017/08/...,"[started, water, flooding, going, worse, lagos...",
98,98,FL-2017-000126-NGA,Nigeria: Tens of thousands of people stranded ...,https://reliefweb.int/report/nigeria/nigeria-t...,"More than 40,000 men, women and children – mos...","Fri, 15 Nov 2019 08:00:00 GMT",https://reliefweb.int/profiles/reliefweb/theme...,"[river, ocha, tens, thousands, united, town, w...",
