-
Notifications
You must be signed in to change notification settings - Fork 0
/
Rwanda BBC Webscraping.py
115 lines (92 loc) · 3.74 KB
/
Rwanda BBC Webscraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# import useful libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
website_page = 'https://www.bbc.co.uk/search?q=rwanda'
# Create information holders
name = list()
link = list()
year = list()
categories = list()
channel = list()
page_number = 1
stop_number = 29
# Begin scraping process
while True:
html_text = requests.get(website_page).text
soup = BeautifulSoup(html_text, 'lxml')
news = soup.find('ul', spacing='responsive', role='list')
articles = news.find_all('li')
# start scraping page:
for article in articles:
# add name
article_name = article.find('span').text
name.append(article_name)
# print(article_name)
# add link
article_link = article.find('a')
link.append(article_link['href'])
# print(article_link['href'])
# add categories
article_category = article.find_all('span', class_='ssrcss-8g95ls-MetadataSnippet ecn1o5v2')
for category in article_category:
# ARTICLE WITH 3 CATEGORIES:
if len(article_category) == 3:
# date:
if category == article_category[0]:
dates = category.text
dates = dates.split()
try:
date = int(dates[-1])
year.append(date)
# print(date)
except ValueError:
date = 2021
year.append(date)
# print(date)
# category:
elif category == article_category[1] and article_category[1] != article_category[2]:
categories.append(category.text)
# print(category.text)
# channel:
elif category == article_category[2] and article_category[1] != article_category[2]:
channel.append(category.text)
# print(category.text)
# print('---------------------------------------------------------')
# if channel and category are the same:
elif article_category[1] == article_category[2] and category != article_category[0]:
categories.append(category.text)
channel.append(category.text)
# print(category.text)
# print(category.text)
break
# ARTICLE WITH ONLY 2 CATEGORIES:
else:
if category == article_category[0]:
# date
year.append(2021)
# category:
categories.append(category.text)
# print(category.text)
# channel:
elif category == article_category[1]:
channel.append(category.text)
# print(category.text)
print('finished: ', page_number)
print('website:', website_page, 'finished')
# next page button
buttons = soup.find_all('div', class_='ssrcss-zhhf7y-PageButtonContainer e1b2sq420')
for button in buttons:
if button == buttons[-1]:
page = button.find('a')
website_page = 'https://www.bbc.co.uk' + page['href']
page_number += 1
if page_number > stop_number:
break
print('page stop: ', page_number)
# create csv file:
news_dict = {'Name': name, 'Channel': channel, 'Category': categories, 'Year': year, 'URL': link}
df = pd.DataFrame(news_dict)
print(df)
# save file to local pc
df.to_csv(r'C:\Users\HP\PycharmProjects\pythonProject' + '\\Rwanda_BBC_Data.csv', index=False)