-
Notifications
You must be signed in to change notification settings - Fork 1
/
scraper.py
134 lines (114 loc) · 4.15 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
from bs4 import BeautifulSoup
import requests, time, os
import json, re
def process_content(key_val_pair):
    """Split the text of a ``key: value`` element into its key and value.

    Args:
        key_val_pair: An object exposing ``get_text()`` (the caller passes
            an element found by BeautifulSoup), or a falsy value when
            nothing was found.

    Returns:
        A ``(key, value)`` tuple. ``value`` is the list of comma-separated
        pieces when it contains a comma, otherwise a single string.
        Whitespace around the key and the pieces is left untouched.
        ``(None, None)`` is returned when the input is falsy or its text
        contains no ``:``.
    """
    if not key_val_pair:
        return None, None

    # Split on the first colon only, so a value that itself contains a colon
    # is kept whole instead of being cut off at the second colon.
    key, separator, val = key_val_pair.get_text().partition(':')
    if not separator:
        return None, None

    if ',' in val:
        return key, val.split(',')
    return key, val
# parse lyrics
def parse_lyrics(lyrics):
    """Remove chord notation from a lyrics sheet and drop the emptied lines.

    Every match of the chord pattern is deleted from each line. A line is
    kept only if something other than spaces remains afterwards.

    Args:
        lyrics: The raw lyrics text, lines separated by a newline character.

    Returns:
        The surviving lines, each terminated by a newline, joined into one
        string (empty string when nothing survives).
    """
    # Matches ASCII letters, single digits, '|', '-', the delta sign and the
    # punctuation . ! ? \ / ( ) + # &
    # NOTE: the range [A-z] also covers the six ASCII characters between 'Z'
    # and 'a' ([ \ ] ^ _ `); the pattern is kept as is because the output
    # depends on it.
    chord_pattern = re.compile(r"([A-z])+|[0-9]|\||-|∆|([.!?\\\/\(\)\+#&])+")

    kept_lines = []
    for line in lyrics.split('\n'):
        cleaned = chord_pattern.sub('', line)
        # Skip lines that are empty or consist of spaces only once the
        # chords are gone.
        if cleaned.strip(' '):
            kept_lines.append(cleaned)
    return ''.join(kept + '\n' for kept in kept_lines)
# parse song html page
def parse_html_song(html_pg):
    """Extract song metadata and chord-free lyrics from a song page.

    Args:
        html_pg: HTML source of a single song page.

    Returns:
        A dict with the keys ``guitar_key``, ``beat``, ``number_of_visits``,
        ``number_of_shares`` and ``song_lyrics``, plus one entry for every
        metadata ``<span>`` whose text has the form ``key: value``.
        ``guitar_key`` and ``beat`` are always strings.

    Raises:
        AttributeError, IndexError: When an element the parser indexes or
            dereferences directly (title, header, counters, ``<pre>``) is
            not present in the page.
    """
    soup = BeautifulSoup(html_pg, 'html.parser')
    song = {}

    # The title is only printed as progress output; it is not stored in the
    # returned dict.
    title = soup.find('h1', {"class": "entry-title"}).get_text()
    print(title)

    # First <h3> matched by the class=None filter, split on '|'.
    # NOTE(review): presumably of the form "Key: X | Beat: Y" - confirm.
    guit_key = soup.find_all('h3', {'class': None})[0].get_text().split('|')
    print(guit_key)
    if len(guit_key) == 2:
        # Previously a segment that did not split into exactly two parts on
        # ':' left the raw list in the result; both values are now strings.
        guitar_key = _text_after_colon(guit_key[0])
        beat = _text_after_colon(guit_key[1])
    elif len(guit_key) == 1:
        guitar_key = guit_key[0].strip()
        beat = "N/A"
    else:
        guitar_key = "N/A"
        beat = "N/A"
    song['guitar_key'] = guitar_key
    song['beat'] = beat

    # Second whitespace-separated token of the counter text, cut at 'Visits'.
    visits_text = soup.find('div', {'class': 'tptn_counter'}).get_text()
    song['number_of_visits'] = visits_text.split()[1].split('Visits')[0]

    shares_text = soup.find('div', {'class': 'nc_tweetContainer swp_share_button total_shares total_sharesalt'}).get_text()
    song['number_of_shares'] = shares_text.split(" ")[0]

    # Each of these spans is expected to hold a "key: value" text; only the
    # first span of every class is used.
    class_list = ["entry-tags", "entry-categories", "entry-author-name", "lyrics", "music"]
    for class_l in class_list:
        content = soup.find_all('span', {"class": class_l})
        if not content:
            continue
        key, val = process_content(content[0])
        if key is not None and val is not None:
            song[key] = val

    unprocessed_lyrics = soup.select('pre')[0].get_text()
    song['song_lyrics'] = parse_lyrics(unprocessed_lyrics)

    print(song)
    print("=======================================================")
    return song


def _text_after_colon(segment):
    """Return the stripped text after the first ':' in *segment*.

    Falls back to the whole stripped segment when it contains no ':'.
    """
    _, separator, value = segment.partition(':')
    return value.strip() if separator else segment.strip()
# get links of songs to be scraped
def scrape_song_links():
    """Collect song page links from the listing pages into ``urls.csv``.

    Fetches listing pages 5 through 16, takes the ``href`` of every ``<a>``
    element with class ``_blank`` and appends the links to ``urls.csv``, one
    per line. Sleeps 10 seconds after each page.
    """
    headers = requests.utils.default_headers()
    for page_number in range(5, 17):
        url = 'https://sinhalasongbook.com/all-sinhala-song-lyrics-and-chords/?_page={}/'.format(page_number)
        print('Scraping the URL : ', url)

        # Headers must be passed by keyword: the second positional parameter
        # of requests.get is `params`, which would send them as a query
        # string instead.
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Anchors without an href yield None and cannot be written out.
        hrefs = (tag.get('href') for tag in soup.find_all("a", {"class": "_blank"}))
        links = [href for href in hrefs if href]

        with open('urls.csv', 'a') as f:
            # Text mode already translates '\n' to the platform line ending;
            # writing os.linesep here would double the carriage return on
            # Windows.
            f.writelines(link + '\n' for link in links)
        time.sleep(10)
# scrape song links in the urls.csv
def scrape_songs(max_songs=510):
    """Download and parse the songs listed in ``urls.csv``.

    Each song is fetched, parsed with ``parse_html_song`` and written as JSON
    to ``songs/<index>.json``, where ``<index>`` is the zero-based position
    of its URL among the non-blank lines of ``urls.csv``. Sleeps 20 seconds
    after each song.

    Args:
        max_songs: Upper bound on the number of songs to scrape. Fewer are
            scraped when ``urls.csv`` holds fewer URLs.
    """
    # Read the list once. Strip line endings so they are not sent as part of
    # the URL, and skip blank lines.
    with open('urls.csv', 'r') as f:
        urls = [line.strip() for line in f if line.strip()]

    os.makedirs('songs', exist_ok=True)
    headers = requests.utils.default_headers()

    for next_song, url in enumerate(urls[:max_songs]):
        print('Scraping song', next_song)
        # Headers must be passed by keyword: the second positional parameter
        # of requests.get is `params`.
        res = requests.get(url, headers=headers)
        song = parse_html_song(res.text)
        with open(os.path.join('songs', str(next_song) + '.json'), 'w') as f:
            json.dump(song, f)
        time.sleep(20)
if __name__ == "__main__":
    # Collect the song links only when no link list exists yet; an existing
    # urls.csv is reused as is.
    links_already_collected = os.path.exists('urls.csv')
    if not links_already_collected:
        scrape_song_links()
    # NOTE(review): assumes the songs are meant to be scraped on every run,
    # not only right after urls.csv was created - confirm.
    scrape_songs()