-
Notifications
You must be signed in to change notification settings - Fork 1
/
process_songs.py
138 lines (108 loc) · 3.76 KB
/
process_songs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import json
import re
from googletrans import Translator
# Fill missing values with an empty string ("")
def fill_missing(song):
    """Blank out the Lyrics, Music and Genre fields that carry no value.

    A field is set to "" when it is absent from ``song`` or when it holds
    the placeholder string "නොදන්නා". Fields with any other value are left
    alone.

    Args:
        song: dict describing one song; modified in place.

    Returns:
        The same ``song`` dict.
    """
    # Using the placeholder as the .get() default lets one comparison cover
    # both the "key missing" and the "key holds the placeholder" cases.
    for key in ("Lyrics", "Music", "Genre"):
        if song.get(key, "නොදන්නා") == "නොදන්නා":
            song[key] = ""
    return song
# Separate the title into its Sinhala and English parts and keep only the Sinhala title
def separate_title(song):
    """Reduce ``song["title"]`` to the text after its last separator.

    The recognised separators are "–" (en dash), "|" and "-". Everything up
    to and including the right-most separator is dropped and the remainder
    is stripped of surrounding whitespace. A title that contains no
    separator is kept whole (only stripped) instead of raising.

    The resulting title is also printed.

    Args:
        song: dict with a string under "title"; modified in place.

    Returns:
        The same ``song`` dict.
    """
    title = song["title"]
    # Index of the right-most separator of any kind. str.rfind returns -1
    # when a character is absent, so with no separator at all the slice
    # below starts at 0 and keeps the entire title.
    cut = max(title.rfind(mark) for mark in ("–", "|", "-"))
    song["title"] = title[cut + 1:].strip()
    print(song["title"])
    return song
# clean beat data
def clean_beat(song):
    """Normalise ``song["beat"]`` in place and return the song.

    * A list value is replaced by its first entry, stripped, with everything
      up to and including the first space removed.
    * The string "N/A" is replaced by "".
    * Any other value is left untouched.

    Raises:
        IndexError: if the list is empty, or its first entry contains no
            space once stripped.
    """
    raw = song["beat"]
    if type(raw) is list:
        first = raw[0].strip()
        # Keep only what follows the first space of the entry.
        song["beat"] = first.split(" ", 1)[1]
        return song
    if raw == "N/A":
        song["beat"] = ""
    return song
# replace dots in people names by space
def remove_dots_in_names(song):
    """Replace every "." with a space in the Artist, Music and Lyrics fields.

    Each field may be a single string or a list of strings. List values are
    rewritten in place, so the list object stored in ``song`` stays the same.

    Args:
        song: dict containing the three fields; modified in place.

    Returns:
        The same ``song`` dict.

    Raises:
        KeyError: if any of the three fields is missing.
    """
    for key in ("Artist", "Music", "Lyrics"):
        value = song[key]
        if type(value) is list:
            # Slice assignment updates the existing list rather than
            # binding a new one.
            value[:] = [name.replace(".", " ") for name in value]
        else:
            value = value.replace(".", " ")
        song[key] = value
    return song
# clean lyrics corpus
def clean_lyrics(song):
    """Clean ``song["song_lyrics"]`` line by line.

    For every line the characters ``. ! ? - —`` are removed and the result
    is stripped of surrounding whitespace. Lines that end up empty, and
    lines containing U+200D, are dropped. The surviving lines are joined
    back with "\\n".

    Args:
        song: dict with the lyrics text under "song_lyrics"; modified in
            place.

    Returns:
        The same ``song`` dict.
    """
    kept = []
    for raw_line in song["song_lyrics"].split("\n"):
        # Strip AFTER removing punctuation: stripping first would leave the
        # blank that followed a leading "-" (or preceded a trailing "?")
        # attached to the line.
        line = re.sub(r"[.!?\-—]", "", raw_line).strip()
        # NOTE(review): U+200D (zero-width joiner) can also occur inside
        # ordinary Sinhala conjunct letters, so this rule may discard real
        # lyric lines - confirm that dropping them is intended.
        if line and "\u200d" not in line:
            kept.append(line)
    song["song_lyrics"] = "\n".join(kept)
    return song
# Translate the artist, composer and lyricist names and the genre to Sinhala
def translate(song):
    """Translate the Artist, Music, Lyrics and Genre fields to Sinhala.

    Each field may be a single string or a list of strings; a list is
    replaced by a new list of translated strings. Every string is stripped
    before translation. Strings that are empty after stripping are stored
    as "" without being passed to the translator.

    Args:
        song: dict containing the four fields; modified in place.

    Returns:
        The same ``song`` dict.

    Raises:
        KeyError: if any of the four fields is missing.
    """
    translator = Translator()

    def to_sinhala(text):
        # Translate one stripped string; empty input stays empty.
        text = text.strip()
        if not text:
            # Fields blanked earlier in the pipeline have nothing to
            # translate, so do not hand them to the translator at all.
            return ""
        return translator.translate(text, dest='sinhala').text

    for key in ("Artist", "Music", "Lyrics", "Genre"):
        value = song[key]
        if isinstance(value, list):
            song[key] = [to_sinhala(item) for item in value]
        else:
            song[key] = to_sinhala(value)
    return song
def process(count=510, source_dir="songs", target_dir="processed"):
    """Run the full clean-up pipeline over the numbered song files.

    For each ``i`` in ``range(count)`` the file ``<source_dir>/<i>.json`` is
    loaded, cleaned and written to ``<target_dir>/<i>.json``.

    Args:
        count: number of files to process; files are numbered from 0.
        source_dir: directory holding the raw JSON files.
        target_dir: directory receiving the processed JSON files; it must
            already exist.
    """
    for i in range(count):
        # The song files contain Sinhala text, so read them as UTF-8
        # explicitly instead of relying on the platform's default encoding.
        with open(source_dir + "/" + str(i) + ".json", encoding="utf-8") as json_file:
            song = json.load(json_file)

        # Rename the field named Tags to Genre.
        if "Tags" in song:
            song['Genre'] = song.pop('Tags')

        # Blank out (set to "") the fields that are missing or unknown.
        song = fill_missing(song)
        # Replace dots in names by spaces.
        song = remove_dots_in_names(song)
        # Keep only the Sinhala part of the title.
        song = separate_title(song)
        # Clean the beat field.
        song = clean_beat(song)
        # Translate the relevant fields to Sinhala.
        song = translate(song)

        # Convert the counters to int; the visit count may contain
        # thousands separators.
        song['number_of_visits'] = int(song['number_of_visits'].replace(',', ''))
        song['number_of_shares'] = int(song['number_of_shares'])

        # Clean the song lyrics.
        song = clean_lyrics(song)

        # guitar_key: when it is a list keep its last entry, then normalise.
        if isinstance(song["guitar_key"], list):
            song["guitar_key"] = song["guitar_key"][-1]
        song["guitar_key"] = song["guitar_key"].strip().lower()

        # Write the processed record. json.dump escapes non-ASCII by
        # default, so the bytes written are the same as before.
        with open(target_dir + '/' + str(i) + '.json', 'w', encoding="utf-8") as f:
            json.dump(song, f)
# Run the whole pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    process()