55 changes: 20 additions & 35 deletions mythtv/programs/scripts/metadata/Music/lyrics/lyricscom.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
# -*- Mode: python; coding: utf-8; tab-width: 8; indent-tabs-mode: t; -*-
"""
Scraper for http://www.lyrdb.com/
Scraper for http://www.lyrics.com/
ronie
"""

import sys
import re
from urllib.parse import quote_plus
from urllib.request import urlopen

import socket
import requests
import urllib.parse
import difflib
from bs4 import BeautifulSoup

import sys
from optparse import OptionParser
from common import utilities

__author__ = "Paul Harrison and 'ronie'"
__author__ = "Paul Harrison and ronie"
__title__ = "Lyrics.Com"
__description__ = "Search http://www.lyrics.com for lyrics"
__priority__ = "240"
Expand All @@ -24,67 +24,52 @@

debug = False

socket.setdefaulttimeout(10)

class LyricsFetcher:
def __init__( self ):
self.url = 'http://www.lyrics.com/serp.php?st=%s&qtype=2'

def get_lyrics(self, lyrics):
utilities.log(debug, "%s: searching lyrics for %s - %s - %s" % (__title__, lyrics.artist, lyrics.album, lyrics.title))

try:
from bs4 import BeautifulSoup
except:
utilities.log(True, "Failed to import BeautifulSoup. This grabber requires python-bs4")
return False
sess = requests.Session()

try:
request = urlopen(self.url % quote_plus(lyrics.artist))
response = request.read()
request = sess.get(self.url % urllib.parse.quote_plus(lyrics.artist), timeout=10)
response = request.text
except:
return False

request.close()
soup = BeautifulSoup(response, 'html.parser')
url = ''
for link in soup.find_all('a'):
if link.string and link.get('href').startswith('artist/'):
url = 'http://www.lyrics.com/' + link.get('href')
url = 'https://www.lyrics.com/' + link.get('href')
break
if url:
utilities.log(debug, "%s: Artist url is %s" % (__title__, url))
try:
req = urlopen(url)
resp = req.read().decode('utf-8')
req = sess.get(url, timeout=10)
resp = req.text
except:
return False
req.close()
soup = BeautifulSoup(resp, 'html.parser')
url = ''
for link in soup.find_all('a'):
if link.string and link.get('href').startswith('/lyric/') and (difflib.SequenceMatcher(None, link.string.lower(), lyrics.title.lower()).ratio() > 0.8):
url = 'http://www.lyrics.com' + link.get('href')
if link.string and (difflib.SequenceMatcher(None, link.string.lower(), lyrics.title.lower()).ratio() > 0.8):
url = 'https://www.lyrics.com' + link.get('href')
break

if url:
utilities.log(debug, "%s: Song url is %s" % (__title__, url))

try:
req2 = urlopen(url)
resp2 = req2.read().decode('utf-8')
req2 = sess.get(url, timeout=10)
resp2 = req2.text
except:
return False
req2.close()

matchcode = re.search(u'<pre.*?>(.*?)</pre>', resp2, flags=re.DOTALL)
matchcode = re.search('<pre.*?>(.*?)</pre>', resp2, flags=re.DOTALL)
if matchcode:
lyricscode = (matchcode.group(1))
lyr = re.sub(u'<[^<]+?>', '', lyricscode)
lyr = re.sub('<[^<]+?>', '', lyricscode)
lyrics.lyrics = lyr.replace('\\n','\n')
return True

return False
return False

def performSelfTest():
try:
Expand Down
114 changes: 41 additions & 73 deletions mythtv/programs/scripts/metadata/Music/lyrics/lyricsmode.py
Original file line number Diff line number Diff line change
@@ -1,98 +1,66 @@
#-*- coding: UTF-8 -*-
import sys

try:
from urllib import urlopen, quote_plus
except ImportError:
from urllib.request import urlopen
from urllib.parse import quote_plus

import requests
import urllib.parse
import re

import sys
from optparse import OptionParser
from common import utilities

__author__ = "Paul Harrison and ronie'"
__author__ = "Paul Harrison and ronie"
__title__ = "LyricsMode"
__description__ = "Search http://www.lyricsmode.com for lyrics"
__priority__ = "170"
__priority__ = "240"
__version__ = "0.1"
__syncronized__ = False

debug = False

class LyricsFetcher:
def __init__( self ):
self.clean_lyrics_regex = re.compile( "<.+?>" )
self.normalize_lyrics_regex = re.compile( "&#[x]*(?P<name>[0-9]+);*" )
self.clean_br_regex = re.compile( r"<br[ /]*>[\s]*", re.IGNORECASE )
self.search_results_regex = re.compile("<a href=\"[^\"]+\">([^<]+)</a></td>[^<]+<td><a href=\"([^\"]+)\" class=\"b\">[^<]+</a></td>", re.IGNORECASE)
self.next_results_regex = re.compile("<A href=\"([^\"]+)\" class=\"pages\">next .</A>", re.IGNORECASE)
return

def get_lyrics(self, lyrics):
utilities.log(debug, "%s: searching lyrics for %s - %s - %s" % (__title__, lyrics.artist, lyrics.album, lyrics.title))

artist = utilities.deAccent(lyrics.artist)
title = utilities.deAccent(lyrics.title)
try: # below is borowed from XBMC Lyrics
url = "http://www.lyricsmode.com/lyrics/%s/%s/%s.html" % (artist.lower()[:1], artist.lower().replace("&","and").replace(" ","_"), title.lower().replace("&","and").replace(" ","_"))
lyrics_found = False
while True:
utilities.log(debug, "%s: search url: %s" % (__title__, url))
song_search = urlopen(url).read()
if song_search.find("<p id=\"lyrics_text\" class=\"ui-annotatable\">") >= 0:
break

if lyrics_found:
# if we're here, we found the lyrics page but it didn't
# contains the lyrics part (licensing issue or some bug)
return False

# Let's try to use the research box if we didn't yet
if not 'search' in url:
url = "http://www.lyricsmode.com/search.php?what=songs&s=" + quote_plus(title.lower())
else:
# the search gave more than on result, let's try to find our song
url = ""
start = song_search.find('<!--output-->')
end = song_search.find('<!--/output-->', start)
results = self.search_results_regex.findall(song_search, start, end)

for result in results:
if result[0].lower() in artist.lower():
url = "http://www.lyricsmode.com" + result[1]
lyrics_found = True
break

if not url:
# Is there a next page of results ?
match = self.next_results_regex.search(song_search[end:])
if match:
url = "http://www.lyricsmode.com/search.php" + match.group(1)
else:
return False

lyr = song_search.split("<p id=\"lyrics_text\" class=\"ui-annotatable\">")[1].split('</p><p id=\"lyrics_text_selected\">')[0]
lyr = self.clean_br_regex.sub( "\n", lyr ).strip()
lyr = self.clean_lyrics_regex.sub( "", lyr ).strip()
lyr = self.normalize_lyrics_regex.sub( lambda m: unichr( int( m.group( 1 ) ) ), lyr.decode("ISO-8859-1") )
lir = []
for line in lyr.splitlines():
line.strip()
if line.find("Lyrics from:") < 0:
lir.append(line)
lyr = u"\n".join( lir )
if lyr.startswith('These lyrics are missing'):
return False
lyrics.lyrics = lyr
url = 'http://www.lyricsmode.com/lyrics/%s/%s/%s.html' % (artist.lower()[:1], artist.lower().replace('&','and').replace(' ','_'), title.lower().replace('&','and').replace(' ','_'))
result = self.direct_url(url)
if not result:
result = self.search_url(artist, title)
if result:
lyr = result.split('style="position: relative;">')[1].split('<div')[0]
lyrics.lyrics = lyr.replace('<br />', '')
return True

def direct_url(self, url):
try:
utilities.log(True, '%s: direct url: %s' % (__title__, url))
song_search = requests.get(url, timeout=10)
response = song_search.text
if response.find('lyrics_text') >= 0:
return response
except:
utilities.log(True, "%s: %s::%s (%d) [%s]" % (
__title__, self.__class__.__name__,
sys.exc_info()[ 2 ].tb_frame.f_code.co_name,
sys.exc_info()[ 2 ].tb_lineno,
sys.exc_info()[ 1 ]
))
return False
utilities.log(True, 'error in direct url')

def search_url(self, artist, title):
try:
url = 'http://www.lyricsmode.com/search.php?search=' + urllib.parse.quote_plus(artist.lower() + ' ' + title.lower())
utilities.log(True, '%s: search url: %s' % (__title__, url))
song_search = requests.get(url, timeout=10)
response = song_search.text
matchcode = re.search('lm-list__cell-title">.*?<a href="(.*?)" class="lm-link lm-link--primary', response, flags=re.DOTALL)
try:
url = 'http://www.lyricsmode.com' + (matchcode.group(1))
result = self.direct_url(url)
if result:
return result
except:
return
except:
utilities.log(True, 'error in search url')


def performSelfTest():
found = False
Expand Down