From 4012eda63d33f4800f3a3e80a0a43c489d1d8766 Mon Sep 17 00:00:00 2001 From: jamesonuk Date: Thu, 7 Mar 2013 16:42:37 +0000 Subject: [PATCH] Cleaned code so that artist and album strings are handled consistently. --- .../InternetLookups/AllmusicSiteScraper.cs | 133 +++++++++--------- 1 file changed, 66 insertions(+), 67 deletions(-) diff --git a/mediaportal/Databases/Music/InternetLookups/AllmusicSiteScraper.cs b/mediaportal/Databases/Music/InternetLookups/AllmusicSiteScraper.cs index da3cf82ed41..6a18bf8d365 100644 --- a/mediaportal/Databases/Music/InternetLookups/AllmusicSiteScraper.cs +++ b/mediaportal/Databases/Music/InternetLookups/AllmusicSiteScraper.cs @@ -51,7 +51,7 @@ public class AllmusicSiteScraper #region public method - public bool GetArtists(string strArtist, out List artists) + public bool GetArtists(string strArtist, out List artists) { Log.Debug("AllmusicScraper. Searching-Artist: {0}", strArtist); artists = new List(); @@ -65,34 +65,49 @@ public bool GetArtists(string strArtist, out List artists) } var matches = ArtistURLRegEx.Matches(artistSearchHtml); - var strCleanArtist = CleanArtist(strArtist); + var strCleanArtist = CleanString(strArtist); //TODO needs image url in regexp artists.AddRange(from Match m in matches - let strCleanMatch = CleanArtist(m.Groups["artist"].ToString()) + let strCleanMatch = CleanString(m.Groups["artist"].ToString()) let strYearsActive = m.Groups["years"].ToString().Trim() let strArtistUrl = m.Groups["artistURL"].ToString() let strGenre = m.Groups["genres"].ToString() where strCleanArtist == strCleanMatch - where ! string.IsNullOrEmpty(strYearsActive) - select new AllMusicArtistMatch { Artist = strArtist, YearsActive = strYearsActive, ArtistUrl = strArtistUrl, Genre = strGenre}); - - if(artists.Count == 0) + where !string.IsNullOrEmpty(strYearsActive) + select + new AllMusicArtistMatch + { + Artist = strArtist, + YearsActive = strYearsActive, + ArtistUrl = strArtistUrl, + Genre = strGenre + }); + + if (artists.Count == 0) { // still possible that search returned values but none match our artist // try again but this time do not include years active - artists.AddRange(from Match m in matches - let strCleanMatch = CleanArtist(m.Groups["artist"].ToString()) - let strYearsActive = m.Groups["years"].ToString().Trim() - let strArtistUrl = m.Groups["artistURL"].ToString() + artists.AddRange(from Match m in matches + let strCleanMatch = CleanString(m.Groups["artist"].ToString()) + let strYearsActive = m.Groups["years"].ToString().Trim() + let strArtistUrl = m.Groups["artistURL"].ToString() let strGenre = m.Groups["genres"].ToString() - where strCleanArtist == strCleanMatch - select new AllMusicArtistMatch {Artist = strArtist, YearsActive = strYearsActive, ArtistUrl = strArtistUrl, Genre = strGenre }); + where strCleanArtist == strCleanMatch + select + new AllMusicArtistMatch + { + Artist = strArtist, + YearsActive = strYearsActive, + ArtistUrl = strArtistUrl, + Genre = strGenre + }); } - Log.Debug("AllmusicScraper. Searched-Artist: {0} Found: {1} matches", strArtist, artists.Count.ToString(CultureInfo.InvariantCulture)); + Log.Debug("AllmusicScraper. Searched-Artist: {0} Found: {1} matches", strArtist, + artists.Count.ToString(CultureInfo.InvariantCulture)); - return artists.Count != 0; + return artists.Count != 0; } public bool GetArtistHtml(AllMusicArtistMatch allMusicArtistMatch, out string strHTML) @@ -112,8 +127,13 @@ public bool GetAlbumHtml(string strAlbum, string strArtistUrl, out string strHtm strURL = strArtistUrl + "/overview/compilations#discography"; if (!GetAlbumURL(strURL, strAlbum, out strAlbumURL)) { - Log.Debug("AllmusicScraper. Searching-Album: {0} - not found", strAlbum); - return false; + Log.Debug("AllmusicScraper. Searching-Album: {0} - not found in compilaitions. Checking singles & EPs", strAlbum); + strURL = strArtistUrl + "/overview/singles#discography"; + if (!GetAlbumURL(strURL, strAlbum, out strAlbumURL)) + { + Log.Debug("AllmusicScraper. Searching-Album: {0} - not found", strAlbum); + return false; + } } } @@ -133,53 +153,17 @@ private static bool GetAlbumURL(string strArtistURL, string strAlbum, out string return false; } - strAlbum = EncodeString(strAlbum); + var strCleanAlbum = CleanString(strAlbum); - // build up a list of possible alternatives - - // attempt to remove stack endings (eg. disc2, (CD2) etc) - var strStripStackEnding = strAlbum; - Util.Utils.RemoveStackEndings(ref strStripStackEnding); - // try and remove any thing else in brackets at end of album name - // eg. (remastered), (special edition), (vinyl) etc - var strAlbumRemoveBrackets = BracketRegEx.Replace(strAlbum, "$1").Trim(); - // try and repalce all punctuation to try and get a match - // sometimes you have three dots in one format but two in another - var strRemovePunctuation = PunctuationRegex.Replace(strAlbum, "").Trim(); - // replace & and + with "and" - var strAndAlbum = strAlbum.Replace("&", "and").Replace("+", "and"); - - var albumFound = false; for (var m = AlbumURLRegEx.Match(discHTML); m.Success; m = m.NextMatch()) { - var strFoundValue = m.Groups["albumName"].ToString().ToLower(); - strFoundValue = System.Web.HttpUtility.HtmlDecode(strFoundValue); - strFoundValue = EncodeString(strFoundValue); - var strFoundPunctuation = PunctuationRegex.Replace(strFoundValue, ""); - var strFoundAnd = strFoundValue.Replace("&", "and").Replace("+", "and"); + var strFoundValue = CleanString(m.Groups["albumName"].ToString()); - if (strFoundValue == strAlbum) - { - albumFound = true; - } - else if (strFoundValue == strStripStackEnding) - { - albumFound = true; - } - else if (strFoundValue == strAlbumRemoveBrackets) + if (strFoundValue != strCleanAlbum) { - albumFound = true; + continue; } - else if (strFoundPunctuation == strRemovePunctuation) - { - albumFound = true; - } - else if (strAndAlbum == strFoundAnd) - { - albumFound = true; - } - - if (!albumFound) continue; + strAlbumURL = m.Groups["albumURL"].ToString(); break; } @@ -209,19 +193,34 @@ private static string EncodeString(string strUnclean) } /// - /// Improve changes of matching artist by replacing & and + with "and" on both side of comparison + /// Improve changes of matching artists and albums by replacing & and + with "and" on both side of comparison /// Also remove "The" and normalise output to remove accents and finally html decode /// - /// artist we are searching for + /// artist we are searching for /// Cleaned artist string - private static string CleanArtist(string strArtist) + private static string CleanString(string strUncleanString) { - var strCleanArtist = strArtist.ToLower(); - strCleanArtist = strCleanArtist.Replace("&", "and"); - strCleanArtist = strCleanArtist.Replace("+", "and"); - strCleanArtist = Regex.Replace(strCleanArtist, "^the ", "", RegexOptions.IgnoreCase); - strCleanArtist = System.Web.HttpUtility.HtmlDecode(strCleanArtist); - return EncodeString(strCleanArtist); + var strDecodedString = System.Web.HttpUtility.HtmlDecode(strUncleanString); + + var stFormD = strDecodedString.Normalize(NormalizationForm.FormD); + var sb = new StringBuilder(); + + foreach (var t in from t in stFormD let uc = CharUnicodeInfo.GetUnicodeCategory(t) where uc != UnicodeCategory.NonSpacingMark select t) + { + sb.Append(t); + } + + var strCleanString = sb.ToString().Normalize(NormalizationForm.FormC).ToLower(); + strCleanString = strCleanString.Replace("&", "and"); + strCleanString = strCleanString.Replace("+", "and"); + strCleanString = Regex.Replace(strCleanString, "^the ", "", RegexOptions.IgnoreCase); + // attempt to remove stack endings (eg. disc2, (CD2) etc) + Util.Utils.RemoveStackEndings(ref strCleanString); + // try and remove any thing else in brackets at end of string eg. (remastered), (special edition), (vinyl) etc + strCleanString = BracketRegEx.Replace(strCleanString, "$1"); + // try and repalce all punctuation to try and get a match; sometimes you have three dots in one format but two in another + strCleanString = PunctuationRegex.Replace(strCleanString, ""); + return strCleanString.Trim(); } #endregion