Skip to content

Commit

Permalink
Updated scraping code so that artist names are HTML decoded prior to matching.
Browse files Browse the repository at this point in the history

Album names are now normalized to improve matching on album names which contain accents
  • Loading branch information
jamesonuk committed Mar 28, 2013
1 parent d444707 commit 31022e2
Showing 1 changed file with 13 additions and 12 deletions.
25 changes: 13 additions & 12 deletions mediaportal/Databases/Music/InternetLookups/AllmusicSiteScraper.cs
Expand Up @@ -65,11 +65,11 @@ public bool GetArtists(string strArtist, out List<AllMusicArtistMatch> artists)
}

var matches = ArtistURLRegEx.Matches(artistSearchHtml);
var strCleanArtist = EncodeString(CleanArtist(strArtist));
var strCleanArtist = CleanArtist(strArtist);

//TODO needs image url in regexp
artists.AddRange(from Match m in matches
let strCleanMatch = EncodeString(CleanArtist(m.Groups["artist"].ToString()))
let strCleanMatch = CleanArtist(m.Groups["artist"].ToString())
let strYearsActive = m.Groups["years"].ToString().Trim()
let strArtistUrl = m.Groups["artistURL"].ToString()
let strGenre = m.Groups["genres"].ToString()
Expand All @@ -82,7 +82,7 @@ public bool GetArtists(string strArtist, out List<AllMusicArtistMatch> artists)
// still possible that search returned values but none match our artist
// try again but this time do not include years active
artists.AddRange(from Match m in matches
let strCleanMatch = EncodeString(CleanArtist(m.Groups["artist"].ToString()))
let strCleanMatch = CleanArtist(m.Groups["artist"].ToString())
let strYearsActive = m.Groups["years"].ToString().Trim()
let strArtistUrl = m.Groups["artistURL"].ToString()
let strGenre = m.Groups["genres"].ToString()
Expand Down Expand Up @@ -133,7 +133,7 @@ private static bool GetAlbumURL(string strArtistURL, string strAlbum, out string
return false;
}

strAlbum = strAlbum.ToLower();
strAlbum = EncodeString(strAlbum);

// build up a list of possible alternatives

Expand All @@ -154,22 +154,23 @@ private static bool GetAlbumURL(string strArtistURL, string strAlbum, out string
{
var strFoundValue = m.Groups["albumName"].ToString().ToLower();
strFoundValue = System.Web.HttpUtility.HtmlDecode(strFoundValue);
strFoundValue = EncodeString(strFoundValue);
var strFoundPunctuation = PunctuationRegex.Replace(strFoundValue, "");
var strFoundAnd = strFoundValue.Replace("&", "and").Replace("+", "and");

if (strFoundValue == strAlbum.ToLower())
if (strFoundValue == strAlbum)
{
albumFound = true;
}
else if (strFoundValue == strStripStackEnding.ToLower())
else if (strFoundValue == strStripStackEnding)
{
albumFound = true;
}
else if (strFoundValue == strAlbumRemoveBrackets.ToLower())
else if (strFoundValue == strAlbumRemoveBrackets)
{
albumFound = true;
}
else if (strFoundPunctuation == strRemovePunctuation.ToLower())
else if (strFoundPunctuation == strRemovePunctuation)
{
albumFound = true;
}
Expand Down Expand Up @@ -208,9 +209,8 @@ private static string EncodeString(string strUnclean)
}

/// <summary>
/// Improve chances of matching artist by replacing & and + with "and"
/// on both sides of the comparison
/// Also remove "The"
/// Improve chances of matching artist by replacing & and + with "and" on both sides of the comparison
/// Also remove a leading "The", then HTML decode and finally normalise the output (via EncodeString) to remove accents
/// </summary>
/// <param name="strArtist">artist we are searching for</param>
/// <returns>Cleaned artist string</returns>
Expand All @@ -220,7 +220,8 @@ private static string CleanArtist(string strArtist)
strCleanArtist = strCleanArtist.Replace("&", "and");
strCleanArtist = strCleanArtist.Replace("+", "and");
strCleanArtist = Regex.Replace(strCleanArtist, "^the ", "", RegexOptions.IgnoreCase);
return strCleanArtist;
strCleanArtist = System.Web.HttpUtility.HtmlDecode(strCleanArtist);
return EncodeString(strCleanArtist);
}

#endregion
Expand Down

0 comments on commit 31022e2

Please sign in to comment.