Skip to content

Commit

Permalink
Created an optimized version of FindOldestDate + StyleCop fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
zmarty committed Sep 17, 2016
1 parent 8febdb5 commit 5906df6
Show file tree
Hide file tree
Showing 7 changed files with 218 additions and 30 deletions.
23 changes: 17 additions & 6 deletions WhoisNormalization.Tests/NormalizationUtilsTests.cs
Expand Up @@ -26,14 +26,25 @@ namespace Microsoft.Geolocation.Whois.Normalization.Tests
public class NormalizationUtilsTests
{
[TestMethod]
public void TestFindOldestDate()
public void TestFindOldestDateOptimized()
{
Assert.IsNull(NormalizationUtils.FindOldestDate(null), "The extracted date should be null because the input text is null");
Assert.IsNull(NormalizationUtils.FindOldestDate(" kenken@sfc.wide.ad.jp 2000021 kenken@sfc.wide.ad.jp 2000210 kenken@sfc.wide.ad.jp 2000-0-10 "), "The extracted date should be null because the input text is invalid");
Assert.IsNull(NormalizationUtils.FindOldestDateOptimized(null), "The extracted date should be null because the input text is null");
Assert.IsNull(NormalizationUtils.FindOldestDateOptimized(" kenken@sfc.wide.ad.jp 2000021 kenken@sfc.wide.ad.jp 2000210 kenken@sfc.wide.ad.jp 2000-0-10 "), "The extracted date should be null because the input text is invalid");

Assert.AreEqual("2000-02-10", NormalizationUtils.FindOldestDate(" kenken@sfc.wide.ad.jp 20000210 "), "The extracted date should be 2000-02-10");
Assert.AreEqual("2005-02-05", NormalizationUtils.FindOldestDate("ripe-dbm@ripe.net 20010724 hostmaster@ripe.net 20011024 hostmaster@ripe.net 20020805 ripe-dbm@ripe.net 20040503 ripe-dbm@ripe.net 20041229 hostmaster@afrinic.net 20050205"), "The extracted date should be 2005-02-05");
Assert.AreEqual("1987-07-08", NormalizationUtils.FindOldestDate("1987-07-08"), "The extracted date should be 1987-07-08");
Assert.AreEqual("2000-02-10", NormalizationUtils.FindOldestDateOptimized(" kenken@sfc.wide.ad.jp 20000210 "), "The extracted date should be 2000-02-10");
Assert.AreEqual("2005-02-05", NormalizationUtils.FindOldestDateOptimized("ripe-dbm@ripe.net 20010724 hostmaster@ripe.net 20011024 hostmaster@ripe.net 20020805 ripe-dbm@ripe.net 20040503 ripe-dbm@ripe.net 20041229 hostmaster@afrinic.net 20050205"), "The extracted date should be 2005-02-05");
Assert.AreEqual("1987-07-08", NormalizationUtils.FindOldestDateOptimized("1987-07-08"), "The extracted date should be 1987-07-08");
}

/// <summary>
/// Checks NormalizationUtils.FindOldestDateSlow against null input, input without any
/// well-formed date, and inputs containing one or several well-formed dates.
/// </summary>
[TestMethod]
public void TestFindOldestDateSlow()
{
    // Inputs for which no date can be extracted.
    string missingText = null;
    string malformedText = " kenken@sfc.wide.ad.jp 2000021 kenken@sfc.wide.ad.jp 2000210 kenken@sfc.wide.ad.jp 2000-0-10 ";

    string fromMissing = NormalizationUtils.FindOldestDateSlow(missingText);
    Assert.IsNull(fromMissing, "The extracted date should be null because the input text is null");

    string fromMalformed = NormalizationUtils.FindOldestDateSlow(malformedText);
    Assert.IsNull(fromMalformed, "The extracted date should be null because the input text is invalid");

    // Inputs that carry at least one well-formed date.
    string singleCompact = NormalizationUtils.FindOldestDateSlow(" kenken@sfc.wide.ad.jp 20000210 ");
    Assert.AreEqual("2000-02-10", singleCompact, "The extracted date should be 2000-02-10");

    string severalCompact = NormalizationUtils.FindOldestDateSlow("ripe-dbm@ripe.net 20010724 hostmaster@ripe.net 20011024 hostmaster@ripe.net 20020805 ripe-dbm@ripe.net 20040503 ripe-dbm@ripe.net 20041229 hostmaster@afrinic.net 20050205");
    Assert.AreEqual("2005-02-05", severalCompact, "The extracted date should be 2005-02-05");

    string singleDashed = NormalizationUtils.FindOldestDateSlow("1987-07-08");
    Assert.AreEqual("1987-07-08", singleDashed, "The extracted date should be 1987-07-08");
}
}
}
195 changes: 186 additions & 9 deletions WhoisNormalization/NormalizationUtils.cs
Expand Up @@ -7,14 +7,19 @@
namespace Microsoft.Geolocation.Whois.Normalization
{
using System;
using System.Linq;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text;
using Parsers;
using System.Globalization;

public static class NormalizationUtils
{
private static string[] dateFormats = { "yyyy-MM-dd", "yyyyMMdd" };
private static string dateOutputFormat = "yyyy-MM-dd";
private static char[] dateWordsSplitChars = new char[] { ' ', '\t', '\r', '\n' };
private static CultureInfo dateCultureInfo = new CultureInfo("en-US");

private static HashSet<string> updatedFields = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
"Updated",
Expand Down Expand Up @@ -104,8 +109,8 @@ public static void ExtractCommonRecordMetadata(RawWhoisSection section, string i
{
target.Id = id;
target.Name = FindFirstMatchingFieldValueInRecords(section, nameFieldNames);
target.Created = FindOldestDate(FindFirstMatchingFieldValueInRecords(section, createdFields));
target.Updated = FindOldestDate(FindFirstMatchingFieldValueInRecords(section, updatedFields));
target.Created = FindOldestDateOptimized(FindFirstMatchingFieldValueInRecords(section, createdFields));
target.Updated = FindOldestDateOptimized(FindFirstMatchingFieldValueInRecords(section, updatedFields));
target.UpdatedBy = FindFirstMatchingFieldValueInRecords(section, updatedByFields);
target.Description = FindFirstMatchingFieldValueInRecords(section, descriptionFields);
target.Comment = FindFirstMatchingFieldValueInRecords(section, commentFields);
Expand All @@ -130,26 +135,87 @@ public static void AddToBuilderWithComma(StringBuilder builder, string text)
}
}

public static string FindOldestDate(string text)
/// <summary>
/// Tries to interpret a single token as a date, first in the compact form handled by
/// <c>ExtractDateExactNoDash</c> (example: 20101112) and then in the dashed form handled
/// by <c>ExtractDateExactDash</c> (example: 2010-11-12).
/// </summary>
/// <param name="text">The token to interpret; may be null.</param>
/// <returns>
/// The date produced by the first helper that recognizes <paramref name="text"/>, or null
/// when <paramref name="text"/> is null or neither helper recognizes it.
/// </returns>
public static DateTime? ExtractDateExact(string text)
{
    if (text == null)
    {
        return null;
    }

    // The two forms have different lengths (8 vs. 10 characters), so at most one helper
    // can succeed; the dashed helper only runs when the compact one returned null.
    return ExtractDateExactNoDash(text) ?? ExtractDateExactDash(text);
}

/// <summary>
/// Scans whitespace-separated words of <paramref name="text"/> for dates recognized by
/// <see cref="ExtractDateExact"/> and returns one of them formatted with
/// <c>dateOutputFormat</c>.
/// </summary>
/// <remarks>
/// Despite the name, the date that is kept is the LATEST one found: a candidate replaces
/// the current result whenever the current result is earlier than the candidate.
/// </remarks>
/// <param name="text">Free text that may contain dates; may be null.</param>
/// <returns>
/// The formatted date, or null when <paramref name="text"/> is null or contains no word
/// that is recognized as a date.
/// </returns>
public static string FindOldestDateOptimized(string text)
{
    if (text == null)
    {
        return null;
    }

    DateTime? selectedDate = null;

    // Empty entries are dropped by Split itself, so no intermediate lists are needed.
    foreach (var rawWord in text.Split(dateWordsSplitChars, StringSplitOptions.RemoveEmptyEntries))
    {
        // Trim removes whitespace characters that are not among the split characters;
        // a word that becomes empty is simply not recognized as a date.
        DateTime? candidate = ExtractDateExact(rawWord.Trim());

        if (candidate.HasValue && (!selectedDate.HasValue || selectedDate.Value < candidate.Value))
        {
            selectedDate = candidate;
        }
    }

    if (!selectedDate.HasValue)
    {
        return null;
    }

    // Format with the invariant culture so the output does not depend on the calendar
    // of the thread's current culture.
    return selectedDate.Value.ToString(dateOutputFormat, CultureInfo.InvariantCulture);
}

public static string FindOldestDateSlow(string text)
{
if (text == null)
{
return null;
}

var words = new List<string>(text.Split(dateWordsSplitChars));
words = words.Select(word => word.Trim()).ToList<string>();
words = words.Where(word => word.Length > 0).ToList<string>();

DateTime? oldestParsedDate = null;

foreach (var word in words)
{
DateTime currentParsedDate;

if (DateTime.TryParseExact(s: word, formats: dateFormats, provider: new CultureInfo("en-US"), style: DateTimeStyles.None, result: out currentParsedDate))
if (DateTime.TryParseExact(s: word, formats: dateFormats, provider: dateCultureInfo, style: DateTimeStyles.None, result: out currentParsedDate))
{
if (oldestParsedDate == null)
{
Expand All @@ -164,12 +230,123 @@ public static string FindOldestDate(string text)

if (oldestParsedDate.HasValue)
{
return oldestParsedDate.Value.ToString("yyyy-dd-MM");
return oldestParsedDate.Value.ToString(dateOutputFormat);
}
else
{
return null;
}
}

/// <summary>
/// Parses a compact eight-digit date of the form yyyyMMdd (example: 20101112).
/// </summary>
/// <param name="text">The candidate token; may be null.</param>
/// <returns>
/// The parsed date, or null when <paramref name="text"/> is null, is not exactly eight
/// ASCII digits, starts with '0', or names a month or day that does not exist.
/// </returns>
private static DateTime? ExtractDateExactNoDash(string text)
{
    // Example: 20101112
    if (text == null || text.Length != 8)
    {
        return null;
    }

    foreach (var c in text)
    {
        // Only ASCII digits can form a date; char.IsNumber would also let through
        // other Unicode numeric characters.
        if (c < '0' || c > '9')
        {
            return null;
        }
    }

    // Years with a leading zero are not accepted.
    if (text[0] == '0')
    {
        return null;
    }

    // All characters are ASCII digits at this point, so parsing cannot fail.
    var year = int.Parse(text.Substring(0, 4), NumberStyles.None, CultureInfo.InvariantCulture);
    var month = int.Parse(text.Substring(4, 2), NumberStyles.None, CultureInfo.InvariantCulture);
    var day = int.Parse(text.Substring(6, 2), NumberStyles.None, CultureInfo.InvariantCulture);

    // The DateTime constructor throws ArgumentOutOfRangeException for impossible dates
    // (month 13, February 30, ...); validate first so such tokens yield null instead.
    if (month < 1 || month > 12)
    {
        return null;
    }

    if (day < 1 || day > DateTime.DaysInMonth(year, month))
    {
        return null;
    }

    return new DateTime(year, month, day);
}

/// <summary>
/// Parses a dashed ten-character date of the form yyyy-MM-dd (example: 2010-11-12).
/// </summary>
/// <param name="text">The candidate token; may be null.</param>
/// <returns>
/// The parsed date, or null when <paramref name="text"/> is null, does not have exactly
/// the shape dddd-dd-dd (d being an ASCII digit), starts with '0', or names a month or
/// day that does not exist.
/// </returns>
private static DateTime? ExtractDateExactDash(string text)
{
    // Example: 2010-11-12
    if (text == null || text.Length != 10)
    {
        return null;
    }

    for (var i = 0; i < text.Length; i++)
    {
        var c = text[i];

        switch (i)
        {
            case 4: // First dash
            case 7: // Second dash
                if (c != '-')
                {
                    return null;
                }

                break;
            default:
                // Only ASCII digits can form a date; char.IsNumber would also let
                // through other Unicode numeric characters.
                if (c < '0' || c > '9')
                {
                    return null;
                }

                break;
        }
    }

    // Years with a leading zero are not accepted.
    if (text[0] == '0')
    {
        return null;
    }

    // The shape check above guarantees these substrings are ASCII digits, so parsing
    // cannot fail.
    var year = int.Parse(text.Substring(0, 4), NumberStyles.None, CultureInfo.InvariantCulture);
    var month = int.Parse(text.Substring(5, 2), NumberStyles.None, CultureInfo.InvariantCulture);
    var day = int.Parse(text.Substring(8, 2), NumberStyles.None, CultureInfo.InvariantCulture);

    // The DateTime constructor throws ArgumentOutOfRangeException for impossible dates
    // (month 13, February 30, ...); validate first so such tokens yield null instead.
    if (month < 1 || month > 12)
    {
        return null;
    }

    if (day < 1 || day > DateTime.DaysInMonth(year, month))
    {
        return null;
    }

    return new DateTime(year, month, day);
}
}
}
14 changes: 7 additions & 7 deletions WhoisNormalization/NormalizedLocation.cs
Expand Up @@ -13,13 +13,6 @@ namespace Microsoft.Geolocation.Whois.Normalization

public class NormalizedLocation
{
static NormalizedLocation()
{
allBlacklistedValues = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
allBlacklistedValues.UnionWith(blacklistedValuesSimilarToCountries);
allBlacklistedValues.UnionWith(blacklistedValuesExceptSimilarToCountries);
}

private static HashSet<string> allBlacklistedValues;

private static HashSet<string> blacklistedValuesExceptSimilarToCountries = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
Expand Down Expand Up @@ -156,6 +149,13 @@ static NormalizedLocation()
"Customer Country Code"
};

/// <summary>
/// Builds the combined, case-insensitive set of blacklisted values from the two
/// separate blacklist sets.
/// </summary>
static NormalizedLocation()
{
    // Seed with the values similar to countries, then merge in the remaining ones;
    // this is the same insertion order as two successive unions into an empty set.
    var combined = new HashSet<string>(blacklistedValuesSimilarToCountries, StringComparer.OrdinalIgnoreCase);
    combined.UnionWith(blacklistedValuesExceptSimilarToCountries);

    allBlacklistedValues = combined;
}

public string Address { get; set; }

public string Street { get; set; }
Expand Down
2 changes: 1 addition & 1 deletion WhoisTsvExport/ApnicTsvWriter.cs
Expand Up @@ -73,7 +73,7 @@ protected new void NetworksWithLocationsToTsv(WhoisParser parser, string inputFi
outputFile.WriteLine(networkTsv);
}
}
// TODO: Else log
//// TODO: Else log
}
}
}
Expand Down
6 changes: 3 additions & 3 deletions WhoisTsvExport/RWhoisTsvWriter.cs
Expand Up @@ -75,7 +75,7 @@ public void NetworksWithLocationsToSeparateTsv(string inputFolderPath, string ou
var networkTsv = network.ToLocationTsv();
outputFile.WriteLine(networkTsv);
}
// TODO: else log
//// TODO: else log
}
}
}
Expand Down Expand Up @@ -104,7 +104,7 @@ public void NetworksWithLocationsToTsv(string inputFolderPath, string outputFile
var networkTsv = network.ToLocationTsv();
outputFile.WriteLine(networkTsv);
}
// TODO: else log
//// TODO: else log
}
}
}
Expand Down Expand Up @@ -165,7 +165,7 @@ public void NetworksLocationPropertyCountsToTsv(string inputFolderPath, string p
stringsCount[value] = currentCount;
}
}
// TODO: else log
//// TODO: else log
}
}

Expand Down
4 changes: 2 additions & 2 deletions WhoisTsvExport/TsvWriter.cs
Expand Up @@ -74,7 +74,7 @@ protected void NetworksWithLocationsToTsv(WhoisParser parser, string inputFilePa
var networkTsv = network.ToLocationTsv();
outputFile.WriteLine(networkTsv);
}
// TODO: Else log
//// TODO: Else log
}
}
}
Expand Down Expand Up @@ -131,7 +131,7 @@ protected void NetworksLocationPropertyCountsToTsv(WhoisParser parser, string in
stringsCount[value] = currentCount;
}
}
// TODO: Else log
//// TODO: Else log
}

using (var outputFile = new StreamWriter(outputFilePath))
Expand Down
4 changes: 2 additions & 2 deletions nuget/WhoisParsers.nuspec
Expand Up @@ -2,7 +2,7 @@
<package >
<metadata>
<id>WhoisParsers</id>
<version>0.1.7</version>
<version>0.1.8</version>
<title>Whois and RWhois Parsers and Crawlers</title>
<authors>Ovidiu Dan</authors>
<owners>Ovidiu Dan</owners>
Expand All @@ -11,7 +11,7 @@
<requireLicenseAcceptance>false</requireLicenseAcceptance>
<summary>Download and parse Whois records from bulk whois database dumps of IANA organizations (ARIN, AFRINIC, APNIC, LACNIC, RIPE ). Crawl and parse RWhois records from RFC 2167 ARIN Referral Whois Servers.</summary>
<description>This allows you to: 1) Download and parse Whois records from bulk whois database dumps of IANA organizations (ARIN, AFRINIC, APNIC, LACNIC, RIPE ) and 2) Crawl and parse RWhois records from RFC 2167 ARIN Referral Whois Servers. It also provides utilities to increment IP addresses and to output Whois databases in TSV format.</description>
<releaseNotes>Added date parsing for Updated and Created fields</releaseNotes>
<releaseNotes>Created an optimized version of FindOldestDate</releaseNotes>
<copyright>Copyright Microsoft</copyright>
<tags>whois rwhois parser parsers crawling arin afrinic apnic lacnic ripe iana bulk database databases rfc 2167 referral servers download unpack decompress tsv</tags>
<dependencies>
Expand Down

0 comments on commit 5906df6

Please sign in to comment.