Skip to content

Commit

Permalink
- Improve "SPECIFIC_DOMAIN_FILTER" regex: a) Fix false-positives, b) …
Browse files Browse the repository at this point in the history
…block an old Twitter domain.

- Update dependencies.
- Code polishing.
  • Loading branch information
LSmyrnaios committed Jul 1, 2024
1 parent fec27dc commit bc1b1ff
Show file tree
Hide file tree
Showing 4 changed files with 7 additions and 7 deletions.
6 changes: 3 additions & 3 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -114,14 +114,14 @@
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>33.2.0-jre</version>
<version>33.2.1-jre</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-compress -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.26.1</version>
<version>1.26.2</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.brotli/dec -->
Expand Down Expand Up @@ -156,7 +156,7 @@
<dependency>
<groupId>io.minio</groupId>
<artifactId>minio</artifactId>
<version>8.5.10</version>
<version>8.5.11</version>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ public class ConnSupportUtils

public static final ConcurrentHashMap<String, DomainConnectionData> domainsWithConnectionData = new ConcurrentHashMap<>();

public static String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0"; // This should not be "final", another program, using this software as a library, should be able to set its own "UserAgent".
public static String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0"; // This should not be "final", another program, using this software as a library, should be able to set its own "UserAgent".
public static String acceptLanguage = "en-US,en;q=0.5";


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ else if ( calledForPageUrl ) { // Visit this url only if this method was called
return false;
}
else if ( (lowerCaseMimeType != null) && ((lowerCaseMimeType.contains("htm") || (lowerCaseMimeType.contains("text") && !lowerCaseMimeType.contains("xml") && !lowerCaseMimeType.contains("csv") && !lowerCaseMimeType.contains("tsv")))) ) // The content-disposition is non-usable in the case of pages.. it's probably not provided anyway.
// TODO - Better make a regex for the above checks..
// TODO - Better make a regex for the above checks.. (be careful to respect the "||" and "&&" operators)
PageCrawler.visit(urlId, sourceUrl, finalUrlStr, mimeType, conn, firstHtmlLine, bufferedReader);
else {
logger.warn("Non-pageUrl: \"" + finalUrlStr + "\" with mimeType: \"" + mimeType + "\" will not be visited!");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,10 @@ public class UrlTypeChecker

public static final Pattern INTERNAL_LINKS_FILE_FORMAT_FILTER = Pattern.compile(".+format=(?:xml|" + htOrPhpExtensionsPattern + "|rss|ris|bib).*"); // This exists as a url-parameter.

public static final Pattern SPECIFIC_DOMAIN_FILTER = Pattern.compile("[^/]+://[^/]*(?:(?<!drive.)google\\.|goo.gl|gstatic|facebook|fb.me|meta|twitter|x|insta(?:gram|paper)|tiktok|youtube|vimeo|linkedin|ebay|bing|(?:amazon|[./]analytics)\\.|s.w.org|wikipedia|myspace|yahoo|mail|pinterest|reddit|tumblr"
public static final Pattern SPECIFIC_DOMAIN_FILTER = Pattern.compile("[^/]+://[^/]*(?<=[/.])(?:(?<!drive.)google\\.|goo.gl|gstatic|facebook|fb.me|twitter|(?:meta|xing|baidu|t|x|vk).co|insta(?:gram|paper)|tiktok|youtube|vimeo|linkedin|ebay|bing|(?:amazon|[./]analytics)\\.|s.w.org|wikipedia|myspace|yahoo|mail|pinterest|reddit|tumblr"
+ "|www.ccdc.cam.ac.uk|figshare.com/collections/|datadryad.org/stash/dataset/"
+ "|evernote|skype|microsoft|adobe|buffer|digg|stumbleupon|addthis|delicious|dailymotion|gostats|blog(?:ger)?|copyright|friendfeed|newsvine|telegram|getpocket"
+ "|flipboard|line.me|vk|ok.rudouban|baidu|qzone|xing|renren|weibo|doubleclick|bit.ly|github|reviewofbooks|plu.mx"
+ "|flipboard|line.me|ok.rudouban|qzone|renren|weibo|doubleclick|bit.ly|github|reviewofbooks|plu.mx"
+ "|(?<!files.)wordpress|orcid.org"
+ "|auth(?:orize|entication)?\\."

Expand Down

0 comments on commit bc1b1ff

Please sign in to comment.