From bc1b1ff6e0317e1fcda993aa83ba987a7716abdb Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Tue, 2 Jul 2024 02:11:12 +0300 Subject: [PATCH] - Improve "SPECIFIC_DOMAIN_FILTER" regex: a) Fix false-positives, b) block an old Twitter domain. - Update dependencies. - Code polishing. --- pom.xml | 6 +++--- .../publications_retriever/util/http/ConnSupportUtils.java | 2 +- .../publications_retriever/util/http/HttpConnUtils.java | 2 +- .../publications_retriever/util/url/UrlTypeChecker.java | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pom.xml b/pom.xml index a3906ab..6d0dfca 100644 --- a/pom.xml +++ b/pom.xml @@ -114,14 +114,14 @@ com.google.guava guava - 33.2.0-jre + 33.2.1-jre org.apache.commons commons-compress - 1.26.1 + 1.26.2 @@ -156,7 +156,7 @@ io.minio minio - 8.5.10 + 8.5.11 com.google.guava diff --git a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java index 89aef97..8285e3d 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java +++ b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java @@ -88,7 +88,7 @@ public class ConnSupportUtils public static final ConcurrentHashMap domainsWithConnectionData = new ConcurrentHashMap<>(); - public static String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0"; // This should not be "final", another program, using this software as a library, should be able to set its own "UserAgent". + public static String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0"; // This should not be "final", another program, using this software as a library, should be able to set its own "UserAgent". 
public static String acceptLanguage = "en-US,en;q=0.5"; diff --git a/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java b/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java index 6a6b6e1..dad991d 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java +++ b/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java @@ -191,7 +191,7 @@ else if ( calledForPageUrl ) { // Visit this url only if this method was called return false; } else if ( (lowerCaseMimeType != null) && ((lowerCaseMimeType.contains("htm") || (lowerCaseMimeType.contains("text") && !lowerCaseMimeType.contains("xml") && !lowerCaseMimeType.contains("csv") && !lowerCaseMimeType.contains("tsv")))) ) // The content-disposition is non-usable in the case of pages.. it's probably not provided anyway. - // TODO - Better make a regex for the above checks.. + // TODO - Better make a regex for the above checks.. (be careful to respect the "||" and "&&" operators) PageCrawler.visit(urlId, sourceUrl, finalUrlStr, mimeType, conn, firstHtmlLine, bufferedReader); else { logger.warn("Non-pageUrl: \"" + finalUrlStr + "\" with mimeType: \"" + mimeType + "\" will not be visited!"); diff --git a/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java b/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java index d597eca..ee15726 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java +++ b/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java @@ -53,10 +53,10 @@ public class UrlTypeChecker public static final Pattern INTERNAL_LINKS_FILE_FORMAT_FILTER = Pattern.compile(".+format=(?:xml|" + htOrPhpExtensionsPattern + "|rss|ris|bib).*"); // This exists as a url-parameter. - public static final Pattern SPECIFIC_DOMAIN_FILTER = Pattern.compile("[^/]+://[^/]*(?:(?