diff --git a/pom.xml b/pom.xml
index a3906ab..6d0dfca 100644
--- a/pom.xml
+++ b/pom.xml
@@ -114,14 +114,14 @@
com.google.guava
guava
- 33.2.0-jre
+ 33.2.1-jre
org.apache.commons
commons-compress
- 1.26.1
+ 1.26.2
@@ -156,7 +156,7 @@
io.minio
minio
- 8.5.10
+ 8.5.11
com.google.guava
diff --git a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java
index 89aef97..8285e3d 100644
--- a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java
+++ b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java
@@ -88,7 +88,7 @@ public class ConnSupportUtils
public static final ConcurrentHashMap domainsWithConnectionData = new ConcurrentHashMap<>();
- public static String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0"; // This should not be "final", another program, using this software as a library, should be able to set its own "UserAgent".
+ public static String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0"; // This should not be "final", another program, using this software as a library, should be able to set its own "UserAgent".
public static String acceptLanguage = "en-US,en;q=0.5";
diff --git a/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java b/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java
index 6a6b6e1..dad991d 100644
--- a/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java
+++ b/src/main/java/eu/openaire/publications_retriever/util/http/HttpConnUtils.java
@@ -191,7 +191,7 @@ else if ( calledForPageUrl ) { // Visit this url only if this method was called
return false;
}
else if ( (lowerCaseMimeType != null) && ((lowerCaseMimeType.contains("htm") || (lowerCaseMimeType.contains("text") && !lowerCaseMimeType.contains("xml") && !lowerCaseMimeType.contains("csv") && !lowerCaseMimeType.contains("tsv")))) ) // The content-disposition is non-usable in the case of pages.. it's probably not provided anyway.
- // TODO - Better make a regex for the above checks..
+ // TODO - Better make a regex for the above checks.. (be careful to respect the "||" and "&&" operators)
PageCrawler.visit(urlId, sourceUrl, finalUrlStr, mimeType, conn, firstHtmlLine, bufferedReader);
else {
logger.warn("Non-pageUrl: \"" + finalUrlStr + "\" with mimeType: \"" + mimeType + "\" will not be visited!");
diff --git a/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java b/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java
index d597eca..ee15726 100644
--- a/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java
+++ b/src/main/java/eu/openaire/publications_retriever/util/url/UrlTypeChecker.java
@@ -53,10 +53,10 @@ public class UrlTypeChecker
public static final Pattern INTERNAL_LINKS_FILE_FORMAT_FILTER = Pattern.compile(".+format=(?:xml|" + htOrPhpExtensionsPattern + "|rss|ris|bib).*"); // This exists as a url-parameter.
- public static final Pattern SPECIFIC_DOMAIN_FILTER = Pattern.compile("[^/]+://[^/]*(?:(?