From 911446260db2707ff8d8e1dff3a24cd75957a084 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 9 Apr 2025 18:23:28 -0400 Subject: [PATCH 1/3] potential memory leaks --- .../iq/dataverse/export/DDIExporter.java | 16 +- .../dataverse/export/ddi/DdiExportUtil.java | 81 +++++--- .../dublincore/DublinCoreExportUtil.java | 56 +++--- .../export/openaire/OpenAireExportUtil.java | 29 ++- .../dataverse/ingest/IngestServiceBean.java | 178 +++++++++--------- 5 files changed, 213 insertions(+), 147 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/export/DDIExporter.java b/src/main/java/edu/harvard/iq/dataverse/export/DDIExporter.java index d48ce3a537d..d44aa8da0ab 100644 --- a/src/main/java/edu/harvard/iq/dataverse/export/DDIExporter.java +++ b/src/main/java/edu/harvard/iq/dataverse/export/DDIExporter.java @@ -15,6 +15,9 @@ import jakarta.json.JsonObject; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamWriter; + +import org.apache.solr.common.util.IOUtils; + import javax.xml.stream.XMLOutputFactory; /** @@ -44,14 +47,25 @@ public String getDisplayName(Locale locale) { @Override public void exportDataset(ExportDataProvider dataProvider, OutputStream outputStream) throws ExportException { + XMLStreamWriter xmlw = null; + //XMLStreamWriter is not auto-closable - can't use try-with-resources here try { - XMLStreamWriter xmlw = XMLOutputFactory.newInstance().createXMLStreamWriter(outputStream); + xmlw = XMLOutputFactory.newInstance().createXMLStreamWriter(outputStream); xmlw.writeStartDocument(); xmlw.flush(); DdiExportUtil.datasetJson2ddi(dataProvider.getDatasetJson(), dataProvider.getDatasetFileDetails(), outputStream); } catch (XMLStreamException xse) { throw new ExportException("Caught XMLStreamException performing DDI export", xse); + } finally { + if (xmlw != null) { + try { + xmlw.close(); + } catch (XMLStreamException e) { + // Log this exception, but don't rethrow as it's not the primary issue + e.printStackTrace(); + } + } } 
} diff --git a/src/main/java/edu/harvard/iq/dataverse/export/ddi/DdiExportUtil.java b/src/main/java/edu/harvard/iq/dataverse/export/ddi/DdiExportUtil.java index 1a02089aef9..b6747cd9f3e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/export/ddi/DdiExportUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/export/ddi/DdiExportUtil.java @@ -117,19 +117,31 @@ private static String dto2ddi(DatasetDTO datasetDto) throws XMLStreamException { } private static void dtoddi(DatasetDTO datasetDto, OutputStream outputStream) throws XMLStreamException { - XMLStreamWriter xmlw = XMLOutputFactory.newInstance().createXMLStreamWriter(outputStream); - xmlw.writeStartElement("codeBook"); - xmlw.writeDefaultNamespace("ddi:codebook:2_5"); - xmlw.writeAttribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance"); - xmlw.writeAttribute("xsi:schemaLocation", DDIExporter.DEFAULT_XML_NAMESPACE + " " + DDIExporter.DEFAULT_XML_SCHEMALOCATION); - xmlw.writeAttribute("version", DDIExporter.DEFAULT_XML_VERSION); - if(DvObjectContainer.isMetadataLanguageSet(datasetDto.getMetadataLanguage())) { - xmlw.writeAttribute("xml:lang", datasetDto.getMetadataLanguage()); - } - createStdyDscr(xmlw, datasetDto); - createOtherMats(xmlw, datasetDto.getDatasetVersion().getFiles()); - xmlw.writeEndElement(); // codeBook - xmlw.flush(); + XMLStreamWriter xmlw = null; + try { + xmlw = XMLOutputFactory.newInstance().createXMLStreamWriter(outputStream); + xmlw.writeStartElement("codeBook"); + xmlw.writeDefaultNamespace("ddi:codebook:2_5"); + xmlw.writeAttribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance"); + xmlw.writeAttribute("xsi:schemaLocation", DDIExporter.DEFAULT_XML_NAMESPACE + " " + DDIExporter.DEFAULT_XML_SCHEMALOCATION); + xmlw.writeAttribute("version", DDIExporter.DEFAULT_XML_VERSION); + if (DvObjectContainer.isMetadataLanguageSet(datasetDto.getMetadataLanguage())) { + xmlw.writeAttribute("xml:lang", datasetDto.getMetadataLanguage()); + } + createStdyDscr(xmlw, datasetDto); + 
createOtherMats(xmlw, datasetDto.getDatasetVersion().getFiles()); + xmlw.writeEndElement(); // codeBook + xmlw.flush(); + } finally { + if (xmlw != null) { + try { + xmlw.close(); + } catch (XMLStreamException e) { + // Log this exception, but don't rethrow as it's in finally block + logger.log(Level.WARNING, "Error closing XMLStreamWriter", e); + } + } + } } @@ -139,21 +151,34 @@ public static void datasetJson2ddi(JsonObject datasetDtoAsJson, JsonArray fileDe Gson gson = new Gson(); DatasetDTO datasetDto = gson.fromJson(datasetDtoAsJson.toString(), DatasetDTO.class); - XMLStreamWriter xmlw = XMLOutputFactory.newInstance().createXMLStreamWriter(outputStream); - xmlw.writeStartElement("codeBook"); - xmlw.writeDefaultNamespace("ddi:codebook:2_5"); - xmlw.writeAttribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance"); - xmlw.writeAttribute("xsi:schemaLocation", DDIExporter.DEFAULT_XML_NAMESPACE + " " + DDIExporter.DEFAULT_XML_SCHEMALOCATION); - xmlw.writeAttribute("version", DDIExporter.DEFAULT_XML_VERSION); - if(DvObjectContainer.isMetadataLanguageSet(datasetDto.getMetadataLanguage())) { - xmlw.writeAttribute("xml:lang", datasetDto.getMetadataLanguage()); - } - createStdyDscr(xmlw, datasetDto); - createFileDscr(xmlw, fileDetails); - createDataDscr(xmlw, fileDetails); - createOtherMatsFromFileMetadatas(xmlw, fileDetails); - xmlw.writeEndElement(); // codeBook - xmlw.flush(); + XMLStreamWriter xmlw = null; + try { + xmlw = XMLOutputFactory.newInstance().createXMLStreamWriter(outputStream); + + xmlw.writeStartElement("codeBook"); + xmlw.writeDefaultNamespace("ddi:codebook:2_5"); + xmlw.writeAttribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance"); + xmlw.writeAttribute("xsi:schemaLocation", DDIExporter.DEFAULT_XML_NAMESPACE + " " + DDIExporter.DEFAULT_XML_SCHEMALOCATION); + xmlw.writeAttribute("version", DDIExporter.DEFAULT_XML_VERSION); + if (DvObjectContainer.isMetadataLanguageSet(datasetDto.getMetadataLanguage())) { + 
xmlw.writeAttribute("xml:lang", datasetDto.getMetadataLanguage()); + } + createStdyDscr(xmlw, datasetDto); + createFileDscr(xmlw, fileDetails); + createDataDscr(xmlw, fileDetails); + createOtherMatsFromFileMetadatas(xmlw, fileDetails); + xmlw.writeEndElement(); // codeBook + xmlw.flush(); + } finally { + if (xmlw != null) { + try { + xmlw.close(); + } catch (XMLStreamException e) { + // Log this exception, but don't rethrow as it's in finally block + logger.log(Level.WARNING, "Error closing XMLStreamWriter", e); + } + } + } } /** diff --git a/src/main/java/edu/harvard/iq/dataverse/export/dublincore/DublinCoreExportUtil.java b/src/main/java/edu/harvard/iq/dataverse/export/dublincore/DublinCoreExportUtil.java index d201801bc45..2bd8b3ca8d4 100644 --- a/src/main/java/edu/harvard/iq/dataverse/export/dublincore/DublinCoreExportUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/export/dublincore/DublinCoreExportUtil.java @@ -24,6 +24,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.logging.Level; import java.util.logging.Logger; import jakarta.json.JsonObject; import javax.xml.stream.XMLOutputFactory; @@ -68,29 +69,40 @@ public static void datasetJson2dublincore(JsonObject datasetDtoAsJson, OutputStr } private static void dto2dublincore(DatasetDTO datasetDto, OutputStream outputStream, String dcFlavor) throws XMLStreamException { - XMLStreamWriter xmlw = XMLOutputFactory.newInstance().createXMLStreamWriter(outputStream); - if (DC_FLAVOR_DCTERMS.equals(dcFlavor)) { - xmlw.writeStartDocument(); - xmlw.writeStartElement("metadata"); - xmlw.writeAttribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance"); - xmlw.writeAttribute("xmlns:dc", DC_XML_NAMESPACE); - xmlw.writeAttribute("xmlns:dcterms", DCTERMS_XML_NAMESPACE); - xmlw.writeDefaultNamespace(DCTERMS_DEFAULT_NAMESPACE); - //xmlw.writeAttribute("xsi:schemaLocation", DCTERMS_DEFAULT_NAMESPACE+" "+DCTERMS_XML_SCHEMALOCATION); - createDC(xmlw, datasetDto, dcFlavor); 
- } else if (DC_FLAVOR_OAI.equals(dcFlavor)) { - xmlw.writeStartElement("oai_dc:dc"); - xmlw.writeAttribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance"); - xmlw.writeAttribute("xmlns:oai_dc", OAI_DC_XML_NAMESPACE); - xmlw.writeAttribute("xmlns:dc", DC_XML_NAMESPACE); - xmlw.writeAttribute("xsi:schemaLocation", OAI_DC_XML_NAMESPACE+" "+OAI_DC_XML_SCHEMALOCATION); - //writeAttribute(xmlw, "version", DEFAULT_XML_VERSION); - createOAIDC(xmlw, datasetDto, dcFlavor); + XMLStreamWriter xmlw = null; + try { + xmlw = XMLOutputFactory.newInstance().createXMLStreamWriter(outputStream); + if (DC_FLAVOR_DCTERMS.equals(dcFlavor)) { + xmlw.writeStartDocument(); + xmlw.writeStartElement("metadata"); + xmlw.writeAttribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance"); + xmlw.writeAttribute("xmlns:dc", DC_XML_NAMESPACE); + xmlw.writeAttribute("xmlns:dcterms", DCTERMS_XML_NAMESPACE); + xmlw.writeDefaultNamespace(DCTERMS_DEFAULT_NAMESPACE); + // xmlw.writeAttribute("xsi:schemaLocation", DCTERMS_DEFAULT_NAMESPACE+" "+DCTERMS_XML_SCHEMALOCATION); + createDC(xmlw, datasetDto, dcFlavor); + } else if (DC_FLAVOR_OAI.equals(dcFlavor)) { + xmlw.writeStartElement("oai_dc:dc"); + xmlw.writeAttribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance"); + xmlw.writeAttribute("xmlns:oai_dc", OAI_DC_XML_NAMESPACE); + xmlw.writeAttribute("xmlns:dc", DC_XML_NAMESPACE); + xmlw.writeAttribute("xsi:schemaLocation", OAI_DC_XML_NAMESPACE + " " + OAI_DC_XML_SCHEMALOCATION); + // writeAttribute(xmlw, "version", DEFAULT_XML_VERSION); + createOAIDC(xmlw, datasetDto, dcFlavor); + } + + xmlw.writeEndElement(); // or + xmlw.flush(); + } finally { + if (xmlw != null) { + try { + xmlw.close(); + } catch (XMLStreamException e) { + // Log this exception, but don't rethrow as it's in finally block + logger.log(Level.WARNING, "Error closing XMLStreamWriter", e); + } + } } - - - xmlw.writeEndElement(); // or - xmlw.flush(); } //UPDATED by rmo-cdsp: diff --git 
a/src/main/java/edu/harvard/iq/dataverse/export/openaire/OpenAireExportUtil.java b/src/main/java/edu/harvard/iq/dataverse/export/openaire/OpenAireExportUtil.java index a2ff980ca28..166ea30461e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/export/openaire/OpenAireExportUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/export/openaire/OpenAireExportUtil.java @@ -2,6 +2,7 @@ import java.io.OutputStream; import java.util.*; +import java.util.logging.Level; import java.util.logging.Logger; import jakarta.json.JsonObject; @@ -49,19 +50,31 @@ public static void datasetJson2openaire(JsonObject datasetDtoAsJson, OutputStrea } private static void dto2openaire(DatasetDTO datasetDto, OutputStream outputStream) throws XMLStreamException { - XMLStreamWriter xmlw = XMLOutputFactory.newInstance().createXMLStreamWriter(outputStream); + XMLStreamWriter xmlw = null; + try { + xmlw = XMLOutputFactory.newInstance().createXMLStreamWriter(outputStream); - xmlw.writeStartElement("resource"); // + xmlw.writeStartElement("resource"); // - xmlw.writeAttribute("xmlns:xsi", XSI_NAMESPACE); - xmlw.writeAttribute("xmlns", RESOURCE_NAMESPACE); - xmlw.writeAttribute("xsi:schemaLocation", RESOURCE_NAMESPACE + " " + RESOURCE_SCHEMA_LOCATION); + xmlw.writeAttribute("xmlns:xsi", XSI_NAMESPACE); + xmlw.writeAttribute("xmlns", RESOURCE_NAMESPACE); + xmlw.writeAttribute("xsi:schemaLocation", RESOURCE_NAMESPACE + " " + RESOURCE_SCHEMA_LOCATION); - createOpenAire(xmlw, datasetDto); + createOpenAire(xmlw, datasetDto); - xmlw.writeEndElement(); // + xmlw.writeEndElement(); // - xmlw.flush(); + xmlw.flush(); + } finally { + if (xmlw != null) { + try { + xmlw.close(); + } catch (XMLStreamException e) { + // Log this exception, but don't rethrow as it's in finally block + logger.log(Level.WARNING, "Error closing XMLStreamWriter", e); + } + } + } } private static void createOpenAire(XMLStreamWriter xmlw, DatasetDTO datasetDto) throws XMLStreamException { diff --git 
a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java index 71c498a4d0b..a79c8f559a4 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java @@ -744,33 +744,34 @@ public void produceContinuousSummaryStatistics(DataFile dataFile, File generated for (int i = 0; i < dataFile.getDataTable().getVarQuantity(); i++) { if (dataFile.getDataTable().getDataVariables().get(i).isIntervalContinuous()) { logger.fine("subsetting continuous vector"); - - if ("float".equals(dataFile.getDataTable().getDataVariables().get(i).getFormat())) { - Float[] variableVector = TabularSubsetGenerator.subsetFloatVector( - new FileInputStream(generatedTabularFile), - i, - dataFile.getDataTable().getCaseQuantity().intValue(), - dataFile.getDataTable().isStoredWithVariableHeader()); - logger.fine("Calculating summary statistics on a Float vector;"); - calculateContinuousSummaryStatistics(dataFile, i, variableVector); - // calculate the UNF while we are at it: - logger.fine("Calculating UNF on a Float vector;"); - calculateUNF(dataFile, i, variableVector); - variableVector = null; - } else { - Double[] variableVector = TabularSubsetGenerator.subsetDoubleVector( - new FileInputStream(generatedTabularFile), - i, - dataFile.getDataTable().getCaseQuantity().intValue(), - dataFile.getDataTable().isStoredWithVariableHeader()); - logger.fine("Calculating summary statistics on a Double vector;"); - calculateContinuousSummaryStatistics(dataFile, i, variableVector); - // calculate the UNF while we are at it: - logger.fine("Calculating UNF on a Double vector;"); - calculateUNF(dataFile, i, variableVector); - variableVector = null; + try (InputStream in = new FileInputStream(generatedTabularFile)) { + if ("float".equals(dataFile.getDataTable().getDataVariables().get(i).getFormat())) { + Float[] variableVector = 
TabularSubsetGenerator.subsetFloatVector( + in, + i, + dataFile.getDataTable().getCaseQuantity().intValue(), + dataFile.getDataTable().isStoredWithVariableHeader()); + logger.fine("Calculating summary statistics on a Float vector;"); + calculateContinuousSummaryStatistics(dataFile, i, variableVector); + // calculate the UNF while we are at it: + logger.fine("Calculating UNF on a Float vector;"); + calculateUNF(dataFile, i, variableVector); + variableVector = null; + } else { + Double[] variableVector = TabularSubsetGenerator.subsetDoubleVector( + in, + i, + dataFile.getDataTable().getCaseQuantity().intValue(), + dataFile.getDataTable().isStoredWithVariableHeader()); + logger.fine("Calculating summary statistics on a Double vector;"); + calculateContinuousSummaryStatistics(dataFile, i, variableVector); + // calculate the UNF while we are at it: + logger.fine("Calculating UNF on a Double vector;"); + calculateUNF(dataFile, i, variableVector); + variableVector = null; + } + logger.fine("Done! (continuous);"); } - logger.fine("Done! (continuous);"); } } } @@ -783,21 +784,22 @@ public void produceDiscreteNumericSummaryStatistics(DataFile dataFile, File gene if (dataFile.getDataTable().getDataVariables().get(i).isIntervalDiscrete() && dataFile.getDataTable().getDataVariables().get(i).isTypeNumeric()) { logger.fine("subsetting discrete-numeric vector"); - - Long[] variableVector = TabularSubsetGenerator.subsetLongVector( - new FileInputStream(generatedTabularFile), - i, - dataFile.getDataTable().getCaseQuantity().intValue(), - dataFile.getDataTable().isStoredWithVariableHeader()); - // We are discussing calculating the same summary stats for - // all numerics (the same kind of sumstats that we've been calculating - // for numeric continuous type) -- L.A. Jul. 
2014 - calculateContinuousSummaryStatistics(dataFile, i, variableVector); - // calculate the UNF while we are at it: - logger.fine("Calculating UNF on a Long vector"); - calculateUNF(dataFile, i, variableVector); - logger.fine("Done! (discrete numeric)"); - variableVector = null; + try (InputStream in = new FileInputStream(generatedTabularFile)) { + Long[] variableVector = TabularSubsetGenerator.subsetLongVector( + in, + i, + dataFile.getDataTable().getCaseQuantity().intValue(), + dataFile.getDataTable().isStoredWithVariableHeader()); + // We are discussing calculating the same summary stats for + // all numerics (the same kind of sumstats that we've been calculating + // for numeric continuous type) -- L.A. Jul. 2014 + calculateContinuousSummaryStatistics(dataFile, i, variableVector); + // calculate the UNF while we are at it: + logger.fine("Calculating UNF on a Long vector"); + calculateUNF(dataFile, i, variableVector); + logger.fine("Done! (discrete numeric)"); + variableVector = null; + } } } } @@ -822,17 +824,19 @@ public void produceCharacterSummaryStatistics(DataFile dataFile, File generatedT if (dataFile.getDataTable().getDataVariables().get(i).isTypeCharacter()) { logger.fine("subsetting character vector"); - String[] variableVector = TabularSubsetGenerator.subsetStringVector( - new FileInputStream(generatedTabularFile), - i, - dataFile.getDataTable().getCaseQuantity().intValue(), - dataFile.getDataTable().isStoredWithVariableHeader()); - //calculateCharacterSummaryStatistics(dataFile, i, variableVector); - // calculate the UNF while we are at it: - logger.fine("Calculating UNF on a String vector"); - calculateUNF(dataFile, i, variableVector); - logger.fine("Done! 
(character)"); - variableVector = null; + try (InputStream in = new FileInputStream(generatedTabularFile)) { + String[] variableVector = TabularSubsetGenerator.subsetStringVector( + in, + i, + dataFile.getDataTable().getCaseQuantity().intValue(), + dataFile.getDataTable().isStoredWithVariableHeader()); + // calculateCharacterSummaryStatistics(dataFile, i, variableVector); + // calculate the UNF while we are at it: + logger.fine("Calculating UNF on a String vector"); + calculateUNF(dataFile, i, variableVector); + logger.fine("Done! (character)"); + variableVector = null; + } } } } @@ -854,19 +858,20 @@ public static void produceFrequencies(File generatedTabularFile, List 0) { - if (isNumeric) { - variableVector = TabularSubsetGenerator.subsetFloatVector( - new FileInputStream(generatedTabularFile), - i, - caseQuantity, - skipVariableHeaderLine); - } - else { - variableVector = TabularSubsetGenerator.subsetStringVector( - new FileInputStream(generatedTabularFile), - i, - caseQuantity, - skipVariableHeaderLine); + try (InputStream in = new FileInputStream(generatedTabularFile)) { + if (isNumeric) { + variableVector = TabularSubsetGenerator.subsetFloatVector( + in, + i, + caseQuantity, + skipVariableHeaderLine); + } else { + variableVector = TabularSubsetGenerator.subsetStringVector( + in, + i, + caseQuantity, + skipVariableHeaderLine); + } } if (variableVector != null) { Hashtable freq = calculateFrequency(variableVector); @@ -1231,24 +1236,6 @@ public boolean ingestAsTabular(Long datafile_id) { return ingestSuccessful; } - private BufferedInputStream openFile(DataFile dataFile) throws IOException { - BufferedInputStream inputStream; - StorageIO storageIO = dataFile.getStorageIO(); - storageIO.open(); - if (storageIO.isLocalFile()) { - inputStream = new BufferedInputStream(storageIO.getInputStream()); - } else { - File tempFile = File.createTempFile("tempIngestSourceFile", ".tmp"); - try (ReadableByteChannel dataFileChannel = storageIO.getReadChannel(); - FileChannel 
tempIngestSourceChannel = new FileOutputStream(tempFile).getChannel();) { - tempIngestSourceChannel.transferFrom(dataFileChannel, 0, storageIO.getSize()); - } - inputStream = new BufferedInputStream(new FileInputStream(tempFile)); - logger.fine("Saved "+storageIO.getSize()+" bytes in a local temp file."); - } - return inputStream; - } - private void restoreIngestedDataFile(DataFile dataFile, TabularDataIngest tabDataIngest, long originalSize, String originalFileName, String originalContentType) { dataFile.setDataTables(null); if (tabDataIngest != null && tabDataIngest.getDataTable() != null) { @@ -1421,15 +1408,16 @@ public boolean extractMetadataFromNetcdf(String tempFileLocation, DataFile dataF logger.fine("tempFileLocation is null. Perhaps the file is alrady on disk or S3 direct upload is enabled."); File tempFile = null; File localFile; - StorageIO storageIO; + StorageIO storageIO = null; try { storageIO = dataFile.getStorageIO(); - storageIO.open(); + if (storageIO.isLocalFile()) { localFile = storageIO.getFileSystemPath().toFile(); dataFileLocation = localFile.getAbsolutePath(); logger.fine("extractMetadataFromNetcdf: file is local. Path: " + dataFileLocation); } else { + storageIO.open(); Optional allow = JvmSettings.GEO_EXTRACT_S3_DIRECT_UPLOAD.lookupOptional(Boolean.class); if (!(allow.isPresent() && allow.get())) { logger.fine("extractMetadataFromNetcdf: skipping because of config is set to not slow down S3 remote upload."); @@ -1446,6 +1434,10 @@ public boolean extractMetadataFromNetcdf(String tempFileLocation, DataFile dataF } catch (IOException ex) { logger.info("extractMetadataFromNetcdf, could not use storageIO for data file id " + dataFile.getId() + ". Exception: " + ex); return false; + } finally { + if(storageIO!= null) { + storageIO.closeInputStream(); + } } } @@ -1564,15 +1556,16 @@ private String getExistingFile(DataFile dataFile, String dataFileLocation) { // This file is already on S3 (non direct upload) or local storage. 
File tempFile = null; File localFile; - StorageIO storageIO; + StorageIO storageIO = null; try { storageIO = dataFile.getStorageIO(); - storageIO.open(); + if (storageIO.isLocalFile()) { localFile = storageIO.getFileSystemPath().toFile(); dataFileLocation = localFile.getAbsolutePath(); logger.fine("getExistingFile: file is local. Path: " + dataFileLocation); } else { + storageIO.open(); // Need to create a temporary local file: tempFile = File.createTempFile("tempFileExtractMetadataNcml", ".tmp"); try ( ReadableByteChannel targetFileChannel = (ReadableByteChannel) storageIO.getReadChannel(); FileChannel tempFileChannel = new FileOutputStream(tempFile).getChannel();) { @@ -1583,7 +1576,12 @@ private String getExistingFile(DataFile dataFile, String dataFileLocation) { } } catch (IOException ex) { logger.fine("getExistingFile: While attempting to extract NcML, could not use storageIO for data file id " + dataFile.getId() + ". Exception: " + ex); + } finally { + if(storageIO!= null) { + storageIO.closeInputStream(); + } } + return dataFileLocation; } @@ -2161,7 +2159,7 @@ private void fixMissingOriginalType(long fileId) { // swift and similar implementations, we'll read the saved aux // channel and save it as a local temp file. 
- StorageIO storageIO; + StorageIO storageIO = null; File savedOriginalFile = null; boolean tempFileRequired = false; @@ -2196,6 +2194,10 @@ private void fixMissingOriginalType(long fileId) { } catch (Exception ex) { logger.warning("Exception "+ex.getClass()+" caught trying to open StorageIO channel for the saved original; (datafile id=" + fileId + ", datatable id=" + datatableId + "): " + ex.getMessage()); savedOriginalFile = null; + } finally { + if (storageIO!= null) { + storageIO.closeInputStream(); + } } if (savedOriginalFile == null) { From 74b88138f64af658d4736d2a7a8ff75c46dc4740 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 9 Apr 2025 18:46:01 -0400 Subject: [PATCH 2/3] incremental reads --- .../dataaccess/TabularSubsetGenerator.java | 314 +++++++++--------- 1 file changed, 150 insertions(+), 164 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/TabularSubsetGenerator.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/TabularSubsetGenerator.java index a42bb35615f..4ccdfb57af6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/TabularSubsetGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/TabularSubsetGenerator.java @@ -20,6 +20,7 @@ package edu.harvard.iq.dataverse.dataaccess; +import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; @@ -27,6 +28,7 @@ import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; import java.util.Scanner; @@ -120,54 +122,55 @@ public void subsetFile(InputStream in, String outfile, List columns, Lo public static Double[] subsetDoubleVector(InputStream in, int column, int numCases, boolean skipHeader) { Double[] retVector = new Double[numCases]; - try (Scanner scanner = new Scanner(in)) { - scanner.useDelimiter("\\n"); - + try (BufferedReader reader = new BufferedReader(new 
InputStreamReader(in))) { if (skipHeader) { - skipFirstLine(scanner); + reader.readLine(); // Skip the header line } - for (int caseIndex = 0; caseIndex < numCases; caseIndex++) { - if (scanner.hasNext()) { - String[] line = (scanner.next()).split("\t", -1); - - // Verified: new Double("nan") works correctly, - // resulting in Double.NaN; - // Double("[+-]Inf") doesn't work however; - // (the constructor appears to be expecting it - // to be spelled as "Infinity", "-Infinity", etc. - if ("inf".equalsIgnoreCase(line[column]) || "+inf".equalsIgnoreCase(line[column])) { - retVector[caseIndex] = java.lang.Double.POSITIVE_INFINITY; - } else if ("-inf".equalsIgnoreCase(line[column])) { - retVector[caseIndex] = java.lang.Double.NEGATIVE_INFINITY; - } else if (line[column] == null || line[column].equals("")) { - // missing value: - retVector[caseIndex] = null; - } else { - try { - retVector[caseIndex] = new Double(line[column]); - } catch (NumberFormatException ex) { - retVector[caseIndex] = null; // missing value - } - } - + String line; + int caseIndex = 0; + while ((line = reader.readLine()) != null && caseIndex < numCases) { + String[] fields = line.split("\t", -1); + if (fields.length > column) { + retVector[caseIndex] = parseDoubleValue(fields[column]); } else { - throw new RuntimeException("Tab file has fewer rows than the stored number of cases!"); + throw new RuntimeException("Column index out of bounds"); } + caseIndex++; } - - int tailIndex = numCases; - while (scanner.hasNext()) { - String nextLine = scanner.next(); - if (!"".equals(nextLine)) { - throw new RuntimeException("Column " + column + ": tab file has more nonempty rows than the stored number of cases (" + numCases + ")! 
current index: " + tailIndex + ", line: " + nextLine); + + if (caseIndex < numCases) { + throw new RuntimeException("Tab file has fewer rows than the stored number of cases!"); + } + + // Check for extra non-empty lines + while ((line = reader.readLine()) != null) { + if (!line.trim().isEmpty()) { + throw new RuntimeException("Tab file has more nonempty rows than the stored number of cases (" + numCases + ")!"); } - tailIndex++; } - + } catch (IOException e) { + throw new RuntimeException("Error reading from input stream", e); } return retVector; - + } + + private static Double parseDoubleValue(String value) { + if (value == null || value.isEmpty()) { + return null; // missing value + } + value = value.toLowerCase(); + if ("inf".equals(value) || "+inf".equals(value)) { + return Double.POSITIVE_INFINITY; + } else if ("-inf".equals(value)) { + return Double.NEGATIVE_INFINITY; + } else { + try { + return Double.parseDouble(value); + } catch (NumberFormatException ex) { + return null; // missing value + } + } } /* @@ -176,52 +179,55 @@ public static Double[] subsetDoubleVector(InputStream in, int column, int numCas */ public static Float[] subsetFloatVector(InputStream in, int column, int numCases, boolean skipHeader) { Float[] retVector = new Float[numCases]; - try (Scanner scanner = new Scanner(in)) { - scanner.useDelimiter("\\n"); - + try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) { if (skipHeader) { - skipFirstLine(scanner); + reader.readLine(); // Skip the header line } - for (int caseIndex = 0; caseIndex < numCases; caseIndex++) { - if (scanner.hasNext()) { - String[] line = (scanner.next()).split("\t", -1); - // Verified: new Float("nan") works correctly, - // resulting in Float.NaN; - // Float("[+-]Inf") doesn't work however; - // (the constructor appears to be expecting it - // to be spelled as "Infinity", "-Infinity", etc. 
- if ("inf".equalsIgnoreCase(line[column]) || "+inf".equalsIgnoreCase(line[column])) { - retVector[caseIndex] = java.lang.Float.POSITIVE_INFINITY; - } else if ("-inf".equalsIgnoreCase(line[column])) { - retVector[caseIndex] = java.lang.Float.NEGATIVE_INFINITY; - } else if (line[column] == null || line[column].equals("")) { - // missing value: - retVector[caseIndex] = null; - } else { - try { - retVector[caseIndex] = new Float(line[column]); - } catch (NumberFormatException ex) { - retVector[caseIndex] = null; // missing value - } - } + String line; + int caseIndex = 0; + while ((line = reader.readLine()) != null && caseIndex < numCases) { + String[] fields = line.split("\t", -1); + if (fields.length > column) { + retVector[caseIndex] = parseFloatValue(fields[column]); } else { - throw new RuntimeException("Tab file has fewer rows than the stored number of cases!"); + throw new RuntimeException("Column index out of bounds"); } + caseIndex++; } - - int tailIndex = numCases; - while (scanner.hasNext()) { - String nextLine = scanner.next(); - if (!"".equals(nextLine)) { - throw new RuntimeException("Column "+column+": tab file has more nonempty rows than the stored number of cases ("+numCases+")! 
current index: "+tailIndex+", line: "+nextLine); + + if (caseIndex < numCases) { + throw new RuntimeException("Tab file has fewer rows than the stored number of cases!"); + } + + // Check for extra non-empty lines + while ((line = reader.readLine()) != null) { + if (!line.trim().isEmpty()) { + throw new RuntimeException("Tab file has more nonempty rows than the stored number of cases (" + numCases + ")!"); } - tailIndex++; } - + } catch (IOException e) { + throw new RuntimeException("Error reading from input stream", e); } return retVector; - + } + + private static Float parseFloatValue(String value) { + if (value == null || value.isEmpty()) { + return null; // missing value + } + value = value.toLowerCase(); + if ("inf".equals(value) || "+inf".equals(value)) { + return Float.POSITIVE_INFINITY; + } else if ("-inf".equals(value)) { + return Float.NEGATIVE_INFINITY; + } else { + try { + return Float.parseFloat(value); + } catch (NumberFormatException ex) { + return null; // missing value + } + } } /* @@ -230,38 +236,48 @@ public static Float[] subsetFloatVector(InputStream in, int column, int numCases */ public static Long[] subsetLongVector(InputStream in, int column, int numCases, boolean skipHeader) { Long[] retVector = new Long[numCases]; - try (Scanner scanner = new Scanner(in)) { - scanner.useDelimiter("\\n"); - + try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) { if (skipHeader) { - skipFirstLine(scanner); + reader.readLine(); // Skip the header line } - for (int caseIndex = 0; caseIndex < numCases; caseIndex++) { - if (scanner.hasNext()) { - String[] line = (scanner.next()).split("\t", -1); - try { - retVector[caseIndex] = new Long(line[column]); - } catch (NumberFormatException ex) { - retVector[caseIndex] = null; // assume missing value - } + String line; + int caseIndex = 0; + while ((line = reader.readLine()) != null && caseIndex < numCases) { + String[] fields = line.split("\t", -1); + if (fields.length > column) { + 
retVector[caseIndex] = parseLongValue(fields[column]); } else { - throw new RuntimeException("Tab file has fewer rows than the stored number of cases!"); + throw new RuntimeException("Column index out of bounds"); } + caseIndex++; } - - int tailIndex = numCases; - while (scanner.hasNext()) { - String nextLine = scanner.next(); - if (!"".equals(nextLine)) { - throw new RuntimeException("Column "+column+": tab file has more nonempty rows than the stored number of cases ("+numCases+")! current index: "+tailIndex+", line: "+nextLine); + + if (caseIndex < numCases) { + throw new RuntimeException("Tab file has fewer rows than the stored number of cases!"); + } + + // Check for extra non-empty lines + while ((line = reader.readLine()) != null) { + if (!line.trim().isEmpty()) { + throw new RuntimeException("Tab file has more nonempty rows than the stored number of cases (" + numCases + ")!"); } - tailIndex++; } - + } catch (IOException e) { + throw new RuntimeException("Error reading from input stream", e); } return retVector; - + } + + private static Long parseLongValue(String value) { + if (value == null || value.isEmpty()) { + return null; // missing value + } + try { + return Long.parseLong(value); + } catch (NumberFormatException ex) { + return null; // missing value + } } /* @@ -270,84 +286,54 @@ public static Long[] subsetLongVector(InputStream in, int column, int numCases, */ public static String[] subsetStringVector(InputStream in, int column, int numCases, boolean skipHeader) { String[] retVector = new String[numCases]; - try (Scanner scanner = new Scanner(in)) { - scanner.useDelimiter("\\n"); - + try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) { if (skipHeader) { - skipFirstLine(scanner); + reader.readLine(); // Skip the header line } - for (int caseIndex = 0; caseIndex < numCases; caseIndex++) { - if (scanner.hasNext()) { - String[] line = (scanner.next()).split("\t", -1); - retVector[caseIndex] = line[column]; - - if 
("".equals(line[column])) { - // An empty string is a string missing value! - // An empty string in quotes is an empty string! - retVector[caseIndex] = null; - } else { - // Strip the outer quotes: - line[column] = line[column].replaceFirst("^\\\"", ""); - line[column] = line[column].replaceFirst("\\\"$", ""); - - // We need to restore the special characters that - // are stored in tab files escaped - quotes, new lines - // and tabs. Before we do that however, we need to - // take care of any escaped backslashes stored in - // the tab file. I.e., "foo\t" should be transformed - // to "foo"; but "foo\\t" should be transformed - // to "foo\t". This way new lines and tabs that were - // already escaped in the original data are not - // going to be transformed to unescaped tab and - // new line characters! - String[] splitTokens = line[column].split(Matcher.quoteReplacement("\\\\"), -2); - - // (note that it's important to use the 2-argument version - // of String.split(), and set the limit argument to a - // negative value; otherwise any trailing backslashes - // are lost.) - for (int i = 0; i < splitTokens.length; i++) { - splitTokens[i] = splitTokens[i].replaceAll(Matcher.quoteReplacement("\\\""), "\""); - splitTokens[i] = splitTokens[i].replaceAll(Matcher.quoteReplacement("\\t"), "\t"); - splitTokens[i] = splitTokens[i].replaceAll(Matcher.quoteReplacement("\\n"), "\n"); - splitTokens[i] = splitTokens[i].replaceAll(Matcher.quoteReplacement("\\r"), "\r"); - } - // TODO: - // Make (some of?) the above optional; for ex., we - // do need to restore the newlines when calculating UNFs; - // But if we are subsetting these vectors in order to - // create a new tab-delimited file, they will - // actually break things! -- L.A. Jul. 
28 2014 - - line[column] = StringUtils.join(splitTokens, '\\'); - - retVector[caseIndex] = line[column]; - } - + String line; + int caseIndex = 0; + while ((line = reader.readLine()) != null && caseIndex < numCases) { + String[] fields = line.split("\t", -1); + if (fields.length > column) { + retVector[caseIndex] = parseStringValue(fields[column]); } else { - throw new RuntimeException("Tab file has fewer rows than the stored number of cases!"); + throw new RuntimeException("Column index out of bounds"); } + caseIndex++; } - - int tailIndex = numCases; - while (scanner.hasNext()) { - String nextLine = scanner.next(); - if (!"".equals(nextLine)) { - throw new RuntimeException("Column "+column+": tab file has more nonempty rows than the stored number of cases ("+numCases+")! current index: "+tailIndex+", line: "+nextLine); + + if (caseIndex < numCases) { + throw new RuntimeException("Tab file has fewer rows than the stored number of cases!"); + } + + // Check for extra non-empty lines + while ((line = reader.readLine()) != null) { + if (!line.trim().isEmpty()) { + throw new RuntimeException("Tab file has more nonempty rows than the stored number of cases (" + numCases + ")!"); } - tailIndex++; } - + } catch (IOException e) { + throw new RuntimeException("Error reading from input stream", e); } return retVector; - } - - private static void skipFirstLine(Scanner scanner) { - if (!scanner.hasNext()) { - throw new RuntimeException("Failed to read the variable name header line from the tab-delimited file!"); + + private static String parseStringValue(String value) { + if (value.isEmpty() || "".equals(value)) { + return null; // An empty string is a string missing value + } + // Strip the outer quotes: + value = value.replaceFirst("^\\\"", "").replaceFirst("\\\"$", ""); + + // Unescape special characters + String[] splitTokens = value.split(Matcher.quoteReplacement("\\\\"), -2); + for (int i = 0; i < splitTokens.length; i++) { + splitTokens[i] = 
splitTokens[i].replaceAll(Matcher.quoteReplacement("\\\""), "\"")
+                    .replaceAll(Matcher.quoteReplacement("\\t"), "\t")
+                    .replaceAll(Matcher.quoteReplacement("\\n"), "\n")
+                    .replaceAll(Matcher.quoteReplacement("\\r"), "\r");
         }
-        scanner.next();
-    }
+        return StringUtils.join(splitTokens, '\\');
+    }
 }
\ No newline at end of file

From 248be67a263bd7ba4daa9b4148b77c6b5b17a474 Mon Sep 17 00:00:00 2001
From: Jim Myers <qqmyers@hotmail.com>
Date: Thu, 17 Apr 2025 18:38:13 -0400
Subject: [PATCH 3/3] release note

---
 doc/release-notes/11417-export-improvements.md | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 doc/release-notes/11417-export-improvements.md

diff --git a/doc/release-notes/11417-export-improvements.md b/doc/release-notes/11417-export-improvements.md
new file mode 100644
index 00000000000..c020a5d6848
--- /dev/null
+++ b/doc/release-notes/11417-export-improvements.md
@@ -0,0 +1,3 @@
+### Export Improvements
+
+Memory usage has been reduced and potential memory leaks have been closed in the metadata exporters.
\ No newline at end of file