Skip to content

Commit

Permalink
Issue 43009: Backport dataiterator fixes (#2356)
Browse files Browse the repository at this point in the history
* Backport
#2263
#2268
#2271
* Revert ETL fix
#2222
  • Loading branch information
labkey-martyp committed Jul 5, 2021
1 parent da2b512 commit ab49bf5
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 103 deletions.
147 changes: 70 additions & 77 deletions api/src/org/labkey/api/dataiterator/DataIteratorUtil.java
Expand Up @@ -21,7 +21,6 @@
import org.apache.commons.io.IOUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.junit.Assert;
import org.junit.Test;
Expand Down Expand Up @@ -53,6 +52,7 @@
import java.util.Map;
import java.util.Spliterator;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

Expand Down Expand Up @@ -129,74 +129,89 @@ public static Map<String,ColumnInfo> createTableMap(TableInfo target, boolean us
return targetAliasesMap;
}

enum MatchType {propertyuri, name, alias, jdbcname, tsvColumn}

// rank of a match of import column NAME matching various properties of target column
// MatchType.low is used for matches based on something other than name
enum MatchType {propertyuri, name, alias, jdbcname, tsvColumn, low}


/**
* NOTE: matchColumns() handles multiple source columns matching the same target column (usually a data file problem
* for the user to fix), we don't handle one source column matching multiple target columns (more of an admin design problem).
* One idea would be to return MultiValuedMap<String,Pair<>>, or check for duplicates entries of the same MatchType.
*/
protected static Map<String,Pair<ColumnInfo,MatchType>> _createTableMap(TableInfo target, boolean useImportAliases)
{
List<ColumnInfo> cols = target.getColumns();
/* CONSIDER: move this functionality into a TableInfo method so this map (or maps) can be cached */
List<ColumnInfo> cols = target.getColumns().stream()
.filter(col -> !col.isMvIndicatorColumn() && !col.isRawValueColumn())
.collect(Collectors.toList());

Map<String, Pair<ColumnInfo,MatchType>> targetAliasesMap = new CaseInsensitiveHashMap<>(cols.size()*4);

// should this be under the useImportAliases flag???
for (ColumnInfo col : cols)
{
if (col.isMvIndicatorColumn() || col.isRawValueColumn())
continue;
final String name = col.getName();
targetAliasesMap.put(name, new Pair<>(col,MatchType.name));
String uri = col.getPropertyURI();
if (null != uri)
{
if (!targetAliasesMap.containsKey(uri))
targetAliasesMap.put(uri, new Pair<>(col, MatchType.propertyuri));
String propName = uri.substring(uri.lastIndexOf('#')+1);
if (!targetAliasesMap.containsKey(propName))
targetAliasesMap.put(propName, new Pair<>(col, MatchType.alias));
}
String label = col.getLabel();
if (null != label && !targetAliasesMap.containsKey(label))
targetAliasesMap.put(label, new Pair<>(col, MatchType.alias));
String translatedFieldKey;
if (useImportAliases || "folder".equalsIgnoreCase(name))
// Issue 21015: Dataset snapshot over flow assay dataset doesn't pick up stat column values
// TSVColumnWriter.ColumnHeaderType.queryColumnName format is a FieldKey display value from the column name. Blech.
String tsvQueryColumnName = FieldKey.fromString(col.getName()).toDisplayString();
targetAliasesMap.put(tsvQueryColumnName, new Pair<>(col, MatchType.tsvColumn));
}

// should this be under the useImportAliases flag???
for (ColumnInfo col : cols)
{
// Jdbc resultset names have substitutions for special characters. If this is such a column, need the substituted name to match on
targetAliasesMap.put(col.getJdbcRsName(), new Pair<>(col, MatchType.jdbcname));
}

for (ColumnInfo col : cols)
{
if (useImportAliases || "folder".equalsIgnoreCase(col.getName()))
{
for (String alias : col.getImportAliasSet())
{
if (!targetAliasesMap.containsKey(alias))
targetAliasesMap.put(alias, new Pair<>(col, MatchType.alias));
}
targetAliasesMap.put(alias, new Pair<>(col, MatchType.alias));

// Be sure we have an alias the column name we generate for TSV exports. See issue 21774
translatedFieldKey = FieldKey.fromString(name).toDisplayString();
if (!targetAliasesMap.containsKey(translatedFieldKey))
{
targetAliasesMap.put(translatedFieldKey, new Pair<>(col, MatchType.alias));
}
String translatedFieldKey = FieldKey.fromString(col.getName()).toDisplayString();
targetAliasesMap.put(translatedFieldKey, new Pair<>(col, MatchType.alias));
}
// Jdbc resultset names have substitutions for special characters. If this is such a column, need the substituted name to match on
translatedFieldKey = col.getJdbcRsName();
if (!name.equals(translatedFieldKey))
}

for (ColumnInfo col : cols)
{
String label = col.getLabel();
if (null != label)
targetAliasesMap.put(label, new Pair<>(col, MatchType.alias));
}

for (ColumnInfo col : cols)
{
String uri = col.getPropertyURI();
if (null != uri)
{
targetAliasesMap.put(translatedFieldKey, new Pair<>(col, MatchType.jdbcname));
targetAliasesMap.put(uri, new Pair<>(col, MatchType.propertyuri));
String propName = uri.substring(uri.lastIndexOf('#')+1);
targetAliasesMap.put(propName, new Pair<>(col, MatchType.alias));
}
}

// Issue 21015: Dataset snapshot over flow assay dataset doesn't pick up stat column values
// TSVColumnWriter.ColumnHeaderType.queryColumnName format is a FieldKey display value from the column name. Blech.
String tsvQueryColumnName = FieldKey.fromString(name).toDisplayString();
if (!targetAliasesMap.containsKey(tsvQueryColumnName))
targetAliasesMap.put(tsvQueryColumnName, new Pair<>(col, MatchType.tsvColumn));
for (ColumnInfo col : cols)
{
String name = col.getName();
targetAliasesMap.put(name, new Pair<>(col,MatchType.name));
}
return targetAliasesMap;
}

public static boolean isEtl(@NotNull DataIteratorContext context)
{
return context.getDataSource() != null && context.getDataSource().equals(ETL_DATA_SOURCE);
return targetAliasesMap;
}

/* NOTE doesn't check column mapping collisions */
protected static ArrayList<Pair<ColumnInfo,MatchType>> _matchColumns(DataIterator input, TableInfo target, boolean useImportAliases, boolean isEtl, Container container)
protected static ArrayList<Pair<ColumnInfo,MatchType>> _matchColumns(DataIterator input, TableInfo target, boolean useImportAliases, Container container)
{
Map<String,Pair<ColumnInfo,MatchType>> targetMap = _createTableMap(target, useImportAliases);
ArrayList<Pair<ColumnInfo,MatchType>> matches = new ArrayList<>(input.getColumnCount()+1);
matches.add(null);

// match columns to target columninfos (duplicates StandardDataIteratorBuilder, extract shared method?)
for (int i=1 ; i<=input.getColumnCount() ; i++)
{
ColumnInfo from = input.getColumnInfo(i);
Expand All @@ -205,34 +220,15 @@ protected static ArrayList<Pair<ColumnInfo,MatchType>> _matchColumns(DataIterato
matches.add(null);
continue;
}

Pair<ColumnInfo,MatchType> to = null;
if (isEtl)
{
// Match by name first
to = targetMap.get(from.getName());

// If name matches, check if property URI matches for higher priority match type
if (null != to && null != from.getPropertyURI())
{
// Renamed built-in columns (ex. LSID, ParticipantId) in source queries will not match propertyURI with
// target. In that case, just stick with name match. Otherwise check for propertyURI match for higher priority match
// (this is primarily for ParticipantId which can have two columns with same name in the dataiterator)
if (from.getPropertyURI().equals(to.first.getPropertyURI())) {
Pair<ColumnInfo,MatchType> toUri = targetMap.get(from.getPropertyURI());
if (null != toUri)
to = toUri;
}
}
}
else
Pair<ColumnInfo,MatchType> to = targetMap.get(from.getName());
if (null == to && null != from.getPropertyURI())
{
if (null != from.getPropertyURI())
to = targetMap.get(from.getPropertyURI());
if (null == to)
to = targetMap.get(from.getName());
// Do we actually rely on this anywhere???
// Like maybe ETL from one study to another where subject name does not match? or assay publish?
to = targetMap.get(from.getPropertyURI());
if (null != to)
to = new Pair<>(to.first, org.labkey.api.dataiterator.DataIteratorUtil.MatchType.low);
}

if (null == to)
{
// Check to see if the column i.e. propURI has a property descriptor and vocabulary domain is present
Expand All @@ -253,9 +249,9 @@ protected static ArrayList<Pair<ColumnInfo,MatchType>> _matchColumns(DataIterato


/** throws ValidationException only if there are unresolvable ambiguity in the source->destination column mapping */
public static ArrayList<ColumnInfo> matchColumns(DataIterator input, TableInfo target, boolean useImportAliases, boolean isEtl, ValidationException setupError, @Nullable Container container)
public static ArrayList<ColumnInfo> matchColumns(DataIterator input, TableInfo target, boolean useImportAliases, ValidationException setupError, @Nullable Container container)
{
ArrayList<Pair<ColumnInfo,MatchType>> matches = _matchColumns(input, target, useImportAliases, isEtl, container);
ArrayList<Pair<ColumnInfo,MatchType>> matches = _matchColumns(input, target, useImportAliases, container);
MultiValuedMap<FieldKey,Integer> duplicatesMap = new ArrayListValuedHashMap<>(input.getColumnCount()+1);

for (int i=1 ; i<= input.getColumnCount() ; i++)
Expand Down Expand Up @@ -302,13 +298,10 @@ public static ArrayList<ColumnInfo> matchColumns(DataIterator input, TableInfo t
}




// NOTE: first consider if using QueryUpdateService is better
// this is just a point-to-point copy _without_ triggers
public static int copy(File from, TableInfo to, Container c, User user) throws IOException, BatchValidationException
{

BatchValidationException errors = new BatchValidationException();
DataIteratorContext context = new DataIteratorContext(errors);
DataLoader loader = DataLoaderService.get().createLoader(from, null, true, c, TabLoader.TSV_FILE_TYPE);
Expand Down
Expand Up @@ -39,6 +39,7 @@ public abstract class ExistingRecordDataIterator extends WrapperDataIterator
{
public static final String EXISTING_RECORD_COLUMN_NAME = "_" + ExistingRecordDataIterator.class.getName() + "#EXISTING_RECORD_COLUMN_NAME";

final CachingDataIterator _unwrapped;
final TableInfo target;
final ArrayList<ColumnInfo> pkColumns = new ArrayList<>();
final ArrayList<Supplier<Object>> pkSuppliers = new ArrayList<>();
Expand All @@ -52,6 +53,10 @@ public abstract class ExistingRecordDataIterator extends WrapperDataIterator
ExistingRecordDataIterator(DataIterator in, TableInfo target, @Nullable Set<String> keys, boolean useMark)
{
super(in);

// NOTE it might get wrapped with a LoggingDataIterator, so remember the original DataIterator
this._unwrapped = useMark ? (CachingDataIterator)in : null;

this.target = target;
this.existingColIndex = in.getColumnCount()+1;
this.useMark = useMark;
Expand Down Expand Up @@ -130,7 +135,7 @@ public boolean next() throws BatchValidationException
{
// NOTE: we have to call mark() before we call next() if we want the 'next' row to be cached
if (useMark)
((CachingDataIterator)_delegate).mark();
_unwrapped.mark(); // unwrapped _delegate
boolean ret = super.next();
if (ret && !pkColumns.isEmpty())
prefetchExisting();
Expand Down Expand Up @@ -251,7 +256,7 @@ protected void prefetchExisting() throws BatchValidationException
existingRecords.put(r,(Map<String,Object>)map);
});
// backup to where we started so caller can iterate through them one at a time
((CachingDataIterator)_delegate).reset();
_unwrapped.reset(); // unwrapped _delegate
_delegate.next();
}
}
Expand Down
13 changes: 11 additions & 2 deletions api/src/org/labkey/api/dataiterator/MapDataIterator.java
Expand Up @@ -21,10 +21,12 @@
import org.labkey.api.data.ColumnInfo;
import org.labkey.api.query.BatchValidationException;

import javax.annotation.Nullable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Map;
import java.util.Set;

/**
* User: matthewb
Expand All @@ -36,7 +38,7 @@ public interface MapDataIterator extends DataIterator
boolean supportsGetMap();
Map<String,Object> getMap();

static class MapDataIteratorImpl implements MapDataIterator, ScrollableDataIterator
class MapDataIteratorImpl implements MapDataIterator, ScrollableDataIterator
{
DataIterator _input;
boolean _mutable;
Expand All @@ -45,6 +47,11 @@ static class MapDataIteratorImpl implements MapDataIterator, ScrollableDataItera
ArrayListMap<String,Object> _currentMap;

MapDataIteratorImpl(DataIterator in, boolean mutable)
{
this(in, mutable, Set.of());
}

public MapDataIteratorImpl(DataIterator in, boolean mutable, Set<String> skip)
{
_input = in;
_mutable = mutable;
Expand All @@ -53,6 +60,8 @@ static class MapDataIteratorImpl implements MapDataIterator, ScrollableDataItera
for (int i=0 ; i<=in.getColumnCount() ; i++)
{
String name = in.getColumnInfo(i).getName();
if (skip.contains(name))
continue;
if (_findMap.containsKey(name))
{
LogManager.getLogger(MapDataIterator.class).warn("Map already has column named '" + name + "'");
Expand Down Expand Up @@ -166,4 +175,4 @@ public void debugLogInfo(StringBuilder sb)
_input.debugLogInfo(sb);
}
}
}
}
Expand Up @@ -195,8 +195,7 @@ public DataIterator getDataIterator(DataIteratorContext context)
// match up the columns, validate that there is no more than one source column that matches the target column
//
ValidationException setupError = new ValidationException();
ArrayList<ColumnInfo> matches = DataIteratorUtil.matchColumns(input, _target, _useImportAliases,
DataIteratorUtil.isEtl(context), setupError, _c);
ArrayList<ColumnInfo> matches = DataIteratorUtil.matchColumns(input, _target, _useImportAliases, setupError, _c);

ArrayList<TranslateHelper> convertTargetCols = new ArrayList<>(input.getColumnCount()+1);
for (int i=1 ; i<=input.getColumnCount() ; i++)
Expand Down

0 comments on commit ab49bf5

Please sign in to comment.