
Exclude spark from core jar #281

Merged · 14 commits · Apr 19, 2023
Changes from 5 commits
.github/workflows/release_jar.yml (2 changes: 1 addition & 1 deletion)
@@ -46,7 +46,7 @@ jobs:
docker create -it --name genomicsdb ghcr.io/genomicsdb/genomicsdb:release bash
docker cp genomicsdb:/build/GenomicsDB/build/src/main/libtiledbgenomicsdb.so .
docker cp genomicsdb:/build/GenomicsDB/build/target/genomicsdb-${VERSION_NUMBER}.jar .
- docker cp genomicsdb:/build/GenomicsDB/build/target/genomicsdb-${VERSION_NUMBER}-allinone.jar .
+ docker cp genomicsdb:/build/GenomicsDB/build/target/genomicsdb-${VERSION_NUMBER}-allinone-spark.jar .
docker cp genomicsdb:/build/GenomicsDB/build/target/genomicsdb-${VERSION_NUMBER}-sources.jar .
docker cp genomicsdb:/build/GenomicsDB/build/target/genomicsdb-${VERSION_NUMBER}-javadoc.jar .
docker cp genomicsdb:/build/GenomicsDB/pom.xml genomicsdb-${VERSION_NUMBER}.pom
.github/workflows/release_publish.yml (8 changes: 4 additions & 4 deletions)
@@ -58,17 +58,17 @@ jobs:
-Dpackaging=jar -DpomFile=genomicsdb-${VERSION_NUMBER}.pom -DrepositoryId=$REPO_ID \
-Djavadoc=genomicsdb-${VERSION_NUMBER}-javadoc.jar \
-Dsources=genomicsdb-${VERSION_NUMBER}-sources.jar \
- -Dfiles=genomicsdb-${VERSION_NUMBER}-allinone.jar \
+ -Dfiles=genomicsdb-${VERSION_NUMBER}-allinone-spark.jar \
-Dtypes=jar \
- -Dclassifiers=allinone
+ -Dclassifiers=allinone-spark
mvn gpg:sign-and-deploy-file -Durl=$URL -Dfile=genomicsdb-${VERSION_NUMBER}.jar \
-DgroupId=org.genomicsdb -DartifactId=genomicsdb -Dversion=${VERSION_NUMBER} \
-Dpackaging=jar -DpomFile=genomicsdb-${VERSION_NUMBER}.pom -DrepositoryId=$REPO_ID \
-Djavadoc=genomicsdb-${VERSION_NUMBER}-javadoc.jar \
-Dsources=genomicsdb-${VERSION_NUMBER}-sources.jar \
- -Dfiles=genomicsdb-${VERSION_NUMBER}-allinone.jar \
+ -Dfiles=genomicsdb-${VERSION_NUMBER}-allinone-spark.jar \
-Dtypes=jar \
- -Dclassifiers=allinone
+ -Dclassifiers=allinone-spark
if [[ ${VERSION_NUMBER} != *SNAPSHOT ]]; then
mvn nexus-staging:rc-list -DnexusUrl=https://oss.sonatype.org/ -DserverId=ossrh -f genomicsdb-${VERSION_NUMBER}.pom
stagingRepoId=$(mvn nexus-staging:rc-list -DnexusUrl=https://oss.sonatype.org/ -DserverId=ossrh | grep orggenomicsdb|cut -f2 -d' ')
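
Once a release is deployed with the new classifier, downstream users pull the Spark-enabled fat jar explicitly. A minimal sketch, assuming a hypothetical released version (substitute a real GenomicsDB version):

    # Resolve the shaded Spark jar by groupId:artifactId:version:packaging:classifier
    mvn dependency:get -Dartifact=org.genomicsdb:genomicsdb:1.4.4:jar:allinone-spark
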
CMakeLists.txt (6 changes: 3 additions & 3 deletions)
@@ -548,14 +548,14 @@ if(BUILD_JAVA)

#Maven build - depends on dynamic library
add_custom_command(
- OUTPUT ${GENOMICSDB_MAVEN_BUILD_DIR}/genomicsdb-${GENOMICSDB_RELEASE_VERSION}.jar ${GENOMICSDB_MAVEN_BUILD_DIR}/genomicsdb-${GENOMICSDB_RELEASE_VERSION}-allinone.jar
+ OUTPUT ${GENOMICSDB_MAVEN_BUILD_DIR}/genomicsdb-${GENOMICSDB_RELEASE_VERSION}.jar ${GENOMICSDB_MAVEN_BUILD_DIR}/genomicsdb-${GENOMICSDB_RELEASE_VERSION}-allinone-spark.jar
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/pom.xml ${CMAKE_BINARY_DIR}/pom.xml
COMMAND mvn versions:set ${MAVEN_QUIET_ARGS} -DnewVersion=${GENOMICSDB_RELEASE_VERSION} ${MAVEN_PROFILE}
COMMAND mvn package -DskipTests ${MAVEN_ARGS}
DEPENDS tiledbgenomicsdb ${JAVA_SCALA_SOURCES} pom.xml
WORKING_DIRECTORY ${CMAKE_BINARY_DIR})

- install(FILES ${GENOMICSDB_MAVEN_BUILD_DIR}/genomicsdb-${GENOMICSDB_RELEASE_VERSION}.jar ${GENOMICSDB_MAVEN_BUILD_DIR}/genomicsdb-${GENOMICSDB_RELEASE_VERSION}-allinone.jar DESTINATION bin)
+ install(FILES ${GENOMICSDB_MAVEN_BUILD_DIR}/genomicsdb-${GENOMICSDB_RELEASE_VERSION}.jar ${GENOMICSDB_MAVEN_BUILD_DIR}/genomicsdb-${GENOMICSDB_RELEASE_VERSION}-allinone-spark.jar DESTINATION bin)

execute_process(
COMMAND ln -sf ${CMAKE_SOURCE_DIR}/tests ${CMAKE_BINARY_DIR})
@@ -575,7 +575,7 @@ if(BUILD_JAVA)
add_jar(genomicsdb-${GENOMICSDB_RELEASE_VERSION}-examples
SOURCES ${GENOMICSDB_EXAMPLE_SOURCES}
log4j.properties
- INCLUDE_JARS ${GENOMICSDB_MAVEN_BUILD_DIR}/genomicsdb-${GENOMICSDB_RELEASE_VERSION}-allinone.jar
+ INCLUDE_JARS ${GENOMICSDB_MAVEN_BUILD_DIR}/genomicsdb-${GENOMICSDB_RELEASE_VERSION}-allinone-spark.jar
OUTPUT_DIR ${GENOMICSDB_MAVEN_BUILD_DIR})

#Deploy to Maven central
pom.xml (82 changes: 58 additions & 24 deletions)
@@ -89,11 +89,13 @@
<groupId>org.apache.spark</groupId>
<artifactId>${spark.core.artifactid}</artifactId>
<version>${spark.version}</version>
+ <optional>true</optional>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>${spark.sql.artifactid}</artifactId>
<version>${spark.version}</version>
+ <optional>true</optional>
<exclusions>
<exclusion>
<groupId>com.google.protobuf</groupId>
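
With both Spark artifacts marked <optional>true</optional>, they stay on GenomicsDB's own compile classpath but no longer propagate transitively, so a consumer of org.genomicsdb:genomicsdb that uses the Spark bindings must now declare spark-core/spark-sql itself. A quick check from a downstream project (a sketch; the include pattern is the only assumption):

    # Run in a project that depends on org.genomicsdb:genomicsdb.
    # With the Spark dependencies optional, this should print no matches:
    mvn dependency:tree -Dincludes=org.apache.spark
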
@@ -193,29 +195,6 @@
</testExcludes>
</configuration>
</plugin>
- <plugin>
-   <artifactId>maven-antrun-plugin</artifactId>
-   <version>3.1.0</version>
-   <executions>
-     <execution>
-       <id>unzip-test-artifacts</id>
-       <phase>process-test-resources</phase>
-       <configuration>
-         <target>
-           <mkdir dir="${genomicsdb_build_directory}/test" />
-           <copy file="${test_source_directory}/../inputs/test.tgz" tofile="${genomicsdb_build_directory}/test/test.tar.gz" />
-           <gunzip src="${genomicsdb_build_directory}/test/test.tar.gz" />
-           <untar src="${genomicsdb_build_directory}/test/test.tar" dest="${genomicsdb_build_directory}/test/"/>
-           <replace file="${genomicsdb_build_directory}/test/inputs/query.json" token="inputs/" value="${genomicsdb_build_directory}/test/inputs/" />
-           <replace file="${genomicsdb_build_directory}/test/inputs/loader.json" token="inputs/" value="${genomicsdb_build_directory}/test/inputs/" />
-         </target>
-       </configuration>
-       <goals>
-         <goal>run</goal>
-       </goals>
-     </execution>
-   </executions>
- </plugin>

<!-- Jacoco adapted from
https://www.petrikainulainen.net/programming/maven/creating-code-coverage-reports-for-unit-and-integration-tests-with-the-jacoco-maven-plugin/ -->
@@ -296,6 +275,25 @@
</execution>
</executions>
</plugin>
+ <plugin>
+   <groupId>org.apache.maven.plugins</groupId>
+   <artifactId>maven-jar-plugin</artifactId>
+   <version>3.3.0</version>
+   <executions>
+     <execution>
+       <id>spark-excluded</id>
+       <goals>
+         <goal>jar</goal>
+       </goals>
+       <configuration>
+         <classifier>spark</classifier>
+         <excludes>
+           <exclude>org/genomicsdb/spark/**</exclude>
+         </excludes>
+       </configuration>
+     </execution>
+   </executions>
+ </plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
@@ -308,7 +306,7 @@
</goals>
<configuration>
<shadedArtifactAttached>true</shadedArtifactAttached>
- <shadedClassifierName>allinone</shadedClassifierName>
+ <shadedClassifierName>allinone-spark</shadedClassifierName>
<filters>
<filter>
<artifact>*:*</artifact>
@@ -330,6 +328,42 @@
</execution>
</executions>
</plugin>
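
The shaded artifact, now classified allinone-spark, remains the self-contained jar for Spark jobs; it matches the spark-submit invocations updated below. A usage sketch, using the same <VERSION> placeholder as those examples:

    spark-submit --class org.genomicsdb.spark.api.GenomicsDBSparkBindings \
        genomicsdb-<VERSION>-allinone-spark.jar loader.json query.json
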
+ <plugin>
+   <artifactId>maven-antrun-plugin</artifactId>
+   <version>3.1.0</version>
+   <executions>
+     <execution>
+       <id>unzip-test-artifacts</id>
+       <phase>process-test-resources</phase>
+       <configuration>
+         <target>
+           <mkdir dir="${genomicsdb_build_directory}/test" />
+           <copy file="${test_source_directory}/../inputs/test.tgz" tofile="${genomicsdb_build_directory}/test/test.tar.gz" />
+           <gunzip src="${genomicsdb_build_directory}/test/test.tar.gz" />
+           <untar src="${genomicsdb_build_directory}/test/test.tar" dest="${genomicsdb_build_directory}/test/"/>
+           <replace file="${genomicsdb_build_directory}/test/inputs/query.json" token="inputs/" value="${genomicsdb_build_directory}/test/inputs/" />
+           <replace file="${genomicsdb_build_directory}/test/inputs/loader.json" token="inputs/" value="${genomicsdb_build_directory}/test/inputs/" />
+         </target>
+       </configuration>
+       <goals>
+         <goal>run</goal>
+       </goals>
+     </execution>
+     <execution>
+       <id>replace-core-jar</id>
+       <phase>package</phase>
+       <configuration>
+         <target>
+           <move file="${project.build.directory}/genomicsdb-${genomicsdb.version}-spark.jar"
+                 tofile="${project.build.directory}/genomicsdb-${genomicsdb.version}.jar" />
+         </target>
+       </configuration>
+       <goals>
+         <goal>run</goal>
+       </goals>
+     </execution>
+   </executions>
+ </plugin>
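
Net effect of the jar-plugin and antrun executions above: after mvn package, the unclassified genomicsdb jar is the Spark-free build (the spark-classified jar is renamed over it by replace-core-jar), while the Spark bindings ship only in the shaded allinone-spark jar. A verification sketch, where <VERSION> is a placeholder:

    # Expect no matches: Spark bindings are excluded from the core jar.
    jar tf target/genomicsdb-<VERSION>.jar | grep '^org/genomicsdb/spark/'

    # Expect matches: the shaded jar still bundles the Spark bindings.
    jar tf target/genomicsdb-<VERSION>-allinone-spark.jar | grep '^org/genomicsdb/spark/'
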
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
@@ -42,19 +42,19 @@

/**
* Example Invocation
- * spark-submit --class org.genomicsdb.spark.api.GenomicsDBSparkBindings genomicsdb-1.3.1-SNAPSHOT-allinone.jar loader.json querypb.json true
+ * spark-submit --class org.genomicsdb.spark.api.GenomicsDBSparkBindings genomicsdb-1.3.1-SNAPSHOT-allinone-spark.jar loader.json querypb.json true
* querypb.json should be parseable by GenomicsDBExportConfiguration.ExportConfiguration
* OR
- * spark-submit --class org.genomicsdb.spark.api.GenomicsDBSparkBindings genomicsdb-1.3.1-SNAPSHOT-allinone.jar loader.json query.json false
+ * spark-submit --class org.genomicsdb.spark.api.GenomicsDBSparkBindings genomicsdb-1.3.1-SNAPSHOT-allinone-spark.jar loader.json query.json false
* OR
- * spark-submit --class org.genomicsdb.spark.api.GenomicsDBSparkBindings genomicsdb-1.3.1-SNAPSHOT-allinone.jar loader.json query.json
+ * spark-submit --class org.genomicsdb.spark.api.GenomicsDBSparkBindings genomicsdb-1.3.1-SNAPSHOT-allinone-spark.jar loader.json query.json
*/
public class GenomicsDBSparkBindings {
List<VariantCall> variantCalls;

public static void main(String[] args) throws IOException, ClassNotFoundException {
if (args.length < 2) {
- throw new RuntimeException("Usage: spark-submit --class org.genomicsdb.spark.api.GenomicsDBSparkBindings genomicsdb-<VERSION>-allinone.jar <loader.json> <query.json> [<is_serialized_pb>]"+
+ throw new RuntimeException("Usage: spark-submit --class org.genomicsdb.spark.api.GenomicsDBSparkBindings genomicsdb-<VERSION>-allinone-spark.jar <loader.json> <query.json> [<is_serialized_pb>]"+
"Optional Argument 2 - <is_serialized_pb=True|False, default is false, if is_serialized_pb then query.json is a protobuf serialized file.");
}

tests/common.py (4 changes: 2 additions & 2 deletions)
@@ -55,7 +55,7 @@ def __find_genomicsdb_jar(target_dir, jar_file_name):

def setup_classpath(build_dir):
target_dir=os.path.join(build_dir,'target')
- allinone_jar=__find_genomicsdb_jar(target_dir,'genomicsdb-*allinone.jar')
+ allinone_jar=__find_genomicsdb_jar(target_dir,'genomicsdb-*allinone-spark.jar')
examples_jar=__find_genomicsdb_jar(target_dir,'genomicsdb-*examples.jar')
if 'CLASSPATH' in os.environ:
classpath=os.environ['CLASSPATH']
@@ -94,7 +94,7 @@ def setup_jacoco(build_dir, build_type):
if e.errno != errno.EEXIST:
__error_exit('could not create jacoco-reports dir:'+e.errno+' '+e.filename+' '+e.strerror)
genomicsdb_classes_dir = os.path.join(target_dir, 'jacoco-classes')
- allinone_archive = zipfile.ZipFile(__find_genomicsdb_jar(target_dir,'genomicsdb-*allinone.jar'))
+ allinone_archive = zipfile.ZipFile(__find_genomicsdb_jar(target_dir,'genomicsdb-*allinone-spark.jar'))
for file in allinone_archive.namelist():
if file.startswith('org/genomicsdb'):
allinone_archive.extract(file, genomicsdb_classes_dir)
tests/run_spark_hdfs.py (10 changes: 5 additions & 5 deletions)
@@ -222,7 +222,7 @@ def sanity_test_spark_bindings(tmpdir, lib_path, jar_dir, jacoco, genomicsdb_ver
substitute_placeholders(querypb_json, sanity_test_dir)

# Expected exception when run without json files
- spark_cmd = 'spark-submit --master '+spark_master+' --deploy-mode '+spark_deploy+' --total-executor-cores 1 --executor-memory 512M --conf "spark.executor.extraJavaOptions='+jacoco+'" --conf "spark.driver.extraJavaOptions='+jacoco+'" --class org.genomicsdb.spark.api.GenomicsDBSparkBindings '+jar_dir+'/genomicsdb-'+genomicsdb_version+'-allinone.jar'
+ spark_cmd = 'spark-submit --master '+spark_master+' --deploy-mode '+spark_deploy+' --total-executor-cores 1 --executor-memory 512M --conf "spark.executor.extraJavaOptions='+jacoco+'" --conf "spark.driver.extraJavaOptions='+jacoco+'" --class org.genomicsdb.spark.api.GenomicsDBSparkBindings '+jar_dir+'/genomicsdb-'+genomicsdb_version+'-allinone-spark.jar'
pid = subprocess.Popen(spark_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout_string, stderr_string = pid.communicate()
if(pid.returncode == 0):
@@ -231,7 +231,7 @@ def sanity_test_spark_bindings(tmpdir, lib_path, jar_dir, jacoco, genomicsdb_ver

output_string = "[row=0 col=12140 HG00141 1:12141-12295 {REF=C, DP_FORMAT=2, MIN_DP=0, ALT=[<NON_REF>], GQ=0, PL=[0, 0, 0], GT=0/0}, row=1 col=12144 HG01958 1:12145-12277 {REF=C, DP_FORMAT=3, MIN_DP=0, ALT=[<NON_REF>], GQ=0, PL=[0, 0, 0], GT=0/0}, row=0 col=17384 HG00141 1:17385-17385 {MQRankSum=-0.329000, AD=[58, 22, 17], MQ=31.719999, DP_FORMAT=80, ALT=[A, <NON_REF>], BaseQRankSum=-2.096000, GQ=99, PID=17385_G_A, ReadPosRankSum=0.005000, MQ0=8, GT=0/1, SB=[58, 0, 22, 0], RAW_MQ=5.500000, REF=G, ClippingRankSum=-1.859000, PL=[504, 0, 9807, 678, 1870, 2548], PGT=0|1}, row=1 col=17384 HG01958 1:17385-17385 {MQRankSum=-1.369000, AD=[0, 120, 37], MQ=29.820000, DP_FORMAT=120, ALT=[T, <NON_REF>], BaseQRankSum=-2.074000, GQ=99, PID=17385_G_T, ReadPosRankSum=-0.101000, DP=120, MQ0=3, GT=1/1, SB=[0, 0, 0, 0], RAW_MQ=2.500000, REF=G, ClippingRankSum=0.555000, PL=[3336, 358, 0, 4536, 958, 7349], PGT=0|1}, row=2 col=17384 HG01530 1:17385-17385 {MQRankSum=-0.432000, AD=[40, 36, 0], MQ=59.369999, DP_FORMAT=76, ALT=[A, <NON_REF>], BaseQRankSum=1.046000, GQ=99, ReadPosRankSum=2.055000, DP=76, MQ0=0, GT=0/1, SB=[9, 31, 13, 23], REF=G, ClippingRankSum=-2.242000, PL=[1018, 0, 1116, 1137, 1224, 2361]}]"

- spark_cmd = 'spark-submit --master '+spark_master+' --deploy-mode '+spark_deploy+' --total-executor-cores 1 --executor-memory 512M --conf "spark.executor.extraJavaOptions='+jacoco+'" --conf "spark.driver.extraJavaOptions='+jacoco+'" --class org.genomicsdb.spark.api.GenomicsDBSparkBindings '+jar_dir+'/genomicsdb-'+genomicsdb_version+'-allinone.jar '+loader_json+' '+query_json
+ spark_cmd = 'spark-submit --master '+spark_master+' --deploy-mode '+spark_deploy+' --total-executor-cores 1 --executor-memory 512M --conf "spark.executor.extraJavaOptions='+jacoco+'" --conf "spark.driver.extraJavaOptions='+jacoco+'" --class org.genomicsdb.spark.api.GenomicsDBSparkBindings '+jar_dir+'/genomicsdb-'+genomicsdb_version+'-allinone-spark.jar '+loader_json+' '+query_json
pid = subprocess.Popen(spark_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout_string, stderr_string = pid.communicate()
if(pid.returncode != 0):
@@ -241,7 +241,7 @@ def sanity_test_spark_bindings(tmpdir, lib_path, jar_dir, jacoco, genomicsdb_ver
sys.stderr.write('Expected output not found in sanity test with query.json\n')
print_error_and_exit(namenode, tmpdir, stdout_string, stderr_string)

- spark_cmd = 'spark-submit --master '+spark_master+' --deploy-mode '+spark_deploy+' --total-executor-cores 1 --executor-memory 512M --conf "spark.executor.extraJavaOptions='+jacoco+'" --conf "spark.driver.extraJavaOptions='+jacoco+'" --class org.genomicsdb.spark.api.GenomicsDBSparkBindings '+jar_dir+'/genomicsdb-'+genomicsdb_version+'-allinone.jar '+loader_json+' '+querypb_json+' true'
+ spark_cmd = 'spark-submit --master '+spark_master+' --deploy-mode '+spark_deploy+' --total-executor-cores 1 --executor-memory 512M --conf "spark.executor.extraJavaOptions='+jacoco+'" --conf "spark.driver.extraJavaOptions='+jacoco+'" --class org.genomicsdb.spark.api.GenomicsDBSparkBindings '+jar_dir+'/genomicsdb-'+genomicsdb_version+'-allinone-spark.jar '+loader_json+' '+querypb_json+' true'
pid = subprocess.Popen(spark_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout_string, stderr_string = pid.communicate()
if(pid.returncode != 0):
@@ -475,7 +475,7 @@ def main():
with open(query_json_filename, 'w') as fptr:
json.dump(test_query_dict, fptr, indent=4, separators=(',', ': '));
fptr.close();
- spark_cmd = 'spark-submit --class TestGenomicsDBSparkHDFS --master '+spark_master+' --deploy-mode '+spark_deploy+' --total-executor-cores 1 --executor-memory 512M --conf "spark.yarn.executor.memoryOverhead=3700" --conf "spark.executor.extraJavaOptions='+jacoco+'" --conf "spark.driver.extraJavaOptions='+jacoco+'" --jars '+jar_dir+'/genomicsdb-'+genomicsdb_version+'-allinone.jar '+jar_dir+'/genomicsdb-'+genomicsdb_version+'-examples.jar --loader '+loader_json_filename+' --query '+query_json_filename+' --template_vcf_header '+template_vcf_header_path+' --spark_master '+spark_master+' --jar_dir '+jar_dir;
+ spark_cmd = 'spark-submit --class TestGenomicsDBSparkHDFS --master '+spark_master+' --deploy-mode '+spark_deploy+' --total-executor-cores 1 --executor-memory 512M --conf "spark.yarn.executor.memoryOverhead=3700" --conf "spark.executor.extraJavaOptions='+jacoco+'" --conf "spark.driver.extraJavaOptions='+jacoco+'" --jars '+jar_dir+'/genomicsdb-'+genomicsdb_version+'-allinone-spark.jar '+jar_dir+'/genomicsdb-'+genomicsdb_version+'-examples.jar --loader '+loader_json_filename+' --query '+query_json_filename+' --template_vcf_header '+template_vcf_header_path+' --spark_master '+spark_master+' --jar_dir '+jar_dir;
if (test_name == "t6_7_8"):
spark_cmd = spark_cmd + ' --use-query-protobuf';
if (test_name == "t0_1_2_combined"):
@@ -512,7 +512,7 @@ def main():
vid_path_final=vid_path+query_param_dict['vid_mapping_file'];
else:
vid_path_final=vid_path+"inputs"+os.path.sep+"vid.json";
- spark_cmd_v2 = 'spark-submit --class TestGenomicsDBSource --master '+spark_master+' --deploy-mode '+spark_deploy+' --total-executor-cores 1 --executor-memory 512M --conf "spark.yarn.executor.memoryOverhead=3700" --conf "spark.executor.extraJavaOptions='+jacoco+'" --conf "spark.driver.extraJavaOptions='+jacoco+'" --jars '+jar_dir+'/genomicsdb-'+genomicsdb_version+'-allinone.jar '+jar_dir+'/genomicsdb-'+genomicsdb_version+'-examples.jar --loader '+loader_json_filename+' --query '+query_json_filename+' --vid '+vid_path_final+' --spark_master '+spark_master;
+ spark_cmd_v2 = 'spark-submit --class TestGenomicsDBSource --master '+spark_master+' --deploy-mode '+spark_deploy+' --total-executor-cores 1 --executor-memory 512M --conf "spark.yarn.executor.memoryOverhead=3700" --conf "spark.executor.extraJavaOptions='+jacoco+'" --conf "spark.driver.extraJavaOptions='+jacoco+'" --jars '+jar_dir+'/genomicsdb-'+genomicsdb_version+'-allinone-spark.jar '+jar_dir+'/genomicsdb-'+genomicsdb_version+'-examples.jar --loader '+loader_json_filename+' --query '+query_json_filename+' --vid '+vid_path_final+' --spark_master '+spark_master;
if (gdb_datasource != ""):
spark_cmd_v2 += ' --gdb_datasource=' + gdb_datasource
if (test_name == "t6_7_8"):
Expand Down