diff --git a/openomics/database/ontology.py b/openomics/database/ontology.py index 584abf14..e1a0407c 100644 --- a/openomics/database/ontology.py +++ b/openomics/database/ontology.py @@ -13,7 +13,7 @@ class Ontology(Dataset): def __init__(self, path, file_resources=None, col_rename=None, npartitions=0, verbose=False): """ - Manages dataset input processing from tables and construct an ontology network from obo file. There ontology + Manages dataset input processing from tables and constructs an ontology network from .obo file. The ontology network is G(V,E) where there exists e_ij for child i to parent j to present "node i is_a node j". Args: diff --git a/openomics/utils/GTF.py b/openomics/utils/GTF.py deleted file mode 100644 index a1ee5ac4..00000000 --- a/openomics/utils/GTF.py +++ /dev/null @@ -1,146 +0,0 @@ -#!/usr/bin/env python -""" -GTF.py -Kamil Slowikowski -December 24, 2013 - -Read GFF/GTF files. Works with gzip compressed files and pandas. - - http://useast.ensembl.org/info/website/upload/gff.html - -LICENSE - -This is free and unencumbered software released into the public domain. -Anyone is free to copy, modify, publish, use, compile, sell, or -distribute this software, either in source code form or as a compiled -binary, for any purpose, commercial or non-commercial, and by any -means. - -In jurisdictions that recognize copyright laws, the author or authors -of this software dedicate any and all copyright interest in the -software to the public domain. We make this dedication for the benefit -of the public at large and to the detriment of our heirs and -successors. We intend this dedication to be an overt act of -relinquishment in perpetuity of all present and future rights to this -software under copyright law. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
-IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR -OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -OTHER DEALINGS IN THE SOFTWARE. - -For more information, please refer to -""" - - -import gzip -import re -from collections import defaultdict - -import dask.dataframe as dd -import pandas as pd - -GTF_HEADER = ['seqname', 'source', 'feature', 'start', 'end', 'score', - 'strand', 'frame'] -R_SEMICOLON = re.compile(r'\s*;\s*') -R_COMMA = re.compile(r'\s*,\s*') -R_KEYVALUE = re.compile(r'(\s+|\s*=\s*)') - - -def dataframe(filename, npartitions): - """Open an optionally gzipped GTF file and return a pandas.DataFrame. - - Args: - filename: - npartitions: - """ - # Each column is a list stored as a value in this dict. - result = defaultdict(list) - - for i, line in enumerate(lines(filename)): - for key in line.keys(): - # This key has not been seen yet, so set it to None for all - # previous lines. - if key not in result: - result[key] = [None] * i - - # Ensure this row has some value for each column. - for key in result.keys(): - result[key].append(line.get(key, None)) - - df = pd.DataFrame(result) - if npartitions: - return dd.from_pandas(df) - else: - return df - - -def lines(filename): - """Open an optionally gzipped GTF file and generate a dict for each line. - - Args: - filename: - """ - fn_open = gzip.open if filename.endswith('.gz') else open - - with fn_open(filename) as fh: - for line in fh: - if line.startswith('#'): - continue - else: - yield parse(line) - - -def parse(line): - """Parse a single GTF line and return a dict. - - Args: - line: - """ - result = {} - - fields = line.rstrip().split('\t') - - for i, col in enumerate(GTF_HEADER): - result[col] = _get_value(fields[i]) - - # INFO field consists of "key1=value;key2=value;...". 
- infos = [x for x in re.split(R_SEMICOLON, fields[8]) if x.strip()] - - for i, info in enumerate(infos, 1): - # It should be key="value". - try: - key, _, value = re.split(R_KEYVALUE, info, 1) - # But sometimes it is just "value". - except ValueError: - key = 'INFO{}'.format(i) - value = info - # Ignore the field if there is no value. - if value: - result[key] = _get_value(value) - - return result - - -def _get_value(value): - """ - Args: - value: - """ - if not value: - return None - - # Strip double and single quotes. - value = value.strip('"\'') - - # Return a list if the value has a comma. - if ',' in value: - value = re.split(R_COMMA, value) - # These values are equivalent to None. - elif value in ['', '.', 'NA']: - return None - - return value diff --git a/openomics/utils/read_gtf.py b/openomics/utils/read_gtf.py index 83ecbc29..69854505 100644 --- a/openomics/utils/read_gtf.py +++ b/openomics/utils/read_gtf.py @@ -16,7 +16,6 @@ from collections import OrderedDict from os.path import exists -import dask import dask.dataframe as dd import numpy as np import pandas as pd @@ -292,16 +291,16 @@ def parse_gtf_and_expand_attributes(filepath_or_buffer, npartitions=None, compre features (set or None): Ignore entries which don't correspond to one of the supplied features """ if npartitions: - ddf = parse_gtf_dask(filepath_or_buffer, npartitions=npartitions, compression=compression, features=features) - ddf = ddf.reset_index(drop=False) - ddf = ddf.set_index("index") + df = parse_gtf_dask(filepath_or_buffer, npartitions=npartitions, compression=compression, features=features) + df = df.reset_index(drop=False) + df = df.set_index("index") - attribute_values = ddf.pop("attribute") + attribute_values = df.pop("attribute") for column_name, values in expand_attribute_strings(attribute_values, usecols=restrict_attribute_columns).items(): series = dd.from_array(np.array(values, dtype=np.str)) - ddf[column_name] = series + df[column_name] = series else: df = 
parse_gtf(filepath_or_buffer, chunksize=chunksize, features=features) @@ -311,7 +310,7 @@ def parse_gtf_and_expand_attributes(filepath_or_buffer, npartitions=None, compre usecols=restrict_attribute_columns).items(): df[column_name] = values - return ddf + return df def read_gtf(filepath_or_buffer, npartitions=None, compression=None, expand_attribute_column=True,