In [2]:
import requests
import gzip
import numpy as np
import pandas as pd
from Bio import SeqIO

# 1. Loading genome

In [2]:
# Reference-genome FASTA file that the notebook parses when building `genome`
GENOME_PATH = "hg38.fa"

In [3]:
class DataSource:
    """
    A thin container pairing a loaded data object with the path it came from.

    Sourced from https://github.com/meuleman/SynthSeqs/blob/main/make_data/source.py

    This class used to be defined twice in this cell; the second, undocumented
    copy shadowed the first. It is now defined exactly once.

    Parameters
    ----------
    data : object
        The already-loaded data. It is stored as-is, without copying or
        validation.
    filepath : str or None
        Where the data was read from. It is stored as-is and never opened by
        this class.

    Attributes
    ----------
    raw_data : object
        The ``data`` argument passed to the constructor.
    filepath : str or None
        The ``filepath`` argument passed to the constructor.

    Properties
    ----------
    data
        Read-only access to ``raw_data``.
    """

    def __init__(self, data, filepath):
        self.raw_data = data
        self.filepath = filepath

    @property
    def data(self):
        """Return the stored data object (the same object as ``raw_data``)."""
        return self.raw_data


class ReferenceGenome(DataSource):
    """
    A class for quickly loading and querying the reference genome.

    The genome is held in memory as a dictionary mapping each FASTA record id
    to its sequence string.

    Methods
    -------
    from_path(path):
        Class method to create a ReferenceGenome instance from a FASTA file.

    from_dict(data_dict):
        Class method to create a ReferenceGenome instance from a dictionary.

    sequence(chrom, start, end):
        Returns the sequence of a specific chromosomal region.

    Properties
    ----------
    genome
        The dictionary of sequences keyed by chromosome name.
    """

    @classmethod
    def from_path(cls, path):
        """
        Creates a ReferenceGenome instance from a FASTA file.

        Every record in the file is read and kept in memory. Sequences are
        converted to upper case, so lower-case letters in the file are not
        distinguishable afterwards.

        Parameters
        ----------
        path : str
            The path to the FASTA file.

        Returns
        -------
        ReferenceGenome
            An instance of ReferenceGenome with the genome data loaded from the file.
        """
        genome_dict = {
            record.id: str(record.seq).upper()
            for record in SeqIO.parse(path, "fasta")
        }
        return cls(genome_dict, path)

    @classmethod
    def from_dict(cls, data_dict):
        """
        Creates a ReferenceGenome instance from a dictionary.

        Parameters
        ----------
        data_dict : dict
            A dictionary where keys are chromosome names and values are sequences.

        Returns
        -------
        ReferenceGenome
            An instance of ReferenceGenome wrapping ``data_dict`` (not copied),
            with ``filepath`` set to None.
        """
        return cls(data_dict, filepath=None)

    @property
    def genome(self):
        """
        Returns the genome data.

        Returns
        -------
        dict
            A dictionary where keys are chromosome names and values are sequences.
        """
        return self.data

    def sequence(self, chrom, start, end):
        """
        Returns the sequence of a specific chromosomal region.

        The region is a Python slice of the chromosome string: ``start`` is
        0-based and inclusive, ``end`` is exclusive, so the result has
        ``end - start`` characters.

        Parameters
        ----------
        chrom : str
            The name of the chromosome.
        start : int
            The starting position of the sequence (0-based, inclusive).
        end : int
            The ending position of the sequence (exclusive). A value equal to
            the chromosome length is allowed and selects up to the last base.

        Returns
        -------
        str
            The DNA sequence from the specified chromosomal region.

        Raises
        ------
        KeyError
            If ``chrom`` is not a key of the genome dictionary.
        AssertionError
            If ``start`` is negative or ``end`` is beyond the chromosome length.
        """
        chrom_sequence = self.genome[chrom]
        chrom_length = len(chrom_sequence)

        # Raised explicitly (not via `assert`) so the checks still run under
        # `python -O`; the exception type is unchanged for existing callers.
        if start < 0:
            # A negative start would be interpreted by slicing as an offset
            # from the chromosome end and silently return the wrong region.
            raise AssertionError(
                f"Sequence start position out of range for chromosome {chrom}. "
                f"Requested start {start} is negative."
            )
        # `end` is exclusive, so end == chrom_length is a valid request.
        if end > chrom_length:
            raise AssertionError(
                f"Sequence position bound out of range for chromosome {chrom}. "
                f"{chrom} length {len(chrom_sequence)}, requested position {end}."
            )
        return chrom_sequence[start:end]


# Parse every record of the FASTA file at GENOME_PATH into memory (upper-cased
# sequence strings keyed by record id) so regions can be sliced directly.
genome = ReferenceGenome.from_path(GENOME_PATH)

In [4]:
# Biosample-level metadata table. It holds library order, biosample names,
# DCC and Altius identifiers, experimental protocols, biosample
# characteristics, and further quantitative and qualitative metrics.
# The last row of the file is discarded by the trailing `.iloc[:-1]`
# (presumably a non-sample row -- TODO confirm against the TSV).
DHS_Index_and_Vocabulary_metadata = pd.read_table(
    './DHS_Index_and_Vocabulary_metadata.tsv'
).iloc[:-1]
DHS_Index_and_Vocabulary_metadata.head()

Unnamed: 0,library order,Biosample name,Vocabulary representative,DCC Experiment ID,DCC Library ID,DCC Biosample ID,DCC File ID,Altius Aggregation ID,Altius Library ID,Altius Biosample ID,...,Library cleanup,DNaseI units/mL,Amount Nucleic Acid (ng),Nuclei count,Protease inhibitor,Library sequencing date,Reads used,DCC SPOT score,Per-biosample peaks,DHSs in Index
0,1.0,GM06990,,ENCSR000EMQ,ENCLB435ZZZ,ENCBS057ENC,ENCFF983CTQ,AG5636,LN1203,DS7748,...,Sucrose,,50.0,,,2009-02-23,142681590.0,0.679,83639.0,82918.0
1,2.0,HepG2,,ENCSR000ENP,ENCLB480ZZZ,ENCBS114ENC,ENCFF419JVG,AG5635,LN1207,DS7764,...,Sucrose,,50.0,,,2009-02-23,138826342.0,0.5858,89748.0,89235.0
2,3.0,hTH1,,ENCSR000EQC,ENCLB591ZZZ,ENCBS345AAA,ENCFF575KOF,AG5634,LN1222,DS7840,...,Sucrose,6.0,534.9,,,2007-06-06,149158633.0,0.647,94360.0,93665.0
3,4.0,Hela,,ENCSR000ENO,ENCLB479ZZZ,ENCBS890POO,ENCFF503PAE,AG4219,LN1264,DS8200,...,new Sucrose,4.0,50.0,,,2007-08-24,23372724.0,0.6444,59098.0,59024.0
4,5.0,CACO2,,ENCSR000EMI,ENCLB422ZZZ,ENCBS391ENC,ENCFF977BRD,AG4218,LN1269,DS8235,...,Sucrose,8.0,1.0,,,2007-09-05,22760059.0,0.719,29894.0,29724.0


In [5]:
# List every column name of the biosample metadata table
DHS_Index_and_Vocabulary_metadata.columns

Index(['library order', 'Biosample name', 'Vocabulary representative',
       'DCC Experiment ID', 'DCC Library ID', 'DCC Biosample ID',
       'DCC File ID', 'Altius Aggregation ID', 'Altius Library ID',
       'Altius Biosample ID', 'Replicate indicators', 'System', 'Subsystem',
       'Organ', 'Biosample type', 'Biological state', 'Germ layer',
       'Description ', 'Growth stage', 'Age', 'Sex', 'Ethnicity', 'Donor ID',
       'Unique cellular condition', 'Used in Figure 1b', 'Biosample protocol',
       'Experiment protocol', 'Library kit method', 'Library cleanup',
       'DNaseI units/mL', 'Amount Nucleic Acid (ng)', 'Nuclei count',
       'Protease inhibitor', 'Library sequencing date', 'Reads used',
       'DCC SPOT score', 'Per-biosample peaks', 'DHSs in Index'],
      dtype='object')

In [6]:
# Load the NMF basis array from a .npy file
basis_array = np.load('2018-06-08NC16_NNDSVD_Basis.npy')

# Keep writing the CSV export of the basis array (same file name as before),
# in case other tooling reads it
np.savetxt("2018-06-08NC16_NNDSVD_Basis.csv", basis_array, delimiter=",")

# Build the loadings DataFrame straight from the in-memory array instead of
# re-reading the CSV just written: this skips a text round trip of the values.
# The cast to float matches the float columns that parsing the CSV produced.
nmf_loadings = pd.DataFrame(np.asarray(basis_array, dtype=float))

# Rename columns for clarity, creating columns C1 to C16 for the 16 components
nmf_loadings.columns = ['C' + str(i) for i in range(1, 17)]

# The join below is positional (row i of the metadata with row i of the
# loadings), so a row-count mismatch would silently pad one side with NaN.
if len(nmf_loadings) != len(DHS_Index_and_Vocabulary_metadata):
    raise ValueError(
        f"Row count mismatch: metadata has {len(DHS_Index_and_Vocabulary_metadata)} rows, "
        f"NMF basis has {len(nmf_loadings)} rows."
    )

# Join the metadata DataFrame with the NMF loadings DataFrame
# This combines the metadata with the component presence matrix.
# Component columns left over from a previous run of this cell are dropped
# first, so re-running the cell does not duplicate C1..C16.
DHS_Index_and_Vocabulary_metadata = pd.concat(
    [
        DHS_Index_and_Vocabulary_metadata.drop(columns=nmf_loadings.columns, errors='ignore'),
        nmf_loadings,
    ],
    axis=1,
)

In [7]:
# Preview the metadata now that the C1..C16 loading columns are attached
DHS_Index_and_Vocabulary_metadata.head()

Unnamed: 0,library order,Biosample name,Vocabulary representative,DCC Experiment ID,DCC Library ID,DCC Biosample ID,DCC File ID,Altius Aggregation ID,Altius Library ID,Altius Biosample ID,...,C7,C8,C9,C10,C11,C12,C13,C14,C15,C16
0,1.0,GM06990,,ENCSR000EMQ,ENCLB435ZZZ,ENCBS057ENC,ENCFF983CTQ,AG5636,LN1203,DS7748,...,0.0,0.0,0.0,0.102685,0.0,0.0,0.026774,0.0,0.0,0.0
1,2.0,HepG2,,ENCSR000ENP,ENCLB480ZZZ,ENCBS114ENC,ENCFF419JVG,AG5635,LN1207,DS7764,...,0.193557,0.0,0.074557,0.095928,0.0,0.0,3.190564,0.416094,0.0,0.0
2,3.0,hTH1,,ENCSR000EQC,ENCLB591ZZZ,ENCBS345AAA,ENCFF575KOF,AG5634,LN1222,DS7840,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,Hela,,ENCSR000ENO,ENCLB479ZZZ,ENCBS890POO,ENCFF503PAE,AG4219,LN1264,DS8200,...,0.155545,0.0,0.0,0.0,0.0,0.0,0.407768,0.113676,0.0,2.420549
4,5.0,CACO2,,ENCSR000EMI,ENCLB422ZZZ,ENCBS391ENC,ENCFF977BRD,AG4218,LN1269,DS8235,...,0.131876,0.0,0.0,0.0,0.0,0.0,0.936955,0.0,0.0,0.0


In [8]:
# Names of the 16 NMF component columns: 'C1', 'C2', ..., 'C16'
COMPONENT_COLUMNS = [f'C{number}' for number in range(1, 17)]

# Add a 'Component' column to the metadata DataFrame.
# For each sample it holds the number of the component column with the largest
# value: the column label is looked up with idxmax and the leading 'C' is
# stripped before converting the remainder to an integer.
DHS_Index_and_Vocabulary_metadata['Component'] = (
    DHS_Index_and_Vocabulary_metadata[COMPONENT_COLUMNS]
    .idxmax(axis=1)
    .map(lambda label: int(label[1:]))
)

In [9]:
# Preview the metadata again to check the newly added 'Component' column
DHS_Index_and_Vocabulary_metadata.head()

Unnamed: 0,library order,Biosample name,Vocabulary representative,DCC Experiment ID,DCC Library ID,DCC Biosample ID,DCC File ID,Altius Aggregation ID,Altius Library ID,Altius Biosample ID,...,C8,C9,C10,C11,C12,C13,C14,C15,C16,Component
0,1.0,GM06990,,ENCSR000EMQ,ENCLB435ZZZ,ENCBS057ENC,ENCFF983CTQ,AG5636,LN1203,DS7748,...,0.0,0.0,0.102685,0.0,0.0,0.026774,0.0,0.0,0.0,5
1,2.0,HepG2,,ENCSR000ENP,ENCLB480ZZZ,ENCBS114ENC,ENCFF419JVG,AG5635,LN1207,DS7764,...,0.0,0.074557,0.095928,0.0,0.0,3.190564,0.416094,0.0,0.0,13
2,3.0,hTH1,,ENCSR000EQC,ENCLB591ZZZ,ENCBS345AAA,ENCFF575KOF,AG5634,LN1222,DS7840,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
3,4.0,Hela,,ENCSR000ENO,ENCLB479ZZZ,ENCBS890POO,ENCFF503PAE,AG4219,LN1264,DS8200,...,0.0,0.0,0.0,0.0,0.0,0.407768,0.113676,0.0,2.420549,16
4,5.0,CACO2,,ENCSR000EMI,ENCLB422ZZZ,ENCBS391ENC,ENCFF977BRD,AG4218,LN1269,DS8235,...,0.0,0.0,0.0,0.0,0.0,0.936955,0.0,0.0,0.0,1


# Create sequence metadata frame

In [10]:
# This matrix represents the contribution of each of the 16 NMF components to each of the 3.6 million DHSs
nmf_loadings = pd.read_csv('2018-06-08NC16_NNDSVD_Mixture.csv', header=None, names=COMPONENT_COLUMNS)

In [11]:
# Load the metadata for the 3.6 million DHSs
sequence_metadata = pd.read_table('./DHS_Index_and_Vocabulary_hg38_WM20190703.txt', sep='\t')

# Drop the 'component' column from the metadata DataFrame if it exists.
# errors='ignore' makes the "if it exists" part true: without it, a file
# lacking that column raises KeyError.
sequence_metadata = sequence_metadata.drop(columns=['component'], errors='ignore')

# The merge below is positional (row i with row i), so a row-count mismatch
# would silently pair DHSs with the wrong loadings or pad with NaN.
if len(sequence_metadata) != len(nmf_loadings):
    raise ValueError(
        f"Row count mismatch: DHS metadata has {len(sequence_metadata)} rows, "
        f"NMF mixture has {len(nmf_loadings)} rows."
    )

# Merge the sequence metadata with the NMF loadings DataFrame
# This combines the metadata for each DHS with its corresponding NMF component contributions
df = pd.concat([sequence_metadata, nmf_loadings], axis=1, sort=False)

  sequence_metadata = pd.read_table('./DHS_Index_and_Vocabulary_hg38_WM20190703.txt', sep='\t')


In [12]:
# Preview the DHS annotations joined with their C1..C16 loadings
df.head()

Unnamed: 0,seqname,start,end,identifier,mean_signal,numsamples,summit,core_start,core_end,C1,...,C7,C8,C9,C10,C11,C12,C13,C14,C15,C16
0,chr1,16140,16200,1.10011,0.129388,1,16170,16170.0,16170.0,0.001439,...,4e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chr1,51868,52040,1.10021,0.080034,1,51970,51970.0,51970.0,0.0,...,0.011431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chr1,57280,57354,1.10025,0.273251,4,57350,57350.0,57350.0,0.0,...,0.0,0.025745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chr1,66370,66482,1.10027,0.183716,8,66430,66410.0,66430.0,0.001336,...,0.002904,0.001445,0.0,0.0,0.0,0.0,0.003885,0.0,0.0,0.0
4,chr1,79100,79231,1.1003,0.113049,2,79150,79150.0,79150.0,0.0,...,0.006965,0.0,0.000208,0.001768,0.003912,0.0,0.0,0.0,0.0,0.0


In [13]:
def sequence_bounds(summit: int, start: int, end: int, length: int) -> tuple:
    """
    Calculate the sequence coordinates (bounds) for a given DHS.

    The returned window always spans exactly ``length`` positions
    (``right - left == length``) and never starts below 0:

    * if the summit is closer than ``length // 2`` to ``start``, the window
      is anchored at ``start``;
    * otherwise, if the summit is closer than ``length // 2`` to ``end``, the
      window is anchored at ``end`` (shifted right if that would begin
      below 0);
    * otherwise the window is placed around the summit, with ``length // 2``
      positions to its left.

    Parameters
    ----------
    summit : int
        The summit position of the DHS.
    start : int
        The starting position of the DHS.
    end : int
        The ending position of the DHS.
    length : int
        The desired length of the sequence.

    Returns
    -------
    tuple
        A tuple containing the left and right bounds of the sequence.

    Adapted from
    https://github.com/meuleman/SynthSeqs/blob/main/make_data/process.py
    """
    half = length // 2

    if (summit - start) < half:
        return start, start + length

    if (end - summit) < half:
        # Clamp at 0: a negative left bound would be treated by string
        # slicing as an offset from the chromosome end.
        left = max(end - length, 0)
        return left, left + length

    # `left + length` rather than `summit + half`, so an odd `length` still
    # yields a window of exactly `length` positions.
    left = summit - half
    return left, left + length

def add_sequence_column(df: pd.DataFrame, genome, length: int):
    """
    Query the reference genome for each DHS and add the raw sequences
    to the dataframe.

    The columns 'seqname', 'summit', 'start' and 'end' are read from ``df``.
    For every row the window is computed with ``sequence_bounds`` and fetched
    with ``genome.sequence``.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe of DHS annotations and NMF loadings. It is modified in
        place: a 'sequence' column is added (or overwritten).
    genome : ReferenceGenome(DataSource)
        A reference genome object to query for sequences.
    length : int
        The desired length of the DHS sequence.

    Returns
    -------
    pd.DataFrame
        The same dataframe object, with an added column 'sequence' containing
        the sequences for each DHS.

    Adapted from
    https://github.com/meuleman/SynthSeqs/blob/main/make_data/process.py
    """
    # Walk the four needed columns in parallel instead of using iterrows():
    # iterrows builds a Series per row (slow on millions of rows) and casts
    # each row to a common dtype, which can turn integer coordinates into
    # floats.
    coordinates = zip(df['seqname'], df['summit'], df['start'], df['end'])

    seqs = []
    for chrom, summit, start, end in coordinates:
        # Determine the left and right bounds of the sequence
        left, right = sequence_bounds(summit, start, end, length)

        # Query the genome to get the sequence for the given chromosomal region
        seqs.append(genome.sequence(chrom, left, right))

    # Add the sequences as a new column in the dataframe
    df['sequence'] = seqs
    return df

In [14]:
# Recreating some of the columns from our original dataset

# Determine the component with the highest value for each DHS and create a 'component' column
# The component is determined by finding the column with the maximum value in the COMPONENT_COLUMNS
df['component'] = df[COMPONENT_COLUMNS].idxmax(axis=1).apply(lambda x: int(x[1:]))

# Calculate the proportion of the maximum component value to the sum of all component values for each DHS
# This represents how dominant the highest component is relative to the others
# (computed once; the former second, identical computation was removed)
df['proportion'] = df[COMPONENT_COLUMNS].max(axis=1) / df[COMPONENT_COLUMNS].sum(axis=1)

# Calculate the total signal for each DHS by multiplying the mean signal by the number of samples
df['total_signal'] = df['mean_signal'] * df['numsamples']

# Create an identifier for each DHS by concatenating the chromosome name, start, end, and summit positions
# The identifier is in the format 'seqname_start_end_summit'
# Built with column-wise string concatenation rather than a row-wise apply,
# which is far slower on a frame with millions of rows.
df['dhs_id'] = (
    df['seqname'].astype(str)
    + '_' + df['start'].astype(str)
    + '_' + df['end'].astype(str)
    + '_' + df['summit'].astype(str)
)

# Calculate the width of each DHS by subtracting the start position from the end position
df['DHS_width'] = df['end'] - df['start']

In [15]:
# Build the final per-DHS table in one chain:
#   1. attach a 'sequence' column holding a 200-character window per DHS,
#   2. rename 'seqname' to 'chr',
#   3. keep only the columns below, in this order.
# NOTE: after this cell `df` no longer has a 'seqname' column, so the cell
# cannot be run a second time without rebuilding `df` first.
df = (
    add_sequence_column(df, genome, 200)
    .rename(columns={'seqname': 'chr'})
    .loc[
        :,
        [
            'dhs_id',
            'chr',
            'start',
            'end',
            'DHS_width',
            'summit',
            'numsamples',
            'total_signal',
            'component',
            'proportion',
            'sequence',
        ],
    ]
)

In [16]:
# Preview the reordered per-DHS table, including the new 'sequence' column
df.head()

Unnamed: 0,dhs_id,chr,start,end,DHS_width,summit,numsamples,total_signal,component,proportion,sequence
0,chr1_16140_16200_16170,chr1,16140,16200,60,16170,1,0.129388,1,0.855153,CGGGCATCCTGTGTGCAGATACTCCCTGCTTCCTCTCTAGCCCCCA...
1,chr1_51868_52040_51970,chr1,51868,52040,172,51970,1,0.080034,7,0.973545,GGCGACCCAGCGAGACTCCGCCTCAAAAAAAAAAAAAGAAGATTGA...
2,chr1_57280_57354_57350,chr1,57280,57354,74,57350,4,1.093002,8,1.0,CTCAGTCATTCCGAACAATTCACACACTAAGATTACCCATGCTAAA...
3,chr1_66370_66482_66430,chr1,66370,66482,112,66430,8,1.469725,3,0.332213,ATATATAAATTATATAATATAATATATATTATATAATATAATATAT...
4,chr1_79100_79231_79150,chr1,79100,79231,131,79150,2,0.226098,7,0.50184,CATTTCTCCAAGGAGGAAATACCAGAGTCAATTCACAACCACTGCA...


In [17]:
# Binary matrix of DHS peak presence (1) or absence (0) across 733 biosamples.
# The file has no header row.
binary_matrix = pd.read_table('./dat_bin_FDR01_hg38.txt', header=None)

# One label per biosample, in metadata row order, formatted as
# 'BiosampleName_DCCLibraryID'
celltype_encodeID = [
    biosample + "_" + library_id
    for biosample, library_id in zip(
        DHS_Index_and_Vocabulary_metadata['Biosample name'],
        DHS_Index_and_Vocabulary_metadata['DCC Library ID'],
    )
]

# Use those labels as the column names of the binary matrix, so each column
# is identified by its cell type and library ID
binary_matrix.columns = celltype_encodeID

In [18]:
# Preview the binary matrix with its biosample column labels
binary_matrix.head()

Unnamed: 0,GM06990_ENCLB435ZZZ,HepG2_ENCLB480ZZZ,hTH1_ENCLB591ZZZ,Hela_ENCLB479ZZZ,CACO2_ENCLB422ZZZ,CACO2_ENCLB423ZZZ,SKNSH_ENCLB585ZZZ,SKNSH_ENCLB586ZZZ,HMEC_ENCLB493ZZZ,GM12878_ENCLB441ZZZ,...,fKidney_ENCLB005SRL,fKidney_ENCLB704GMQ,fKidney_ENCLB759USM,fLung_ENCLB594BSZ,fKidney_ENCLB049MNH,fUmbilical_cord_ENCLB771UER,fBone_femur_ENCLB236BWV,fLiver_ENCLB638FEH,fPlacenta_ENCLB423VBC,fPlacenta_ENCLB711ZZZ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [19]:
# Put the per-DHS annotation columns and the per-biosample binary columns side
# by side; rows are paired through their index labels.
master_dataset = pd.concat(
    objs=[df, binary_matrix],
    axis=1,
    sort=False,
)

master_dataset.head()

Unnamed: 0,dhs_id,chr,start,end,DHS_width,summit,numsamples,total_signal,component,proportion,...,fKidney_ENCLB005SRL,fKidney_ENCLB704GMQ,fKidney_ENCLB759USM,fLung_ENCLB594BSZ,fKidney_ENCLB049MNH,fUmbilical_cord_ENCLB771UER,fBone_femur_ENCLB236BWV,fLiver_ENCLB638FEH,fPlacenta_ENCLB423VBC,fPlacenta_ENCLB711ZZZ
0,chr1_16140_16200_16170,chr1,16140,16200,60,16170,1,0.129388,1,0.855153,...,0,0,0,0,0,0,0,0,0,0
1,chr1_51868_52040_51970,chr1,51868,52040,172,51970,1,0.080034,7,0.973545,...,0,0,0,0,0,0,0,0,0,0
2,chr1_57280_57354_57350,chr1,57280,57354,74,57350,4,1.093002,8,1.0,...,0,0,0,0,0,0,0,0,0,0
3,chr1_66370_66482_66430,chr1,66370,66482,112,66430,8,1.469725,3,0.332213,...,0,0,0,0,0,0,0,0,0,0
4,chr1_79100_79231_79150,chr1,79100,79231,131,79150,2,0.226098,7,0.50184,...,0,0,0,1,0,0,0,0,0,0


In [20]:
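# Save as feather file (left commented out; uncomment to regenerate
# 'master_dataset.ftr', the file the next cell reads back)
# master_dataset.to_feather('master_dataset.ftr')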

In [3]:
# Reload the dataset from the feather file on disk; this rebinds
# `master_dataset`, replacing any in-memory frame built above.
master_dataset = pd.read_feather('./master_dataset.ftr')

In [11]:
# Display the reloaded dataset (the rendering is abbreviated to the first and last rows)
master_dataset

Unnamed: 0,dhs_id,chr,start,end,DHS_width,summit,numsamples,total_signal,component,proportion,...,fKidney_ENCLB005SRL,fKidney_ENCLB704GMQ,fKidney_ENCLB759USM,fLung_ENCLB594BSZ,fKidney_ENCLB049MNH,fUmbilical_cord_ENCLB771UER,fBone_femur_ENCLB236BWV,fLiver_ENCLB638FEH,fPlacenta_ENCLB423VBC,fPlacenta_ENCLB711ZZZ
0,chr1_16140_16200_16170,chr1,16140,16200,60,16170,1,0.129388,1,0.855153,...,0,0,0,0,0,0,0,0,0,0
1,chr1_51868_52040_51970,chr1,51868,52040,172,51970,1,0.080034,7,0.973545,...,0,0,0,0,0,0,0,0,0,0
2,chr1_57280_57354_57350,chr1,57280,57354,74,57350,4,1.093002,8,1.000000,...,0,0,0,0,0,0,0,0,0,0
3,chr1_66370_66482_66430,chr1,66370,66482,112,66430,8,1.469725,3,0.332213,...,0,0,0,0,0,0,0,0,0,0
4,chr1_79100_79231_79150,chr1,79100,79231,131,79150,2,0.226098,7,0.501840,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3591893,chrY_56882540_56882719_56882610,chrY,56882540,56882719,179,56882610,1,0.038079,5,0.803229,...,0,0,0,0,0,0,0,0,0,0
3591894,chrY_56882864_56882980_56882930,chrY,56882864,56882980,116,56882930,1,0.115489,5,0.742349,...,0,0,0,0,0,0,0,0,0,0
3591895,chrY_56883733_56883960_56883830,chrY,56883733,56883960,227,56883830,5,2.456885,7,0.559734,...,0,0,0,0,0,0,0,0,0,0
3591896,chrY_56884440_56884580_56884510,chrY,56884440,56884580,140,56884510,1,0.053759,5,0.803229,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Show the first 20 column names: the per-DHS fields, then the first biosample columns
master_dataset.columns[:20]

Index(['dhs_id', 'chr', 'start', 'end', 'DHS_width', 'summit', 'numsamples',
       'total_signal', 'component', 'proportion', 'sequence',
       'GM06990_ENCLB435ZZZ', 'HepG2_ENCLB480ZZZ', 'hTH1_ENCLB591ZZZ',
       'Hela_ENCLB479ZZZ', 'CACO2_ENCLB422ZZZ', 'CACO2_ENCLB423ZZZ',
       'SKNSH_ENCLB585ZZZ', 'SKNSH_ENCLB586ZZZ', 'HMEC_ENCLB493ZZZ'],
      dtype='object')