## Read TAIR GFF file (into a pandas dataframe)

Source URL: https://www.arabidopsis.org/download/index-auto.jsp?dir=%2Fdownload_files%2FGenes%2FAraport11_genome_release

In [1]:
import pandas as pd
tair_df = pd.read_csv('Araport11_GFF3_genes_transposons.current.gff.gz', 
                      sep='\t', 
                      comment='#', 
                      encoding='cp1252', 
                      header=None, 
                     compression='gzip')

tair_df.columns = ['chromosome', 'data_source', 'type', 'start', 'end', 'score', 'strand', 'frame', 'features']

In [2]:
def string_to_dict(input_string):
    '''Function to convert feature string into a dictionary.'''
    try:
        # Split the input string into a list of key-value pairs using semicolon as the delimiter
        feature_list = input_string.split(';')

        # Filter the list to only include items that contain an equal sign ('=')
        feature_list = [x for x in feature_list if '=' in x]

        # Create a dictionary by splitting each key-value pair at the first equal sign encountered
        feature_dict = dict(x.split('=', 1) for x in feature_list)
    except:
        # If there's an exception (e.g., invalid input), print the original input string and return an empty dictionary
        print(input_string)
        feature_dict = dict()
    
    # Return the resulting dictionary
    return feature_dict

# The code provided assumes there's a variable 'inp_str' containing the input string
# You should replace 'inp_str' with an actual input string before calling the function.
# Example usage:
# input_string = "key1=value1;key2=value2;key3=value3"
# result = string_to_dict(input_string)
# This will convert the input string into a dictionary and store it in the 'result' variable.

In [3]:
tair_df['features_dict'] = tair_df.features.apply(string_to_dict)

nan
nan


In [4]:
# string_to_dict('ID=AT1G01010.1;Parent=AT1G01010;Name=AT1G01010.1;Note=NAC domain containing protein 1;conf_class=2;symbol=NAC001;full_name=NAC domain containing protein 1;computational_description=NAC domain containing protein 1;conf_rating=****;gene=2200934,UniProt=Q0WV96;curator_summary=Member of the NAC domain containing family of plant specific transcriptional regulators.')

In [5]:
tair_df.features_dict

0         {'ID': 'AT1G01010', 'Name': 'AT1G01010', 'Note...
1         {'ID': 'AT1G01010.1', 'Parent': 'AT1G01010', '...
2         {'ID': 'AT1G01010:CDS:1', 'Parent': 'AT1G01010...
3         {'ID': 'AT1G01010:CDS:2', 'Parent': 'AT1G01010...
4         {'ID': 'AT1G01010:CDS:3', 'Parent': 'AT1G01010...
                                ...                        
927370    {'ID': 'ATMG01380:exon:1', 'Parent': 'ATMG0138...
927371    {'ID': 'ATMG01380.1', 'Parent': 'ATMG01380', '...
927372    {'ID': 'ATMG01390', 'id2': 'gene-rrn18', 'Name...
927373    {'ID': 'ATMG01390:exon:1', 'Parent': 'ATMG0139...
927374    {'ID': 'ATMG01390.1', 'Parent': 'ATMG01390', '...
Name: features_dict, Length: 927375, dtype: object

# Filter TAIR entries

#### Filtering entries based on column values

In [6]:
tair_df[(tair_df.type == 'protein') & (tair_df.strand == '+')]

Unnamed: 0,chromosome,data_source,type,start,end,score,strand,frame,features,features_dict
15,Chr1,Araport11,protein,3760.0,5630.0,.,+,.,ID=AT1G01010.1-Protein;Name=AT1G01010.1;Derive...,"{'ID': 'AT1G01010.1-Protein', 'Name': 'AT1G010..."
200,Chr1,Araport11,protein,23519.0,31079.0,.,+,.,ID=AT1G01040.1-Protein;Name=AT1G01040.1;Derive...,"{'ID': 'AT1G01040.1-Protein', 'Name': 'AT1G010..."
244,Chr1,Araport11,protein,23519.0,31079.0,.,+,.,ID=AT1G01040.2-Protein;Name=AT1G01040.2;Derive...,"{'ID': 'AT1G01040.2-Protein', 'Name': 'AT1G010..."
603,Chr1,Araport11,protein,53022.0,54494.0,.,+,.,ID=AT1G01110.1-Protein;Name=AT1G01110.1;Derive...,"{'ID': 'AT1G01110.1-Protein', 'Name': 'AT1G011..."
617,Chr1,Araport11,protein,52239.0,54494.0,.,+,.,ID=AT1G01110.2-Protein;Name=AT1G01110.2;Derive...,"{'ID': 'AT1G01110.2-Protein', 'Name': 'AT1G011..."
...,...,...,...,...,...,...,...,...,...,...
927135,ChrC,Araport11,protein,139856.0,140650.0,.,+,.,ID=ATCG01230.1-Protein;Name=ATCG01230.1;Derive...,"{'ID': 'ATCG01230.1-Protein', 'Name': 'ATCG012..."
927140,ChrC,Araport11,protein,140704.0,141171.0,.,+,.,ID=ATCG01240.1-Protein;Name=ATCG01240.1;Derive...,"{'ID': 'ATCG01240.1-Protein', 'Name': 'ATCG012..."
927147,ChrC,Araport11,protein,141854.0,143708.0,.,+,.,ID=ATCG01250.1-Protein;Name=ATCG01250.1;Derive...,"{'ID': 'ATCG01250.1-Protein', 'Name': 'ATCG012..."
927168,ChrC,Araport11,protein,152506.0,152787.0,.,+,.,ID=ATCG01300.1-Protein;Name=ATCG01300.1;Derive...,"{'ID': 'ATCG01300.1-Protein', 'Name': 'ATCG013..."


# Filter TAIR entries based on feature(s)

#### Filtering entries that have five star (\*\*\*\*\*) confidence rating.

In [7]:
tair_df[tair_df.features_dict.apply(lambda x: x.get('conf_rating', '') == '*****')]

Unnamed: 0,chromosome,data_source,type,start,end,score,strand,frame,features,features_dict
481,Chr1,Araport11,mRNA,38752.0,40945.0,.,-,.,ID=AT1G01070.2;Parent=AT1G01070;Name=AT1G01070...,"{'ID': 'AT1G01070.2', 'Parent': 'AT1G01070', '..."
593,Chr1,Araport11,mRNA,52061.0,54689.0,.,+,.,ID=AT1G01110.1;Parent=AT1G01110;Name=AT1G01110...,"{'ID': 'AT1G01110.1', 'Parent': 'AT1G01110', '..."
620,Chr1,Araport11,mRNA,57164.0,59215.0,.,-,.,ID=AT1G01120.1;Parent=AT1G01120;Name=AT1G01120...,"{'ID': 'AT1G01120.1', 'Parent': 'AT1G01120', '..."
820,Chr1,Araport11,mRNA,75390.0,76845.0,.,+,.,ID=AT1G01180.1;Parent=AT1G01180;Name=AT1G01180...,"{'ID': 'AT1G01180.1', 'Parent': 'AT1G01180', '..."
846,Chr1,Araport11,mRNA,86486.0,88409.0,.,-,.,ID=AT1G01200.1;Parent=AT1G01200;Name=AT1G01200...,"{'ID': 'AT1G01200.1', 'Parent': 'AT1G01200', '..."
...,...,...,...,...,...,...,...,...,...,...
927130,ChrC,Araport11,mRNA,139856.0,140650.0,.,+,.,ID=ATCG01230.1;Parent=ATCG01230;Name=ATCG01230...,"{'ID': 'ATCG01230.1', 'Parent': 'ATCG01230', '..."
927137,ChrC,Araport11,mRNA,140704.0,141171.0,.,+,.,ID=ATCG01240.1;Parent=ATCG01240;Name=ATCG01240...,"{'ID': 'ATCG01240.1', 'Parent': 'ATCG01240', '..."
927152,ChrC,Araport11,mRNA,144921.0,145154.0,.,-,.,ID=ATCG01270.1;Parent=ATCG01270;Name=ATCG01270...,"{'ID': 'ATCG01270.1', 'Parent': 'ATCG01270', '..."
927157,ChrC,Araport11,mRNA,145291.0,152175.0,.,-,.,ID=ATCG01280.1;Parent=ATCG01280;Name=ATCG01280...,"{'ID': 'ATCG01280.1', 'Parent': 'ATCG01280', '..."


# Filter TAIR entries based on feature(s)

#### Filtering entries that are protein coding.

In [8]:
tair_df[tair_df.features_dict.apply(lambda x: x.get('locus_type', '') == 'protein_coding')].head(5)

Unnamed: 0,chromosome,data_source,type,start,end,score,strand,frame,features,features_dict
0,Chr1,Araport11,gene,3631.0,5899.0,.,+,.,ID=AT1G01010;Name=AT1G01010;Note=NAC domain co...,"{'ID': 'AT1G01010', 'Name': 'AT1G01010', 'Note..."
17,Chr1,Araport11,gene,6788.0,9130.0,.,-,.,ID=AT1G01020;Name=AT1G01020;Note=Arv1-like pro...,"{'ID': 'AT1G01020', 'Name': 'AT1G01020', 'Note..."
138,Chr1,Araport11,gene,11649.0,13714.0,.,-,.,ID=AT1G01030;Name=AT1G01030;Note=AP2/B3-like t...,"{'ID': 'AT1G01030', 'Name': 'AT1G01030', 'Note..."
157,Chr1,Araport11,gene,23121.0,31227.0,.,+,.,ID=AT1G01040;Name=AT1G01040;Note=dicer-like 1;...,"{'ID': 'AT1G01040', 'Name': 'AT1G01040', 'Note..."
249,Chr1,Araport11,gene,31170.0,33171.0,.,-,.,ID=AT1G01050;Name=AT1G01050;Note=pyrophosphory...,"{'ID': 'AT1G01050', 'Name': 'AT1G01050', 'Note..."


In [9]:
tair_df.features.iloc[0]

'ID=AT1G01010;Name=AT1G01010;Note=NAC domain containing protein 1;symbol=NAC001;full_name=NAC domain containing protein 1;computational_description=NAC domain containing protein 1;locus=2200935;locus_type=protein_coding'