In [27]:
import pandas as pd 
from Bio import Entrez 
from info_from_pmid import get_info_from_medline

In [28]:
from config import ncbi_email 
# Set the contact e-mail on Biopython's Entrez module; the address is read
# from the local `config` module rather than being hard-coded here.
Entrez.email = ncbi_email


In [29]:
# Load the ARGEOS table (tab-separated) and show its raw SRA column.
argeos_path = 'data/argeos_Ribosome_profiling_Human.tsv'

argeos = pd.read_csv(argeos_path, sep="\t")
print(argeos.SRA)


def _sra_accession(value):
    """Return the text after the first "=" of an SRA search URL.

    Values that are not strings containing "=" are returned unchanged. That
    covers the "None" placeholder, NaN (depending on the pandas version,
    read_csv may parse "None" as a missing value) and already-bare accessions,
    so none of them raises AttributeError/IndexError.
    """
    if isinstance(value, str) and "=" in value:
        return value.split("=")[1]
    return value


# Reduce e.g. "https://www.ncbi.nlm.nih.gov/sra?term=SRP233220" to "SRP233220".
argeos["SRA"] = argeos["SRA"].map(_sra_accession)

0      https://www.ncbi.nlm.nih.gov/sra?term=SRP233220
1                                                 None
2      https://www.ncbi.nlm.nih.gov/sra?term=SRP114759
3      https://www.ncbi.nlm.nih.gov/sra?term=SRP192590
4      https://www.ncbi.nlm.nih.gov/sra?term=SRP047065
                            ...                       
391    https://www.ncbi.nlm.nih.gov/sra?term=SRP119397
392    https://www.ncbi.nlm.nih.gov/sra?term=SRP133177
393                                               None
394                                               None
395                                               None
Name: SRA, Length: 396, dtype: object


In [30]:
# Read the second tab-separated source table and show its SRA column.
geo_ribosome_profiling_path = 'data/geo_ribosomeprofiling.tsv'
geo_ribosome_profiling = pd.read_csv(
    filepath_or_buffer=geo_ribosome_profiling_path,
    sep='\t',
)

print(geo_ribosome_profiling["SRA"])

0      SRP346338
1      SRP293929
2      SRP317900
3      SRP329509
4      SRP278667
         ...    
126    SRP009321
127    SRP007567
128    SRP002605
129    SRP003554
130    SRP000637
Name: SRA, Length: 131, dtype: object


In [31]:
# Unique accessions across both tables, in order of first appearance
# (GEO table first, then ARGEOS). dict.fromkeys de-duplicates in a single
# pass while preserving insertion order, replacing the repeated list scans.
all_accessions = list(
    dict.fromkeys(
        list(geo_ribosome_profiling.Accession) + list(argeos.Accession)
    )
)

# Sanity check: row counts of each source versus the size of their union.
print(geo_ribosome_profiling.index)
print(argeos.index)
print(len(all_accessions))

RangeIndex(start=0, stop=131, step=1)
RangeIndex(start=0, stop=396, step=1)
479


In [32]:
# Key columns used to match rows between the two tables. Each key is listed
# once (the original list repeated "Organism"). Columns present in both
# tables but not listed here are suffixed _x / _y by pandas.
on = ["Accession", "Title", "Organism", "Samples", "SRA", "Release_Date"]

# Outer merge keeps every study from either source.
superset = pd.merge(geo_ribosome_profiling, argeos, how='outer', on=on)


In [33]:
# Coalesce the two suffixed "Type" columns left by the merge into one:
# take Type_x where it is present, otherwise fall back to Type_y.
# Rows where both are missing stay NaN instead of becoming the literal
# string "nan", which the previous str()-based loop wrote.
type_df = (
    superset["Type_x"]
    .fillna(superset["Type_y"])
    .to_frame(name="Type")
)


In [34]:
# Append the coalesced "Type" column; rows are matched on the index.
superset = pd.concat(objs=[superset, type_df], axis="columns")


In [35]:
# Remove every column whose name contains 'Type_' (the suffixed merge
# leftovers).
superset = superset.drop(
    columns=[name for name in superset.columns if 'Type_' in name]
)


In [36]:
import validators


def _gse_supplementary_url(gse_accession, base):
    """Build the GEO FTP supplementary-files URL for a series accession.

    The range directory is the accession with its last three characters
    replaced by "nnn" (GSE140639 -> GSE140nnn). Accessions with three or
    fewer digits (length <= 6) use the fixed directory "GSEnnn".
    """
    range_dir = (gse_accession[:-3] if len(gse_accession) > 6 else "GSE") + "nnn"
    return base + range_dir + "/" + gse_accession + "/suppl"


columns = ['GSE', 'GSE_Supplementary', 'BioProject']
link_df = pd.DataFrame(index=superset.index, columns=columns)

gse_supp_base = 'ftp://ftp.ncbi.nlm.nih.gov/geo/series/'
gse_base = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="

for i, row in superset.iterrows():
    supplementary_links = row['Supplementary Links']

    if pd.notna(supplementary_links):
        # Rows that carry a ';'-separated list of links: pick out the GEO
        # series link and the BioProject link.
        for link in supplementary_links.split(';'):
            if "GSE" in link:
                # The accession is taken from the second-to-last path segment.
                gse_accession = link.split('/')[-2]

                link_df.at[i, 'GSE_Supplementary'] = link
                link_df.at[i, 'GSE'] = gse_base + gse_accession
            elif "PRJ" in link:
                link_df.at[i, "BioProject"] = link
        continue

    geo_link = row['Link']
    # isinstance guard: a missing Link is NaN (a float), and `"GSE" in NaN`
    # would raise TypeError.
    if isinstance(geo_link, str) and "GSE" in geo_link:
        gse_accession = geo_link.split('=')[-1]
        link_df.at[i, 'GSE'] = geo_link
        link_df.at[i, 'BioProject'] = row['BioProject link (NCBI)']

        # Build the URL once so the URL that is validated is the URL that is
        # stored (previously a [:5] slice was validated, a [:-3] slice stored).
        supplementary_url = _gse_supplementary_url(gse_accession, gse_supp_base)
        # NOTE(review): validators.url presumably checks URL syntax only, not
        # that the FTP directory exists -- confirm.
        if validators.url(supplementary_url):
            link_df.at[i, 'GSE_Supplementary'] = supplementary_url
        else:
            link_df.at[i, 'GSE_Supplementary'] = ""
    else:
        link_df.at[i, 'GSE'] = ""
        link_df.at[i, 'GSE_Supplementary'] = ""
        link_df.at[i, 'BioProject'] = ""


print(link_df.columns)

# TODO (original author's note): This is still not working. The constructed
# supplementary links do not work.

Index(['GSE', 'GSE_Supplementary', 'BioProject'], dtype='object')


'\nThis is still not working. The constructed supplementary links do not work. \n'

In [37]:
# Append the GSE / GSE_Supplementary / BioProject link columns; rows are
# matched on the index.
superset = pd.concat(objs=[superset, link_df], axis="columns")


In [38]:
# Drop source-specific columns that are not carried into the final table.
# errors='ignore' skips any name that is not present in the frame.
superset = superset.drop(
    columns=[
        'Link',
        'Supplementary Links',
        'Supplementary Types',
        'BioProject link (NCBI)',
        'BioProject link (EBI)',
        'All References',
        'Platform',
        'Type of molecule',
        'Impact factor 2018',
        'Summary',
    ],
    errors='ignore',
)


In [39]:
# One row of publication metadata per study, aligned with `superset` by index.
paper_info_df = pd.DataFrame(
    columns=["PMID", "authors", "abstract", "title", "doi", "date_published", "PMC", "journal"],
    index=superset.index,
)

for i, row in superset.iterrows():
    pubmed_id = row['PubMed ID']
    doi_or_pmid = row['doi or pubmed id']

    # Choose the lookup key: the PubMed ID wins; otherwise use a DOI taken
    # from the 'doi or pubmed id' column. Rows with neither value are skipped
    # without printing anything.
    if pd.notna(pubmed_id):
        query = pubmed_id
    elif pd.notna(doi_or_pmid):
        # str() guards against non-string cells, on which `"doi" in value`
        # would raise TypeError.
        reference = str(doi_or_pmid)
        # Only entries containing "doi" are looked up; the DOI is whatever
        # follows 'doi.org/'.
        query = reference.split('doi.org/')[-1] if "doi" in reference else None
    else:
        continue

    if query is not None and len(str(query)) > 0:
        print(str(query))
        info_dict = get_info_from_medline(query)
        for field, value in info_dict.items():
            paper_info_df.at[i, field] = value

    print("index: \t", i)
    # .loc, not .iloc: `i` is an index label coming from iterrows().
    print(paper_info_df.loc[i])
    print()



34899662
index: 	 0
PMID                                                       34899662
authors            Xuejing Fan,  Tianyu Bao,  Huaxi Yi,  Zongcai...
abstract          To determine whether osmotic pressure affects ...
title             Ribosome Profiling and RNA Sequencing Reveal G...
doi                                       10.3389/fmicb.2021.781454
date_published                                                 2021
PMC                                                      PMC8656396
journal                                   Frontiers in microbiology
Name: 0, dtype: object

34535544
index: 	 3
PMID                                                       34535544
authors            Srivats Venkataramanan,  Margaret Gadek,  Lor...
abstract          DDX3 is a DEAD-box RNA helicase that regulates...
title             DDX3X and DDX3Y are redundant in protein synth...
doi                                          10.1261/rna.078926.121
date_published                                      

In [40]:
# Number of non-missing entries in each metadata column.
print(paper_info_df.count())

# Write the publication metadata as CSV without the row index.
paper_info_df.to_csv(
    path_or_buf="data/ribo_seq_paper_info.csv",
    index=False,
)

PMID              448
authors           448
abstract          444
title             444
doi               448
date_published    448
PMC               405
journal           448
dtype: int64


In [41]:
# Append the publication metadata columns; rows are matched on the index.
superset = pd.concat(objs=[superset, paper_info_df], axis="columns")

# Drop the listed reference/contact columns; names that are not present in
# the frame are skipped.
superset = superset.drop(
    columns=['PubMed ID', 'doi or pubmed id', 'All references', 'Journal', 'Contact'],
    errors='ignore',
)


In [42]:
# Write the merged table as tab-separated text without the row index.
superset.to_csv(
    path_or_buf="data/ribosome_profiling_superset.tsv",
    sep="\t",
    index=False,
)