In [5]:
import pandas as pd 
from pathlib import Path
import urllib.request
import urllib.error
import urllib.parse

In [6]:
ftp_urls = [
    "ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/proteomes/proteome2taxid",
    "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/README"
]

print("Starting download process...")

for ftp_url in ftp_urls:
    print("-" * 40)

    parsed_url = urllib.parse.urlparse(ftp_url)
    url_path_str = parsed_url.path

    local_filename_str = Path(url_path_str).name

    local_path = Path(local_filename_str)

    full_local_path = local_path.resolve()
    print(f"Processing URL: {ftp_url}")
    print(f"Target local file: {full_local_path}")
    print(f"Checking existence...")

    if not local_path.exists():
        print(f"-> File '{local_path.name}' not found locally. Attempting download...")
        urllib.request.urlretrieve(ftp_url, local_path)
        print(f"-> Download complete! File saved as '{local_path}'.")
        
    else:
        print(f"-> File '{local_path.name}' already exists. Skipping download.")

print("-" * 40)
print("Download process finished.")

header_line = "Proteome_ID\tTax_ID\tOSCODE\tSUPERREGNUM\t#(1)\t#(2)\t#(3)\tSpecies Name"
found_header = False
lines_written = 0
input_filename = "README"
output_filename = "processed_README"

with open(input_filename, 'r', encoding='utf-8') as infile, \
        open(output_filename, 'w', encoding='utf-8') as outfile:

    for line in infile:
        stripped_line = line.strip()

        if found_header:
            if stripped_line == "":
                break
            else:
                outfile.write(line)
                lines_written += 1
        elif stripped_line == header_line:
            found_header = True
            outfile.write(line)
            lines_written += 1

if not found_header:
    print(f"Error: Header line not found in '{input_filename}'")
elif lines_written == 1 and found_header:
        print(f"Header found, but no data lines followed before the end or a blank line in '{input_filename}'. Output file '{output_filename}' contains only the header.")
elif lines_written > 1:
    print(f"Successfully extracted data to '{output_filename}'")


Starting download process...
----------------------------------------
Processing URL: ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/proteomes/proteome2taxid
Target local file: /storage/group/izg5139/default/lefteris/proteome2taxid
Checking existence...
-> File 'proteome2taxid' already exists. Skipping download.
----------------------------------------
Processing URL: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/README
Target local file: /storage/group/izg5139/default/lefteris/README
Checking existence...
-> File 'README' already exists. Skipping download.
----------------------------------------
Download process finished.
Successfully extracted data to 'processed_README'


In [7]:
uniprot_info = pd.read_csv(
    "processed_README",
    sep='\t'
    ).drop(columns= ["Proteome_ID","OSCODE","#(1)", "#(2)","#(3)"]).rename(columns={
        "Species Name": "Species_Name"
    })

uniprot_cel_orgs = uniprot_info[uniprot_info["SUPERREGNUM"] != "viruses"]

goa_info = pd.read_csv(
    "proteome2taxid",
    sep='\t',
    header = None,
    names = [
        "Species_Name",
        "Tax_ID",
        "GOA_file"
    ])

cellular_organisms = goa_info[goa_info['Tax_ID'].isin(uniprot_cel_orgs['Tax_ID'])]

In [24]:
cellular_organisms['index'] = cellular_organisms['GOA_file'].str.split('.').str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cellular_organisms['index'] = cellular_organisms['GOA_file'].str.split('.').str[0]


In [50]:
index_df = cellular_organisms[['index', 'Tax_ID']]
tax_id_dict = index_df.set_index('index')['Tax_ID'].to_dict()


In [None]:
background_pop = Path("/storage/group/izg5139/default/lefteris/background_pop")
outdir = Path("final_background_pop")

outdir.mkdir(parents=True, exist_ok=True)

column_names = [
    "DB",
    "DB_Object_ID",
    "DB_Object_Symbol",
    "Relation",
    "GO_ID",
    "DB:Reference_(|DB:Reference)",
    "Evidence_Code",
    "With_(or)_From",
    "Aspect",
    "DB_Object_Name",
    "DB_Object_Synonym_(|Synonym)",
    "DB_Object_Type",
    "Taxon_(|taxon)",
    "Date",
    "Assigned_By",
    "Annotation_Extension",
    "Gene_Product_Form_ID"]

dtypes = {
    "DB": "category",
    "DB_Object_ID": "category",
    "DB_Object_Symbol": "category",
    "Relation": "object",
    "GO_ID": "category",
    "DB:Reference_(|DB:Reference)": "category",
    "Evidence_Code": "category",
    "With_(or)_From": "object",
    "Aspect": "category",
    "DB_Object_Name": "category",
    "DB_Object_Synonym_(|Synonym)": "category",
    "DB_Object_Type": "category",
    "Taxon_(|taxon)": "category",
    "Date": "int64",
    "Assigned_By": "category",
    "Annotation_Extension": "object",
    "Gene_Product_Form_ID": "object",
}

def create_background(
    file,
    outdir,
    taxon_id,
    column_names,
    dtypes
    ):

    gaf = pd.read_csv(
        file,
        sep='\t',
        header=None,
        comment="!",
        names=column_names,
        dtype=dtypes
        )

    filtered_gaf = gaf[
        (gaf["DB"]=="UniProtKB") & 
        (~gaf["Relation"].str.contains('NOT', case=False, na=False)) & 
        (gaf["DB_Object_Type"]=="protein")
        ]
    
    filtered_gaf = filtered_gaf[['DB_Object_ID', 'GO_ID', 'Evidence_Code']].drop_duplicates()
    mod_gaf = outdir.joinpath(f"{taxon_id}_background.txt")
    
    filtered_gaf.to_csv(mod_gaf, sep = '\t', index = False, header=False)
    
    print(f"Saved background population for {taxon_id} to {mod_gaf}")

for file in background_pop.iterdir():
    if file.suffix == ".goa":
        index = file.name.split(".")[0]
        tax_id = tax_id_dict[index]
        create_background(
            file,
            outdir,
            tax_id,
            column_names,
            dtypes
        )

272557
192222
77020
1202724
1519489
36816
148814
36870
210007
623
189518
211586
196164
872965
272633
160488
199310
224911
224915
220668
183190
212717
1523422
1523424
203267
1684385
1523413
1523417
179993
157538
1523416
1690245
857265
1664068
1389006
10116
1523428
1523432
66876
1509431
1592327
1523425
1664694
1514904
264201
223283
226186
226185
227377
227882
228410
28189
1531966
37360
226900
5949
545501
35841
1169540
1677858
1476857
162
104259
1504672
1499687
481446
1034943
1461582
1109412
100787
456999
260936
1848903
690567
499207
710127
233413
235279
167539
282197
187304
282199
665126
420998
313367
39491
119224
360807
2378
1603897
301302
311410
203907
882
257313
298386
74547
243090
272561
233412
273121
242619
243365
251221
1716176
269536
70996
1300341
312540
36849
1472767
218284
1723761
243265
1134406
78410
229920
1302151
1605367
154981
471514
1544413
229921
186479
360411
8496
699431
1740090
869279
1610491
243230
1392
1527298
1620421
1120658
7227
83333
1712645
1736483
1736612
1736209


In [None]:
final_df = pd.merge(
    cellular_organisms,
    uniprot_cel_orgs[['Tax_ID', "SUPERREGNUM"]],
    on='Tax_ID',
    how='left'
)

eukaryotes = final_df[final_df['SUPERREGNUM'] == 'eukaryota']

In [None]:
final_df[final_df['Tax_ID'] == ]

Unnamed: 0,Species_Name,Tax_ID,GOA_file,index,SUPERREGNUM
0,'Catharanthus roseus' aster yellows phytoplasma,1193712,4203884.C_roseus_aster_yellows_phytoplasma.goa,4203884,bacteria
1,Aaosphaeria arxii CBS 175.79,1450172,4817555.A_arxii_CBS_17579.goa,4817555,eukaryota
2,Abditibacterium utsteinense,1960156,4151922.A_utsteinense.goa,4151922,bacteria
3,Abiotrophia defectiva ATCC 49176,592010,355803.A_defectiva_ATCC_49176.goa,355803,bacteria
4,Abrus precatorius (Indian licorice) (Glycine a...,3816,4706231.A_precatorius.goa,4706231,eukaryota
...,...,...,...,...,...
12126,methanotrophic endosymbiont of Bathymodiolus p...,343235,4390408.m_endosymbiont_of_Bathymodiolus_puteos...,4390408,bacteria
12127,secondary endosymbiont of Ctenarytaina eucalypti,1199245,283054.s_endosymbiont_of_Ctenarytaina_eucalypt...,283054,bacteria
12128,secondary endosymbiont of Heteropsylla cubana,134287,283056.s_endosymbiont_of_Heteropsylla_cubana.goa,283056,bacteria
12129,secondary endosymbiont of Trabutina mannipara,1835721,3812676.s_endosymbiont_of_Trabutina_mannipara.goa,3812676,bacteria


In [90]:
speed_benchmark_path = Path("/storage/group/izg5139/default/lefteris/goea_tool/speed_benchmark")
background_pop = Path("/storage/group/izg5139/default/lefteris/goea_tool/final_background_pop")

nums_to_sample = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
sample_dfs = {}

for num in nums_to_sample:
    sub_path = speed_benchmark_path.joinpath(f"sample_{num}")
    sub_path.mkdir(parents=True, exist_ok=True)

    sample_taxon_ids = eukaryotes.sample(n=num,random_state=42)['Tax_ID'].tolist()
    sample_dfs[num] = sample_taxon_ids
    
    for taxon_id in sample_taxon_ids:
        df = pd.read_csv(
            background_pop / f"{taxon_id}_background.txt",
            sep='\t',
            header=None
        )
        speed_df = df.sample(n=round(len(df) / 2),random_state=42)
        speed_df.to_csv(
            sub_path / f"{taxon_id}_sample.txt",
            sep = '\t',
            header=None,
            index=False
            )
