In [1]:
import os
import glob
import tqdm
import shutil
import pandas as pd
import zipfile
import multiprocessing as mp

cpu_count = mp.cpu_count()

In [2]:
SRA_path = "./SRA_zip/"

In [3]:
def df_to_fasta(df, path, tag="tag", data='data'):
    lines = []
    df.apply(lambda row: lines.append(f">{row[tag]}\n{row[data]}\n"),axis=1)
    with open(path, 'w') as file:        
        file.write(''.join(lines))

In [4]:
files = glob.glob(f"{SRA_path}*.zip")

In [5]:
result = pd.read_csv("result_level2_filter_clustered.csv")
print(result.shape)
result.head(2)

(1749, 139)


Unnamed: 0,Reference miRNA cluster,Reference miRNA IDs,Reference miRNA IDs and species,confidence,seq name,ct name,ct,pdf,hit start,hit end,...,distal closest to 21,distal closest to 36,Loop distal junction distance,Loop proximal junction distance,message,hit cluster number,boi cluster number,precursor cluster number,identical hit cluster,seed region
0,C5736,gma-miR408b-5p,gma-miR408b-5p MIMAT0021630 Glycine max miR408...,False,CM040440.1|+|17436091-17436510|201-220,Fold 01,"=HYPERLINK(""http://jupyter.sysmanager.ir/tree/...","=HYPERLINK(""http://jupyter.sysmanager.ir/tree/...",201,220,...,"['loop=dist:19, size:2 + 1']","['loop=dist:19, size:2 + 1']",0.0,15.0,-,['0005'],['0001'],['0001'],1,GGGGAACAGGCA
1,C5736,gma-miR408b-5p,gma-miR408b-5p MIMAT0021630 Glycine max miR408...,False,CM040440.1|+|17436091-17436510|201-220,Fold 04,"=HYPERLINK(""http://jupyter.sysmanager.ir/tree/...","=HYPERLINK(""http://jupyter.sysmanager.ir/tree/...",201,220,...,"['loop=dist:19, size:2 + 1']","['loop=dist:19, size:2 + 1']",0.0,15.0,-,['0005'],['0002'],['0001'],1,GGGGAACAGGCA


In [6]:
result['precursor_unique'] = result['precursor name'].apply(lambda x: '|'.join(x.split("|")[:-1]))
pre_df = result[['precursor_unique', 'precursor seq']]
pre_df['precursor seq'] = pre_df['precursor seq'].apply(lambda x: x.lower())
pre_df = pre_df.drop_duplicates(subset=['precursor seq'], keep='first')
df_to_fasta(pre_df, "./precursor_query.fasta", tag="precursor_unique", data='precursor seq')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_df['precursor seq'] = pre_df['precursor seq'].apply(lambda x: x.lower())


In [7]:
hit_df = result[['hit position on chromosome', 'hit seq']]
hit_df = hit_df.drop_duplicates(subset=['hit seq'], keep='first')
df_to_fasta(hit_df, "./hit_query.fasta", tag="hit position on chromosome", data='hit seq')

In [8]:
header = "qseqid sseqid qstart qend sstart send qseq sseq evalue bitscore score length pident nident mismatch positive gapopen gaps ppos frames qframe sframe sstrand qcovs qcovhsp qlen slen"

# Main loop

In [None]:
for file in tqdm.tqdm(files):
    with zipfile.ZipFile(file , 'r') as zip_ref:
        zip_ref.extractall("./Temp_extract/")
    name = file[len(SRA_path):-4]
    for  query, output, word_size in [["./precursor_query.fasta", "precursor", 28], ["./hit_query.fasta", "hit",18]]:
        command = f'''blastn -query {query}
        -out ./Result/{output}/{name} 
        -num_threads {cpu_count} 
        -db ./Temp_extract/blastdb 
        -word_size {word_size} \
        -penalty -2 
        -reward 1 
        -gapopen 5 
        -gapextend 2 
        -evalue 0.001 
        -outfmt "6 {header}"
        '''.replace("\n", "")
        os.system(command)  
    shutil.rmtree('./Temp_extract')

# Process

## precursor

In [9]:
def precursor_check_functin(row, df):
    if(df['qseqid'].isin([row['precursor_unique']]).any()):        
        row['precursor expression'] = True
        row['precursor expression db'].append(name)
    return row
    
    
result['precursor expression'] = False
result['precursor expression db'] = result['precursor expression'].apply(lambda x: [])
for file in tqdm.tqdm(files):
    name = file[len(SRA_path):-4]
    df = pd.read_csv(f"./Result/precursor/{name}", sep="\t", header=None)
    df.columns = header.split(" ")    
    result = result.apply(lambda row: precursor_check_functin(row, df), axis=1)            

100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:27<00:00,  1.74it/s]


In [10]:
temp = result.copy()
temp['precursor seq'] = temp['precursor seq'].apply(lambda x: x.lower())
temp = temp.drop_duplicates(subset=['precursor seq'], keep='first')
print(sum(temp['precursor expression']))
print(temp.shape[0])

(268, 142)

## hit

In [18]:
def hit_check_functin(row, df):
    if(df['qseqid'].isin([row['hit position on chromosome']]).any()):        
        row['hit expression'] = True
        row['hit expression db'].append(name)
    return row
    
    
result['hit expression'] = False
result['hit expression db'] = result['hit expression'].apply(lambda x: [])
for file in tqdm.tqdm(files):
    name = file[len(SRA_path):-4]
    df = pd.read_csv(f"./Result/hit/{name}", sep="\t", header=None)
    df.columns = header.split(" ")        
    result = result.apply(lambda row: hit_check_functin(row, df), axis=1)            

  result['hit expression'] = False
  result['hit expression db'] = result['hit expression'].apply(lambda x: [])
100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:26<00:00,  1.82it/s]


In [19]:
temp = result.copy()
temp['hit seq'] = temp['hit seq'].apply(lambda x: x.lower())
temp = temp.drop_duplicates(subset=['hit seq'], keep='first')
print(sum(temp['hit expression']))
print(temp.shape[0])

201
401


In [22]:
result.to_csv("result_expression.csv",index=False)