# BlastN

### makeblastdb

In [None]:
# Build a nucleotide BLAST database from ecoli.fasta.
#   -parse_seqids : parse the sequence IDs found in the FASTA header lines
#   -dbtype nucl  : the input sequences are nucleotides
# NOTE(review): this is a plain shell command; in a Python-kernel notebook it
# needs a leading `!` (or a %%bash cell) to run — confirm the intended kernel.
makeblastdb -in ecoli.fasta -parse_seqids -dbtype nucl

In [None]:
# Build a nucleotide BLAST database from the SILVA 138.1 SSURef NR99 FASTA file
# (same options as the ecoli.fasta example: parse sequence IDs, nucleotide type).
# NOTE(review): absolute, machine-specific path; plain shell command — needs `!`
# or %%bash when the notebook runs on a Python kernel.
makeblastdb -in /mnt/d/Lab/16S-Taxa-Phlyo/database/SILVA_138.1_SSURef_NR99_tax_silva.fasta -parse_seqids -dbtype nucl

### blastn parameters

In [None]:
# Align the sequences in genes.fasta against genome.fasta, which is given
# directly as -subject (no pre-built database is used in this form).
#   -outfmt 6 : tabular output
# NOTE(review): plain shell command — needs `!` or %%bash on a Python kernel.
blastn  -query genes.fasta  -subject genome.fasta  -outfmt 6

```shell
blastn  -query merged.fasta -db /mnt/d/Lab/16S-Taxa-Phlyo/database/SILVA_138.1_SSURef_NR99_tax_silva.fasta -outfmt 6 -out blstnOut.tsv -num_threads 6 -evalue 1.0 -max_target_seqs 1
```

In [8]:
# Path passed to blastn as the -db argument (the SILVA FASTA file that
# makeblastdb was run on earlier in this notebook).
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
db = "/mnt/d/Lab/16S-Taxa-Phlyo/database/SILVA_138.1_SSURef_NR99_tax_silva.fasta"

In [9]:
def RunBlastnParallel(fastaList, db, jobs, threads, OutDir):
    """Run ``RunBlastn`` once per query file, using a pool of worker processes.

    Parameters
    ----------
    fastaList : iterable
        Query FASTA paths; one ``RunBlastn`` call is made per entry.
    db : str
        BLAST database path, passed unchanged to every call.
    jobs : int
        Number of worker processes in the pool.
    threads : int
        Forwarded to every ``RunBlastn`` call.
    OutDir : str
        Output directory, forwarded to every ``RunBlastn`` call.

    Returns
    -------
    None
    """
    # Imported locally because no cell in this notebook imports these names.
    from itertools import repeat
    from multiprocessing import Pool

    pool = Pool(processes=jobs)
    try:
        # starmap blocks until every (fasta, db, threads, OutDir) job has finished.
        pool.starmap(RunBlastn, zip(fastaList, repeat(db), repeat(threads), repeat(OutDir)))
        pool.close()
        pool.join()
    finally:
        # Also reached when a job raises, so worker processes are never left behind.
        pool.terminate()

def RunBlastn(fasta, db, threads, OutDir):
    """Run blastn for one query FASTA file and write tabular output into ``OutDir``.

    The output file name is derived from the query: the directory part of
    ``fasta`` is dropped, every ``".fasta"`` substring is removed from the file
    name, and ``"_blast.tsv"`` is appended.

    Parameters
    ----------
    fasta : str
        Path of the query FASTA file (``-query``).
    db : str
        BLAST database path (``-db``).
    threads : int
        Value passed as ``-num_threads``.
    OutDir : str
        Directory in which the ``*_blast.tsv`` result is written.

    Returns
    -------
    None
        The exit status of blastn is not inspected.
    """
    # Imported locally because no cell in this notebook imports these modules.
    import os
    import subprocess

    # Fix: the original called os.path.split() without an argument (TypeError).
    QueryName = os.path.split(fasta)[1].replace(".fasta", "")
    OutFile = os.path.join(OutDir, QueryName + "_blast.tsv")

    # Argument list instead of a shell string, so paths containing spaces or
    # shell metacharacters are passed to blastn unchanged.
    cmd = [
        "blastn",
        "-query", fasta,
        "-out", OutFile,
        "-evalue", "1.0",
        "-max_target_seqs", "1",
        "-outfmt", "6",
        "-db", db,
        "-num_threads", str(threads),
    ]
    subprocess.call(cmd)

### Index generation (sequence ID → taxonomy)

In [4]:
import pandas as pd

In [1]:
from Bio import SeqIO

In [2]:
# SILVA reference FASTA parsed below to build the ID -> taxonomy index
# (the same path as the one assigned to `db`).
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
fasta = "/mnt/d/Lab/16S-Taxa-Phlyo/database/SILVA_138.1_SSURef_NR99_tax_silva.fasta"

In [98]:
from Bio import SeqIO

# Iterator over the records of the FASTA file.
record_iterator = SeqIO.parse(fasta, "fasta")

# Look only at the first record to see how the header line is exposed.
first_record = next(record_iterator)
print(first_record.id)
print(first_record.name)
# Remove the "<id> " text from the description, leaving what follows the ID
# in the header (the taxonomy string, per the output of this cell).
print(first_record.description.replace(first_record.id + " " , ""))

AY846379.1.1791
AY846379.1.1791
Eukaryota;Archaeplastida;Chloroplastida;Chlorophyta;Chlorophyceae;Sphaeropleales;Monoraphidium;Monoraphidium sp. Itas 9/21 14-6w


In [99]:
# Walk the FASTA file once, collecting two parallel lists:
#   idList[i]   - record ID of the i-th sequence
#   taxaList[i] - its description with the "<id> " text removed
idList, taxaList = [], []
for record in SeqIO.parse(fasta, "fasta"):
    record_id = record.id
    idList.append(record_id)
    taxaList.append(record.description.replace(record_id + " ", ""))

In [100]:
# Number of record IDs collected from the FASTA file.
len(idList)

510508

In [101]:
# Sanity check: should equal len(idList), since one entry is appended to each list per record.
len(taxaList)

510508

In [104]:
# Spot-check the first collected ID.
idList[0]

'AY846379.1.1791'

In [103]:
# Spot-check the first collected taxonomy string.
taxaList[0]

'Eukaryota;Archaeplastida;Chloroplastida;Chlorophyta;Chlorophyceae;Sphaeropleales;Monoraphidium;Monoraphidium sp. Itas 9/21 14-6w'

In [106]:
!head /mnt/d/Lab/16S-Taxa-Phlyo/database/SILVA_138.1_SSURef_NR99_tax_silva.fasta

>AY846379.1.1791 Eukaryota;Archaeplastida;Chloroplastida;Chlorophyta;Chlorophyceae;Sphaeropleales;Monoraphidium;Monoraphidium sp. Itas 9/21 14-6w
AACCUGGUUGAUCCUGCCAGUAGUCAUAUGCUUGUCUCAAAGAUUAAGCCAUGCAUGUCUAAGUAUAAACUGCUUAUACU
GUGAAACUGCGAAUGGCUCAUUAAAUCAGUUAUAGUUUAUUUGAUGGUACCUCUACACGGAUAACCGUAGUAAUUCUAGA
GCUAAUACGUGCGUAAAUCCCGACUUCUGGAAGGGACGUAUUUAUUAGAUAAAAGGCCGACCGAGCUUUGCUCGACCCGC
GGUGAAUCAUGAUAACUUCACGAAUCGCAUAGCCUUGUGCUGGCGAUGUUUCAUUCAAAUUUCUGCCCUAUCAACUUUCG
AUGGUAGGAUAGAGGCCUACCAUGGUGGUAACGGGUGACGGAGGAUUAGGGUUCGAUUCCGGAGAGGGAGCCUGAGAAAC
GGCUACCACAUCCAAGGAAGGCAGCAGGCGCGCAAAUUACCCAAUCCUGAUACGGGGAGGUAGUGACAAUAAAUAACAAU
GCCGGGCAUUUCAUGUCUGGCAAUUGGAAUGAGUACAAUCUAAAUCCCUUAACGAGGAUCAAUUGGAGGGCAAGUCUGGU
GCCAGCAGCCGCGGUAAUUCCAGCUCCAAUAGCGUAUAUUUAAGUUGUUGCAGUUAAAAAGCUCGUAGUUGGAUUUCGGG
UGGGUUCCAGCGGUCCGCCUAUGGUGAGUACUGCUGUGGCCCUCCUUUUUGUCGGGGACGGGCUCCUGGGCUUCAUUGUC


In [107]:
# Start from an empty frame; its columns are filled in by the next cell.
df = pd.DataFrame()

In [108]:
# One row per FASTA record: the record ID and its taxonomy string.
# idList and taxaList are parallel lists, so the rows line up by position.
df["ID"] = idList
df["Taxa"] = taxaList

In [109]:
# Display the ID/Taxa table.
df

Unnamed: 0,ID,Taxa
0,AY846379.1.1791,Eukaryota;Archaeplastida;Chloroplastida;Chloro...
1,AB001445.1.1538,Bacteria;Proteobacteria;Gammaproteobacteria;Ps...
2,AY929368.1.1768,Eukaryota;Archaeplastida;Chloroplastida;Charop...
3,KM209255.204.1909,Bacteria;Proteobacteria;Gammaproteobacteria;En...
4,AY955002.1.1727,Eukaryota;Archaeplastida;Chloroplastida;Chloro...
...,...,...
510503,MVBC01000034.3.1520,Bacteria;Bacteroidota;Bacteroidia;Cytophagales...
510504,JQIO01000430.1097739.1099039,Bacteria;Cyanobacteria;Cyanobacteriia;Chloropl...
510505,JQHZ01065714.557.1975,Bacteria;Cyanobacteria;Cyanobacteriia;Chloropl...
510506,JQIN01002109.43002.44826,Eukaryota;Archaeplastida;Chloroplastida;Charop...


In [111]:
# Save the ID/Taxa index as a tab-separated file, without the row index.
# index=False is the documented way to omit it (index=None only worked by being falsy).
df.to_csv("silva-138-99-index.tsv", index=False, sep="\t")

In [112]:
# Reload the saved index from disk (read_table reads tab-separated text by default).
# Note: this rebinds `df` to the reloaded copy.
df = pd.read_table("silva-138-99-index.tsv")

In [113]:
# Display the reloaded table to confirm the TSV round trip.
df

Unnamed: 0,ID,Taxa
0,AY846379.1.1791,Eukaryota;Archaeplastida;Chloroplastida;Chloro...
1,AB001445.1.1538,Bacteria;Proteobacteria;Gammaproteobacteria;Ps...
2,AY929368.1.1768,Eukaryota;Archaeplastida;Chloroplastida;Charop...
3,KM209255.204.1909,Bacteria;Proteobacteria;Gammaproteobacteria;En...
4,AY955002.1.1727,Eukaryota;Archaeplastida;Chloroplastida;Chloro...
...,...,...
510503,MVBC01000034.3.1520,Bacteria;Bacteroidota;Bacteroidia;Cytophagales...
510504,JQIO01000430.1097739.1099039,Bacteria;Cyanobacteria;Cyanobacteriia;Chloropl...
510505,JQHZ01065714.557.1975,Bacteria;Cyanobacteria;Cyanobacteriia;Chloropl...
510506,JQIN01002109.43002.44826,Eukaryota;Archaeplastida;Chloroplastida;Charop...


In [114]:
# Full (untruncated) taxonomy string of the first row.
df["Taxa"][0]

'Eukaryota;Archaeplastida;Chloroplastida;Chlorophyta;Chlorophyceae;Sphaeropleales;Monoraphidium;Monoraphidium sp. Itas 9/21 14-6w'

In [115]:
# Keep the first row's taxonomy string as a sample for the split that follows.
taxa = df["Taxa"][0]

In [116]:
# The taxonomy levels are separated by ';' — split the sample into a list of levels.
taxa.split(";")

['Eukaryota',
 'Archaeplastida',
 'Chloroplastida',
 'Chlorophyta',
 'Chlorophyceae',
 'Sphaeropleales',
 'Monoraphidium',
 'Monoraphidium sp. Itas 9/21 14-6w']

需要把Euk and Arc 去掉吗？