In [1]:
import dinuq
import csv
from Bio import SeqIO

In [2]:
def remove_stop_codons(sequence):
    """Return ``sequence`` with every in-frame stop codon removed.

    The sequence is read as consecutive, non-overlapping triplets starting at
    index 0. A triplet equal to TAA, TAG or TGA (compared case-insensitively)
    is dropped; every other chunk, including a trailing chunk shorter than
    three characters, is kept unchanged and in its original order.
    """
    stops = ('TAA', 'TAG', 'TGA')
    kept = []
    for start in range(0, len(sequence), 3):
        triplet = sequence[start:start + 3]
        if triplet.upper() in stops:
            # In-frame stop codon: leave it out of the result.
            continue
        kept.append(triplet)
    return ''.join(kept)

def process_fasta(input_file, output_file):
    """Copy in-frame FASTA records to ``output_file`` with stop codons stripped.

    Records are read from ``input_file`` with ``SeqIO.parse``. A record whose
    sequence length is not a multiple of 3 is skipped and never written (no
    message is emitted for it). Every other record is written as two lines:
    ``>`` followed by the record id, then the sequence returned by
    ``remove_stop_codons`` on a single line.
    """
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for record in SeqIO.parse(infile, 'fasta'):
            if len(record.seq) % 3 != 0:
                # Length is not a whole number of codons: skip this record.
                continue

            cleaned = remove_stop_codons(str(record.seq))
            outfile.write('>{}\n{}\n'.format(record.id, cleaned))

def write_nt_tsv(nt_dict, outfile):
    """Write a nested ``{accession: {column: value}}`` dict to a TSV file.

    The first column is named ``acc`` and holds the outer keys. The remaining
    columns are the union of all inner-dict keys, in first-seen order, so the
    header is identical to the first record's keys whenever all records share
    the same keys. Each row is aligned to the header by key, not by position;
    a record that lacks a column gets an empty cell there.

    An empty ``nt_dict`` produces a file containing only the ``acc`` header
    instead of raising ``IndexError``.

    Args:
        nt_dict: Mapping of accession -> mapping of column name -> value.
        outfile: Path of the TSV file to create (overwritten if it exists).

    Returns:
        None. A confirmation message is printed once the file is written.
    """
    # Ordered union of every inner key; dict.fromkeys keeps first-seen order
    # and drops duplicates.
    columns = list(
        dict.fromkeys(col for values in nt_dict.values() for col in values)
    )

    with open(outfile, 'w', newline='') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')

        # Write header
        writer.writerow(["acc"] + columns)

        # Write data, looking each value up by column name so rows cannot
        # drift out of alignment with the header.
        for key, values in nt_dict.items():
            writer.writerow([key] + [values.get(col, "") for col in columns])

    print(f"Data written to {outfile}")


In [3]:
# All 16 ordered dinucleotide pairs over C, G, U and A, written as "XpY".
# This list is passed as the second argument to dinuq.RDA and dinuq.SDUc.
alldns = ['CpC', 'CpG', 'CpU', 'CpA', 'GpC', 'GpG', 'GpU', 'GpA', 'UpC', 'UpG', 'UpU', 'UpA', 'ApC', 'ApG', 'ApU', 'ApA']

## Rhabdoviridae

### RDA

In [17]:
# Run dinuq.RDA on the Rhabdoviridae genome FASTA for every dinucleotide in
# alldns. The displayed result is a dict keyed by accession, each value
# mapping a dinucleotide label to a float.
rda = dinuq.RDA("dinuq/rhabdo/rhabdo_genomes.fasta", alldns)

In [18]:
# Display the RDA result.
# NOTE(review): this prints the entire dict (one entry per accession);
# consider showing only a few entries to keep the saved output small.
rda

{'NC_043065.1': {'CpC': 1.199736533601869,
  'CpG': 0.5034967236821848,
  'CpU': 1.0255098179191728,
  'CpA': 1.187782636980836,
  'GpC': 0.714362872916604,
  'GpG': 1.2245096033154415,
  'GpU': 0.7506439944000604,
  'GpA': 1.2525907669196752,
  'UpC': 1.2735089242133517,
  'UpG': 1.10502527930996,
  'UpU': 0.9935297776665765,
  'UpA': 0.7542800210881181,
  'ApC': 0.8214019230862496,
  'ApG': 1.082170254413597,
  'ApU': 1.1612815573937303,
  'ApA': 0.9185264271421355},
 'NC_079052.1': {'CpC': 1.070783323891768,
  'CpG': 0.5457369208273218,
  'CpU': 1.1772972988013726,
  'CpA': 1.1646483887954484,
  'GpC': 0.831089559168405,
  'GpG': 0.9517796499500867,
  'GpU': 0.8745103444907787,
  'GpA': 1.26073095453367,
  'UpC': 1.3125051118289812,
  'UpG': 1.1278731545768923,
  'UpU': 0.9470519217866223,
  'UpA': 0.7339755904330341,
  'ApC': 0.8146801537386881,
  'ApG': 1.2299236191785237,
  'ApU': 1.029756798517988,
  'ApA': 0.91289996100619},
 'NC_038755.1': {'CpC': 1.1500401905488558,
  'CpG': 

In [19]:
# Export the Rhabdoviridae RDA values as a tab-separated table (one row per
# accession, one column per dinucleotide). The call also prints the table,
# as seen in the cell output.
dinuq.RDA_to_tsv(rda, "dinuq/rhabdo/dinuq_rhabdo_rda2.tsv")

acc	CpC	CpG	CpU	CpA	GpC	GpG	GpU	GpA	UpC	UpG	UpU	UpA	ApC	ApG	ApU	ApA
NC_043065.1	1.199736533601869	0.5034967236821848	1.0255098179191728	1.187782636980836	0.714362872916604	1.2245096033154415	0.7506439944000604	1.2525907669196752	1.2735089242133517	1.10502527930996	0.9935297776665765	0.7542800210881181	0.8214019230862496	1.082170254413597	1.1612815573937303	0.9185264271421355
NC_079052.1	1.070783323891768	0.5457369208273218	1.1772972988013726	1.1646483887954484	0.831089559168405	0.9517796499500867	0.8745103444907787	1.26073095453367	1.3125051118289812	1.1278731545768923	0.9470519217866223	0.7339755904330341	0.8146801537386881	1.2299236191785237	1.029756798517988	0.91289996100619
NC_038755.1	1.1500401905488558	0.3532173356814723	0.9882023395976891	1.4608796249086324	0.814627368778891	1.1117082539582634	0.9071182925546517	1.1116117772740077	1.1745300534612266	1.3959844382427875	0.8263819904093123	0.6786036325878004	0.9078912923160992	1.0330129647394821	1.232757464043582	0.8445747177740545

### SDUc

In [22]:
# Input: unprocessed OVRV1 CDS FASTA; output: the same records with in-frame
# stop codons removed (read by the SDUc and ntcont cells for OVRV1).
input_file_path = 'dinuq/rhabdo/OVRV1_cdsUnpaired.fasta'
output_file_path = 'dinuq/rhabdo/OVRV1_no_stop.fasta'

# Process the FASTA file and remove stop codons.
# Records whose length is not a multiple of 3 are skipped by process_fasta
# without any message, so the output may hold fewer records than the input.
process_fasta(input_file_path, output_file_path)

In [23]:
# Run dinuq.SDUc on the stop-codon-free OVRV1 CDS file for all dinucleotides.
# The TSV printed later shows per-accession values split into pos1 / pos2 /
# bridge columns for each dinucleotide.
OVRV1_sdu=dinuq.SDUc("dinuq/rhabdo/OVRV1_no_stop.fasta", alldns)

In [24]:
# Strip in-frame stop codons from the blackfly rhabdovirus CDS file, then run
# dinuq.SDUc on the cleaned file.
# The cell output shows dinuq warning that blackfly_rhabdo2_cds contains
# ambiguous nucleotides and that the affected codons are removed.
process_fasta("dinuq/rhabdo/blackfly_rhabdo.fasta", "dinuq/rhabdo/blackfly_rhabdo_no_stop.fasta")
blackfly_sdu=dinuq.SDUc("dinuq/rhabdo/blackfly_rhabdo_no_stop.fasta", alldns)


\(*'o'*)/	Sequence blackfly_rhabdo2_cds has ambiguous nucleotides!
	Ambiguous codons will be removed, but that might affect bridge dinucleotide calculations.


In [25]:
# Strip in-frame stop codons from the concatenated RefSeq rhabdovirus CDS
# file, then run dinuq.SDUc on the cleaned file.
# The cell output lists several accessions for which dinuq reports ambiguous
# nucleotides; per its message, those codons are removed and bridge
# dinucleotide calculations might be affected.
process_fasta("dinuq/rhabdo/rhabdo_refseq_cds_concat.fasta", "dinuq/rhabdo/rhabdo_refseq_cds_no_stop.fasta")
refseq_sdu=dinuq.SDUc("dinuq/rhabdo/rhabdo_refseq_cds_no_stop.fasta", alldns)


\(*'o'*)/	Sequence NC_076145.1 has ambiguous nucleotides!
	Ambiguous codons will be removed, but that might affect bridge dinucleotide calculations.

\(*'o'*)/	Sequence NC_077130.1 has ambiguous nucleotides!
	Ambiguous codons will be removed, but that might affect bridge dinucleotide calculations.

\(*'o'*)/	Sequence NC_013955.1 has ambiguous nucleotides!
	Ambiguous codons will be removed, but that might affect bridge dinucleotide calculations.

\(*'o'*)/	Sequence NC_075304.1 has ambiguous nucleotides!
	Ambiguous codons will be removed, but that might affect bridge dinucleotide calculations.

\(*'o'*)/	Sequence NC_076208.1 has ambiguous nucleotides!
	Ambiguous codons will be removed, but that might affect bridge dinucleotide calculations.

\(*'o'*)/	Sequence NC_020810.1 has ambiguous nucleotides!
	Ambiguous codons will be removed, but that might affect bridge dinucleotide calculations.

\(*'o'*)/	Sequence NC_031305.1 has ambiguous nucleotides!
	Ambiguous codons will be removed, but th

In [26]:
# Merge the three SDUc results into one dict. If the same key occurs in more
# than one input, the right-most dict wins (blackfly over OVRV1 over RefSeq).
sdu={**refseq_sdu, **OVRV1_sdu, **blackfly_sdu}

In [27]:
# Export the merged Rhabdoviridae SDUc values as a tab-separated table; the
# call also prints the table, as seen in the cell output.
dinuq.dict_to_tsv(sdu, "dinuq/rhabdo/dinuq_rhabdo_cds_sdu.tsv")

acc	CpCpos2	CpCbridge	CpGpos1	CpGpos2	CpGbridge	CpUpos1	CpUpos2	CpUbridge	CpApos2	CpAbridge	GpCpos2	GpCbridge	GpGpos2	GpGbridge	GpUpos2	GpUbridge	GpApos2	GpAbridge	UpCpos1	UpCpos2	UpCbridge	UpGpos2	UpGbridge	UpUpos1	UpUpos2	UpUbridge	UpApos2	UpAbridge	ApCpos2	ApCbridge	ApGpos1	ApGpos2	ApGbridge	ApUpos2	ApUbridge	ApApos2	ApAbridge
NC_076145.1	1.2196964337561305	1.1288968254287295	0.4942131457160614	0.5021505047522821	0.38917756428375333	0.9335864768297265	1.342634093411316	1.168234320978466	1.148774432136888	1.254983426498328	0.5777937884694956	0.6508684226248797	1.3823744626801597	1.1690792764868838	0.5915805278193452	1.050001907566434	1.2052973897750523	1.1715070662151945	1.2068471980469455	1.0750994373243445	0.9048614038800774	1.3006183668820266	1.1663241513992249	1.0700358414669808	0.825716309471835	0.822876522983215	0.8480570906652485	0.7343479926170704	0.980166924438128	0.7219787786904314	1.0614816554508155	1.1716908590316764	1.085019121658157	1.0155787584274583	1.1970458452626007

In [28]:
# Run dinuq.ntcont on each stop-codon-free CDS file (RefSeq, OVRV1, blackfly).
# Presumably this returns per-sequence nucleotide content; the results are
# used below as dicts of dicts keyed by accession — TODO confirm against the
# dinuq documentation.
refseq_nt=dinuq.ntcont("dinuq/rhabdo/rhabdo_refseq_cds_no_stop.fasta")
OVRV1_nt=dinuq.ntcont("dinuq/rhabdo/OVRV1_no_stop.fasta")
blackfly_nt=dinuq.ntcont("dinuq/rhabdo/blackfly_rhabdo_no_stop.fasta")

In [29]:
# Merge the three ntcont results into one dict; on duplicate keys the
# right-most dict wins (blackfly over OVRV1 over RefSeq).
nt={**refseq_nt, **OVRV1_nt, **blackfly_nt}

In [30]:
# Specify the output TSV file path
# NOTE(review): the file name says "refseq", but `nt` also contains the OVRV1
# and blackfly entries merged in the previous cell.
# NOTE(review): this rebinds `output_file_path`, which an earlier cell used
# for the OVRV1 no-stop FASTA path.
output_file_path = "dinuq/rhabdo/rhabdo_refseq_ntcont.tsv"
write_nt_tsv(nt, output_file_path)

Data written to dinuq/rhabdo/rhabdo_refseq_ntcont.tsv


## Flaviviridae

### RDA

In [20]:
# Run dinuq.RDA on the Flaviviridae genome FASTA for every dinucleotide.
# NOTE(review): this rebinds `rda`, discarding the Rhabdoviridae result held
# under the same name. The execution counts (20, 21) also show this section
# was run before the Rhabdoviridae SDUc cells (22 onwards), so the saved
# state does not reflect a top-to-bottom run.
rda = dinuq.RDA("dinuq/flavi/flavi_genome.fasta", alldns)

In [21]:
# Export the Flaviviridae RDA values as a tab-separated table (one row per
# accession); the call also prints the table, as seen in the cell output.
dinuq.RDA_to_tsv(rda, "dinuq/flavi/dinuq_flavi_rda.tsv")

acc	CpC	CpG	CpU	CpA	GpC	GpG	GpU	GpA	UpC	UpG	UpU	UpA	ApC	ApG	ApU	ApA
KC505248.1	1.0829642574352483	0.8866294818401511	0.8993524969319768	1.1508926542950155	0.9696472235854836	0.9312261102599892	0.9179147451253474	1.174209296460441	0.8656019617270241	1.300498184773237	1.2068175542590043	0.589038056989142	1.0910824848592116	0.8901053823334973	0.9801717932438369	1.065522740392771
AF160193.1	1.1036550862374745	0.4105743332906042	1.264290084647077	1.2810686896898733	0.9125790853392002	1.0423599490356221	0.8299057835227444	1.171151510763857	1.0300887816826265	1.4205594492731657	1.0614839512804366	0.5052854713965713	0.9835856248357148	1.0364172661627054	0.9181588506793154	1.0459982939447559
GQ165809.2	1.119422180737322	0.8287643759051583	0.9313134250990047	1.1382369206665792	0.9074329590484737	1.019418104223347	0.8789034542625913	1.171658323619923	0.9707910654692902	1.2374610823697183	1.2080257756709576	0.5934064540330236	1.0245880729809762	0.8833593664140508	0.9667231903730479	1.1238185013852

### SDUc

In [None]:
# Strip in-frame stop codons from the Flaviviridae CDS file, then run
# dinuq.SDUc on the cleaned file.
# NOTE(review): this cell has no execution count in the saved notebook, yet
# `flavi_sdu` and flavi_cds_no_stop.fasta are used by later cells. Re-run it
# in order so the notebook survives Restart & Run All.
process_fasta("dinuq/flavi/flavi_cds.fasta", "dinuq/flavi/flavi_cds_no_stop.fasta")
flavi_sdu=dinuq.SDUc("dinuq/flavi/flavi_cds_no_stop.fasta", alldns)

In [40]:
# Run dinuq.SDUc on the blackfly flavivirus CDS file.
# NOTE(review): unlike every other SDUc input in this notebook, this file is
# not passed through process_fasta first — confirm it already has no stop
# codons and only in-frame sequences.
# NOTE(review): this rebinds `blackfly_sdu`, which previously held the
# blackfly Rhabdoviridae result.
blackfly_sdu=dinuq.SDUc("dinuq/flavi/blackfly_flavi.fasta", alldns)

In [41]:
# Merge the Flaviviridae SDUc results into one dict; on duplicate keys the
# blackfly entry wins. This rebinds `sdu` (previously the Rhabdoviridae merge).
sdu={**flavi_sdu, **blackfly_sdu}

In [42]:
# Export the merged Flaviviridae SDUc values as a tab-separated table; the
# call also prints the table, as seen in the cell output.
dinuq.dict_to_tsv(sdu, "dinuq/flavi/dinuq_flavi_cds_sdu.tsv")

acc	CpCpos2	CpCbridge	CpGpos1	CpGpos2	CpGbridge	CpUpos1	CpUpos2	CpUbridge	CpApos2	CpAbridge	GpCpos2	GpCbridge	GpGpos2	GpGbridge	GpUpos2	GpUbridge	GpApos2	GpAbridge	UpCpos1	UpCpos2	UpCbridge	UpGpos2	UpGbridge	UpUpos1	UpUpos2	UpUbridge	UpApos2	UpAbridge	ApCpos2	ApCbridge	ApGpos1	ApGpos2	ApGbridge	ApUpos2	ApUbridge	ApApos2	ApAbridge
KP688058.1:89..10300	1.1701074104007743	1.1414994156410336	1.113822381889764	0.9123972644359333	0.9451084579481573	1.0182899048626048	1.0352980993853058	1.2006513071400342	1.0152728397369593	1.3425729673860867	1.0012362728173354	0.9357828782906025	0.7522866984507298	0.887068470730139	0.8222771139840628	1.2386793676578387	1.5301947581724036	1.0049028629693755	1.1028042764830648	1.2575229053038064	1.0243063699525679	1.3433329554186664	1.4269675566221034	0.977824786717184	1.0477911558421549	0.8576595838133089	0.36217885754024237	0.40436474186877763	1.264453453518211	1.0731457232743074	0.8130332274957636	0.9954875359833404	0.7863595069748761	0.7504273678212381	0.8

In [43]:
# Run dinuq.ntcont on the stop-codon-free Flaviviridae CDS file and on the
# blackfly flavivirus file, then merge the results (blackfly wins on
# duplicate keys).
# NOTE(review): blackfly_flavi.fasta is used as-is, without process_fasta —
# confirm that is intended.
nt_gb=dinuq.ntcont("dinuq/flavi/flavi_cds_no_stop.fasta")
nt_blackfly=dinuq.ntcont("dinuq/flavi/blackfly_flavi.fasta")
nt={**nt_gb, **nt_blackfly}

In [44]:
# Specify the output TSV file path and write the merged Flaviviridae ntcont
# table (one row per key of `nt`, first column named "acc").
output_file_path = "dinuq/flavi/flavi_ntcont.tsv"
write_nt_tsv(nt, output_file_path)

Data written to dinuq/flavi/flavi_ntcont.tsv
