In the section below, we pool the sequences against which our gene had a hit (from both the flank-guided and flank-unguided approaches), extract all open reading frames from them (same strand), and check whether our query protein has a hit against any of the resulting proteins.

In [None]:
#First, let's compile both sets of sequences together.
#This cell is safe to re-run: the directory is only created when missing and
#every compiled file is rebuilt from scratch instead of being appended to.

mkdir -p ORFpresence
if cd ORFpresence; then
  #Make a list of query proteins for which we have either flank-guided or unguided DNA sequence hits
  #(base name of each file with its last "_"-delimited piece, i.e. "_blastn.seq.<approach>.faa", removed)
  ls ../flank/*_blastn.seq.*.faa ../regular/*_blastn.seq.*.faa | rev | cut -f1 -d "/" | cut -f2- -d "_" | rev | sort -u > ORFsearch_candidates.txt

  while IFS= read -r i; do
    [ -n "$i" ] || continue
    #Start from an empty file so that a second run does not duplicate every record
    : > "$i"_compiled_redundant.faa
    #A query may have hits from only one of the two approaches; skip the missing file quietly
    for f in ../flank/"$i"_blastn.seq.flank.faa ../regular/"$i"_blastn.seq.regular.faa; do
      [ -f "$f" ] && cat -- "$f" >> "$i"_compiled_redundant.faa #Compile the sequences together
    done
  done < ORFsearch_candidates.txt
else
  #Do not scatter output files over the parent directory
  echo "Could not enter ORFpresence; no sequences were compiled" >&2
fi

#Reduce in size by removing redundant sequences
#I.e., if identical sequences exist for the same taxon of relevance (genomes for intra-pangenome, species or genus names outside) -- only retain one
#
#Both pipelines below work on seqkit's one-record-per-line table (header, sequence), shorten
#"Escherichia_" to "E", keep the first record of every distinct taxon+sequence pair, and write
#FASTA records whose header is ">taxon%rewritten-header".
#The query id is the compiled file name minus its trailing "_compiled_redundant.faa".

for i in $(ls *_compiled_redundant.faa | rev | cut -f3- -d "_" | rev)
do
#Records containing "Ecoli@": the first "_" of the line becomes "@", and the taxon key is built
#from the 2nd and 3rd "@"-separated fields (presumably "Ecoli@<genome id>" -- confirm against the headers).
#This pipeline truncates/creates the targets file.
cat "$i"_compiled_redundant.faa | seqkit fx2tab | sed "s/Escherichia_/E/g" | grep "Ecoli@" | sed "s/_/@/1" | awk -F "@" '{print $2"@"$3"\t"$0}' | awk '{print $2"\t"$1"%"$NF}' | awk '!seen[$2]++' | sed "s/%/ /g" | awk '{print ">"$2"%"$1" "$3}' | sed "s/\t$//g" | sed "s/ /\n/g" > "$i".ORFsearch_targets.faa
#All other records: the taxon key is the 2nd "_"-separated field of the line
#(presumably the genus/species token that follows the flank_/regular_ prefix -- confirm).
#This pipeline appends to the file written just above.
cat "$i"_compiled_redundant.faa | seqkit fx2tab | sed "s/Escherichia_/E/g" | grep -v "Ecoli@" | awk -F "_" '{print $2,$0}' | awk '{print $2,$1"%"$3}' | awk '!seen[$2]++' | sed "s/%/ /g" | awk '{print ">"$2"%"$1" "$3}' | sed "s/\t$//g" | sed "s/ /\n/g" >> "$i".ORFsearch_targets.faa
done

#Now to extract ORFs from these sequences and search the query against them
#
#Query ids are taken from the *.ORFsearch_targets.faa files in the parent directory (../).
#NOTE(review): confirm the working directory matches where those files were written.

for i in $(ls ../*.ORFsearch_targets.faa | cut -f2- -d "/" | rev | cut -f3- -d "." | rev)
do
#Print the sequence in three different frames (same strand) for downstream processing
#Alignment gaps ("-") are removed from the sequences first.
#Frame 0 keeps the original header; frames 1 and 2 drop the first one / two bases and get the
#header prefix "frame1_" / "frame2_".
#NOTE(review): "head -n-2" discards the last two lines, i.e. the final record of the targets file -- confirm this is intended.
seqkit fx2tab ../"$i".ORFsearch_targets.faa | sed "s/\t$//g" | sed "s/^/>/g" | sed "s/\t/\n/g" | awk '/^>/ {print} !/^>/ {gsub(/-/, ""); print}' | head -n-2 > "$i".ORFsearch_targets.frames.faa
seqkit fx2tab ../"$i".ORFsearch_targets.faa | sed "s/\t$//g" | sed "s/^/>/g" | sed "s/\t/\n/g" | awk '/^>/ {print} !/^>/ {gsub(/-/, ""); print}' | head -n-2 | awk '/^>/ {print} !/^>/ {print substr($0, 2)}' | sed "s/>/>frame1_/g" >> "$i".ORFsearch_targets.frames.faa
seqkit fx2tab ../"$i".ORFsearch_targets.faa | sed "s/\t$//g" | sed "s/^/>/g" | sed "s/\t/\n/g" | awk '/^>/ {print} !/^>/ {gsub(/-/, ""); print}' | head -n-2 | awk '/^>/ {print} !/^>/ {print substr($0, 3)}' | sed "s/>/>frame2_/g" >> "$i".ORFsearch_targets.frames.faa
#Now identify all open reading frames using this custom code
#How it works:
# 1. every sequence is cut into space-separated triplets;
# 2. start codons ATG/TTG/GTG become the single letters M/L/V and stop codons TAA/TGA/TAG become Z;
# 3. a Z is appended to each sequence, so an ORF that runs to the end of the sequence is still reported;
# 4. each stretch "start letter ... first following Z" is printed as one ORF and the scan resumes after it,
#    so ORFs of one frame never overlap;
# 5. ORFs are numbered (cat -n) and named ">N_ORF_<original header>"; M/L/V are turned back into codons.
#NOTE(review): the codon-replacing seds and the space-stripping sed are not restricted to sequence
#lines, so a header containing one of these triplets or a space is altered as well.
#NOTE(review): the Z that ends each ORF is not converted back, so every ORF written here ends in "Z".
awk '/^>/ {print} !/^>/ {gsub(/.{3}/, "& "); print}' "$i".ORFsearch_targets.frames.faa | sed "s/ATG/M/g" | sed "s/TTG/L/g" | sed "s/GTG/V/g" | sed "s/TAA/Z/g" | sed "s/TGA/Z/g" | sed "s/TAG/Z/g" | sed "s/ //g" | awk '/^>/ {print} !/^>/ {print $0 "Z"}' | awk '/^>/ {header = $0} !/^>/ {
    while(match($0, /[MLV][^Z]*Z/)) {
        orf = substr($0, RSTART, RLENGTH)
        print header ": " orf
        $0 = substr($0, RSTART + RLENGTH)  # Remove found ORF and continue searching
    }
}' | cat -n | sed "s/^ *//g" | sed "s/\t>/_ORF_/g" | sed "s/^/>/g" | sed "s/: /\n/g" |
awk '/^>/ {print} !/^>/ {gsub("M", "ATG"); gsub("L", "TTG"); gsub("V", "GTG"); print}' > "$i".ORFsearch_targets.ORFs.faa
#Translate the ORFs into proteins
/stor/work/Ochman/hassan/tools/faTrans "$i".ORFsearch_targets.ORFs.faa "$i".ORFsearch_targets.ORFs.prot.faa #Protein version of every extracted ORF
#Get the query sequence and translate it
#The query record is the line matching "<id>(" plus the line after it (grep -A1), stored in the scratch file "temp"
echo $i | sed "s/$/(/g" | grep --no-group-separator -A1 -f - ../../all_genes_of_interest.CDS.faa > temp
#NOTE(review): "-stop" presumably ends the translation at the first stop codon -- confirm against the faTrans usage text
/stor/work/Ochman/hassan/tools/faTrans -stop temp "$i".query.prot.faa #Query gene
#Finally, run the fasta36 (ssearch) search
#-m 8 requests tabular output; the post-processing step reads columns 2, 7, 8 and 11 of this table
/stor/work/Ochman/hassan/tools/fasta-36.3.8i/bin/ssearch36 -m 8 "$i".query.prot.faa "$i".ORFsearch_targets.ORFs.prot.faa > "$i".results.ssearch.tab
done

#Process the results to get a list of all genomes (or taxa for extra-pangenome) which has a matching ORF, either full or partial
#Hits need an e-value below 0.001. Hits covering roughly 60% or more of the query go to
#ORF_presence.eval001.tsv, shorter ones to ORF_partial.eval001.tsv. For each query the
#pangenome genomes (names containing "@") are written before the other taxa.

#Print "<query>\t<taxon>" for the hits of one query.
#  $1 query id   $2 query protein length   $3 awk test on coverage ($NF) and e-value ($11)
#  $4 awk match operator applied to "@" (~ keeps genomes, !~ keeps the other taxa)
orf_hit_taxa() {
  awk -F '\t' -v var="$2" '{cov = ($8 - $7 + 1) / var; print $0 "\t" cov}' "$1".results.ssearch.tab |
    awk -F '\t' "$3" |
    cut -f2 |
    sed -e "s/frame1_//g" -e "s/frame2_//g" |
    cut -f3- -d "_" |
    awk '($1'"$4"'"@")' |
    cut -f1 -d "%" |
    sed "s/^/${1}\t/g"
}

for gene in $(ls ../*.ORFsearch_targets.faa | cut -f2- -d "/" | rev | cut -f3- -d "." | rev); do
  qlen=$(seqkit fx2tab "$gene".query.prot.faa | awk '{print length($2)}')
  for op in '~' '!~'; do
    orf_hit_taxa "$gene" "$qlen" '($NF>0.59999&&$11<0.001)' "$op" >> ORF_presence.eval001.tsv
  done
  for op in '~' '!~'; do
    orf_hit_taxa "$gene" "$qlen" '($NF<0.6&&$11<0.001)' "$op" >> ORF_partial.eval001.tsv
  done
done

#These files will be required eventually for purposes of compiling results

I need to do one more search to rule out ORFans: a search against nr.dmnd.

I don't trust this database: it contains contaminated or otherwise unreliable records. So I will use it only to rule out false positives, and only after the other approaches have been applied.

In [None]:
#Search the step2 candidates against nr with DIAMOND and keep the taxonomy of every hit.
#Columns of step2genus_vs_nr_withtax.tsv follow the --outfmt list below; the filter uses
#  column 5 = qcovhsp, column 16 = evalue, column 19 = sscinames
/stor/work/Ochman/hassan/E.coli_ORFan/E.coli_ORFan_pipeline_8-10/diamond blastp \
  -q step2_genusspecific_ORFan.replaced.faa \
  -d /stor/scratch/Ochman/hassan/nr/03272024_nr.dmnd \
  --taxonmap /stor/scratch/Ochman/hassan/nr/prot.accession2taxid.FULL \
  --taxonnames /stor/scratch/Ochman/hassan/nr/names.dmp \
  --taxonnodes /stor/scratch/Ochman/hassan/nr/nodes.dmp \
  --outfmt 6 qseqid sseqid pident nident qcovhsp length mismatch gapopen gaps qstart qend sstart send qlen slen evalue bitscore staxids sscinames \
  --ultra-sensitive \
  --out step2genus_vs_nr_withtax.tsv \
  -k 0 -b8 -c1

#A candidate is removed from the list when, among its hits with query coverage > 60 and
#e-value < 0.001, more than one distinct taxon label remains after discarding the labels
#below (Escherichia/Shigella, phage, and labels that carry no usable taxonomy).
#The "!~" tests are regular-expression (substring) matches, the "!=" tests are exact.
awk -F '\t' '
  $5 > 60 && $16 < 0.001 &&
  $19 !~ "Escherichia coli" &&
  $19 !~ "Shigella" &&
  $19 !~ "Escherichia sp." &&
  $19 != "N/A" &&
  $19 !~ "Enterobacteriaceae" &&
  $19 != "Escherichia" &&
  $19 != "Caudoviricetes sp." &&
  $19 != "unclassified Escherichia" &&
  $19 != "Bacteriophage sp." &&
  $19 != "uncultured bacterium" &&
  $19 != "Klebsiella pneumoniae IS22" &&
  $19 != "Enterobacterales" &&
  $19 != "Escherichia;uncultured bacterium" &&
  $19 != "Enterobacter sp. EC-NT1" &&
  $19 != "Escherichia;Caudoviricetes sp." &&
  $19 != "Salmonella sp. S13" &&
  $19 != "Escherichia;Salmonella sp. S13" &&
  $19 !~ "phage"
' step2genus_vs_nr_withtax.tsv |
  cut -f1,19 | sort -u | cut -f1 | sort | uniq -c | awk '($1>1)' | rev |
  cut -f1 -d " " | rev | cut -f1 -d "(" |
  #-x makes every name match a whole line. Without it the fixed-string match is a substring
  #match, so excluding "gene1" would silently drop "gene10" as well.
  #NOTE(review): this relies on the step2 list holding one bare gene name per line (no "(+)" suffix).
  grep -v -x -F -f - step2_genusspecific_ORFan.replaced.txt > step3_genusspecific_ORFan.replaced.txt

In [None]:
# @title
#sort -k1 ORF_presence.tsv -o ORF_presence.tsv
#for i in $(ls ../*.mafft_input.faa | cut -f2- -d "/" | rev | cut -f3- -d "." | rev)
#do
#echo -n $i " " && grep "$i" ../"$i".mafft_input.faa | tr -d ">"
#done | sort -k1 | join -1 1 -2 1 ORF_presence.tsv - | awk '{print $3"\t"$2}' > ORF_approach_presence.tsv #I guess this was required to fix differences in query name

#Exclude non-ORFans from OG list:
#every gene with an ORF-approach hit whose line mentions none of the Escherichia names below
#(nor "Enterobacteriaceae") is removed from the step1 list; "(" is appended to each name so
#that the fixed-string match stops at the end of the gene name.
grep -E -v "Ecoli|Ealbertii|Efergu|Eruysi|Ewhit|Emarmot" alignments/ORF_approach/ORF_approach_presence.tsv |
  grep -v "Enterobacteriaceae" |
  cut -f1 -d "(" |
  sort -u |
  sed "s/$/(/g" |
  grep -v -F -f - step1_genusspecific_ORFan.replaced.txt |
  cut -f1 -d "(" |
  sort -u > step2_genusspecific_ORFan.replaced.txt

#nr blast:

/stor/work/Ochman/hassan/E.coli_ORFan/E.coli_ORFan_pipeline_8-10/diamond blastp \
  -q step1_genusspecific_ORFans.faa \
  -d /stor/scratch/Ochman/hassan/nr/03272024_nr.dmnd \
  --outfmt 6 qseqid sseqid pident nident qcovhsp length mismatch gapopen gaps qstart qend sstart send qlen slen evalue bitscore \
  --ultra-sensitive \
  --out all_proteins_vs_GBRS_annotated.tsv \
  -k 0 -b8 -c1

#Print the fasta36 hits of every query with e-value < 0.001 and coverage > 0.6
for gene in $(ls ../*.mafft_input.faa | cut -f2- -d "/" | rev | cut -f3- -d "." | rev); do
  #The results file read below was produced by:
  #/stor/work/Ochman/hassan/tools/fasta-36.3.8i/bin/fasta36 -m 9c "$gene".query.prot.faa "$gene".mafft_input.searchtarget.prot.faa > "$gene".results.2.tab
  qlen=$(seqkit fx2tab "$gene".query.prot.faa | awk '{print length($2)}')
  awk '($22~"=")' "$gene".results.2.tab |
    awk '{print $1,$6,$7,$11,$12,$19}' |
    awk -v var="$qlen" '{cov = ($5 - $4 - $NF + 1) / var; print $0" "cov}' |
    awk '($2<0.001&&$NF>0.6)' |
    sed -e "s/Escherichia_/E/g" -e "s/flank_/flank\t/g" -e "s/regular_/regular\t/g"
done

### Compiling all data together

In [None]:
#Files that report presence:
#protein-search hits plus ORF-approach hits, with "Escherichia_" shortened to "E" and the strand tags removed

cat /stor/work/Ochman/hassan/MS_Ecoli_ORFans_Ch3/rethinking_clustering/all_genes_of_interest.presence.tsv \
    /stor/work/Ochman/hassan/MS_Ecoli_ORFans_Ch3/rethinking_clustering/alignments/ORF_approach/ORF_approach_presence.tsv |
  sed -e "s/Escherichia_/E/g" -e "s/(+)//g" -e "s/(-)//g" |
  sort -u > presence.tsv

#Files that report putative non-coding homolog:
#headers of the alignment inputs, reduced to "<query>\t<taxon>"; headers ending in ")" are skipped
grep "^>" alignments/*mafft_input.faa |
  cut -f2- -d "/" |
  sed -e "s/:/\t/1" -e "s/_flank/\t/g" -e "s/_regular/\t/g" |
  grep -v ")$" |
  cut -f1,2 |
  sed "s/.mafft_input.faa\t>/\t/g" |
  sort -u > putative_noncoding.tsv

#Files that report presence of flanks:
grep " " flank/flank_alternative_111025/*final.txt |
  cut -f3- -d "/" |
  sed -e "s/:/\t/1" -e "s/_compiled_intervalinfo.taxa.final.txt//g" -e "s/ /\t/g" |
  cut -f1,3 |
  sed "s/Escherichia_/E/g" > flank_presence.tsv

#Print one "<gene>\t<lineage>\t<status>" line per gene/lineage pair. The status is the first of
#present / noncoding / flanks whose table lists the pair, and "nontraceable" when none does.

#Succeeds when table $1 has a row whose first two columns are $2 and $3
pair_listed() {
  awk -v a="$2" -v b="$3" '$1==a && $2==b' "$1" | grep -q .
}

for i in $(cut -f1 -d "(" step1_genusspecific_ORFan.replaced.txt); do
  for j in $(cat alignments/all_lineages_ordered.txt); do
    status=nontraceable
    if pair_listed presence.tsv "$i" "$j"; then
      status=present
    elif pair_listed putative_noncoding.tsv "$i" "$j"; then
      status=noncoding
    elif pair_listed flank_presence.tsv "$i" "$j"; then
      status=flanks
    fi
    echo -e "$i\t$j\t$status"
  done
done

#Print a stand-alone shell snippet for gene $1. When run, the snippet classifies the gene in
#every lineage of alignments/all_lineages_ordered.txt as present / noncoding / flanks /
#nontraceable and appends the result to "<gene>.matrix.input.tsv".
#The gene name is filled in now; "$j" and the "\t" escapes are left for the generated script.
emit_matrix_commands() {
  local gene=$1
  printf '%s\n' \
    'for j in $(cat alignments/all_lineages_ordered.txt); do' \
    "  if awk -v a=\"$gene\" -v b=\"\$j\" '\$1==a && \$2==b' presence.tsv | grep -q .; then" \
    "    echo -e \"$gene\t\$j\tpresent\"" \
    "  elif awk -v a=\"$gene\" -v b=\"\$j\" '\$1==a && \$2==b' putative_noncoding.tsv | grep -q .; then" \
    "    echo -e \"$gene\t\$j\tnoncoding\"" \
    "  elif awk -v a=\"$gene\" -v b=\"\$j\" '\$1==a && \$2==b' flank_presence.tsv | grep -q .; then" \
    "    echo -e \"$gene\t\$j\tflanks\"" \
    "  else" \
    "    echo -e \"$gene\t\$j\tnontraceable\"" \
    "  fi" \
    "done >> \"$gene.matrix.input.tsv\""
  #The last line used to be single-quoted, so it was emitted as the literal text
  #  done >> \"$i\".matrix.input.tsv
  #and the generated script wrote to a file named "".matrix.input.tsv instead of the gene's file.
}

#Only the first gene of the step1 list is used here (head -1)
for i in $(cut -f1 -d "(" step1_genusspecific_ORFan.replaced.txt | head -1); do
  emit_matrix_commands "$i"
done

# **Compilation**

First, compile all genomes where the gene is present:

In [None]:
#From protein blasts, pangenome:
#(both presence/absence tables are joined to their taxa table on column 2; the first command starts presence.tsv afresh)
sort -k2 all_genes_of_interest.pangenome.protein.presenceabsence.tsv |
  join -1 2 -2 2 - /stor/scratch/Ochman/hassan/100724_Complete_Genomes/Ecoli_pangenome_genome_protein_taxa.tsv |
  awk 'BEGIN{OFS="\t"}{print $2,$NF,$3}' |
  sort -u > presence.tsv
sort -k2 all_genes_of_interest.pangenome.contig.presenceabsence.tsv |
  join -1 2 -2 2 - /stor/scratch/Ochman/hassan/100724_Complete_Genomes/Ecoli_pangenome_genome_contig_taxa.tsv |
  awk 'BEGIN{OFS="\t"}{print $2,$NF,$3}' |
  sort -u >> presence.tsv

#From protein blasts, non-pangenome:
#(rows whose second column has no "@")
awk '($2!~"@")' /stor/work/Ochman/hassan/MS_Ecoli_ORFans_Ch3/rethinking_clustering/all_genes_of_interest.presence.tsv |
  sort -u |
  sed "s/Escherichia_/E/g" >> presence.tsv

#From ORF search:
#Pangenome rows: the text after "Ecoli@<digits>_" becomes a third column, which is then replaced
#by the first key of 450_lineage_designations.tsv (column 1) that contains it as a substring.
awk -F '\t' '($2~"@")' /stor/work/Ochman/hassan/MS_Ecoli_ORFans_Ch3/rethinking_clustering/alignments/ORF_approach_111725/ORF_presence.eval001.tsv |
  sed -E 's/(Ecoli@[0-9]+)_/\1\t/g' |
  awk 'NR==FNR{map[$1]=$1;next}{for(k in map) if(index(k,$3)){ $3=map[k];break } print}' 450_lineage_designations.tsv - |
  sort -u >> presence.tsv
#Rows for the other taxa are copied as they are
awk -F '\t' '($2!~"@")' /stor/work/Ochman/hassan/MS_Ecoli_ORFans_Ch3/rethinking_clustering/alignments/ORF_approach_111725/ORF_presence.eval001.tsv |
  sort -u >> presence.tsv

#Massage:
#(drop strand tags, de-duplicate, shorten "Escherichia_", turn spaces into tabs)
sed -e "s/(+)//g" -e "s/(-)//g" presence.tsv |
  sort -u |
  sed -e "s/Escherichia_/E/g" -e "s/ /\t/g" > temp && mv temp presence.tsv

NameError: name 'all_genes_of_interest' is not defined

Next, compile all genomes where the gene is partially present/has a noncoding status:

In [None]:
#Every case where blastn following synteny guided or unguided status had a hit:

#Headers of all compiled sequence files as "<file>\t<header>", without the flank_/regular_ prefix
#and with "Escherichia_" shortened to "E"
compiled_headers() {
  grep "^>" alignments/*_compiled_redundant.faa |
    cut -f2- -d "/" |
    sed -e "s/:/\t/1" -e "s/>flank_//g" -e "s/>regular_//g" -e "s/Escherichia_/E/g"
}

#Pangenome genomes: keep the first three "@"-separated pieces and split after "Ecoli@<digits>_"
compiled_headers |
  grep "Ecoli@" |
  cut -f-3 -d "@" |
  sed "s/_compiled_redundant.faa//g" |
  sed -E 's/(Ecoli@[0-9]+)_/\1\t/g' >> putative_noncoding.tsv
#Other taxa: keep the query and the part of the header before its first "_"
compiled_headers |
  grep -v "Ecoli@" |
  sed "s/_compiled_redundant.faa//g" |
  awk '{ split($2, a, "_"); print $1, a[1] }' |
  cut -f2 |
  sort -u |
  sed "s/ /\t/g" >> putative_noncoding.tsv

#All the partial hits:
sort -u all_genes_of_interest.presencepartial.tsv |
  sed "s/ /\t/g" >> putative_noncoding.tsv

#Massage:
sed -e "s/(+)//g" -e "s/(-)//g" putative_noncoding.tsv |
  sort -u |
  sed -e "s/Escherichia_/E/g" -e "s/ /\t/g" > temp && mv temp putative_noncoding.tsv

#Flanks:

#Lines of the flank result files that contain a space, as tab-separated "<query>\t<col2>\t<col3>..."
flank_rows() {
  grep " " flank/flank_alternative_111025/*final.txt |
    cut -f3- -d "/" |
    sed -e "s/:/\t/1" -e "s/_compiled_intervalinfo.taxa.final.txt//g" -e "s/ /\t/g"
}

#Rows whose third column contains "@": reorder to query, column 3, column 2 and cut the line at its last "@"
flank_rows |
  awk -F '\t' '($3~"@")' |
  awk -F '\t' '{print $1,$3,$2}' |
  rev | cut -f2- -d "@" | rev |
  sed "s/ /\t/g" |
  sort -u > flank_presence.tsv
#Remaining rows: query and column 3 only
flank_rows |
  awk -F '\t' '($3!~"@")' |
  cut -f1,3 |
  sed "s/Escherichia_/E/g" >> flank_presence.tsv

PREPARE THE FIGURE

In [None]:
#To generate the figure, we only focus on a reduced set of genes that are species-specific.

#To this end, we collapse the gene families further using a 60/60 cutoff
#Let's do an all-vs-all blast, followed by Silix-based clustering:

/stor/work/Ochman/hassan/E.coli_ORFan/E.coli_ORFan_pipeline_8-10/diamond makedb \
  --in step3_genusspecific_ORFan.replaced.faa \
  --db step3_genusspecific_ORFan.replaced
/stor/work/Ochman/hassan/E.coli_ORFan/E.coli_ORFan_pipeline_8-10/diamond blastp \
  -q step3_genusspecific_ORFan.replaced.faa \
  -d step3_genusspecific_ORFan.replaced \
  --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore \
  --ultra-sensitive \
  --out step3_genusspecific_ORFan.replaced.silixinput.tsv \
  -k 0 -b8 -c1

#Silix clustering:
/stor/work/Ochman/hassan/tools/silix-1.3.0/src/silix -f cluster_ -i 0.6 -r 0.6 -q -2 -s 3 \
  step3_genusspecific_ORFan.replaced.faa step3_genusspecific_ORFan.replaced.silixinput.tsv |
  sort -k2 > silix_output.step3.tsv

#Extract the longest gene from each cluster to be the representative:
#(sequence lengths are joined to the cluster table, sorted by cluster and decreasing length,
# and the first gene of every cluster is pulled back out of the FASTA file)
seqkit fx2tab step3_genusspecific_ORFan.replaced.faa |
  awk '{print $1,length($2)}' |
  sort -k1 |
  join -1 1 -2 2 - silix_output.step3.tsv |
  sort -k3,3 -k2,2nr |
  awk '!seen[$3]++' |
  cut -f1 -d " " |
  grep --no-group-separator -A1 -F -f - step3_genusspecific_ORFan.replaced.reduced.faa.src 2>/dev/null \
    step3_genusspecific_ORFan.replaced.faa > step3_genusspecific_ORFan.replaced.reduced.faa

#Only retain species-specific genes:
#(representatives listed in presence.tsv for a taxon without "@" are dropped; the headers that
# remain are used to pull the records out of the reduced FASTA file)
awk '($2!~"@")' presence.tsv |
  grep -F -f <(grep "^>" step3_genusspecific_ORFan.replaced.reduced.faa | tr -d ">" | cut -f1 -d "(") - |
  cut -f1 |
  sort -u |
  grep -v -F -f - step3_genusspecific_ORFan.replaced.reduced.faa |
  grep "^>" |
  grep --no-group-separator -A1 -F -f - step3_genusspecific_ORFan.replaced.reduced.faa > step3_speciesspecific_ORFan.replaced.reduced.faa

#For every species-specific representative, print a shell snippet that classifies the gene in each
#genome of ../450_lineage_designations.tsv (present / noncoding / flanks / absent) and appends
#"<genome>\t<lineage>\t<status>" lines to "<gene>.matrix.input.tsv".
#Only the gene name is expanded here; everything written as \$ reaches the snippet as a plain $.
for i in $(grep "^>" step3_speciesspecific_ORFan.replaced.reduced.faa | tr -d ">" | cut -f1 -d "("); do
  cat <<EOF
for j in \$(cut -f1 -d " " ../450_lineage_designations.tsv); do
  lineage=\$(awk -v key="\$j" '\$1==key {print \$2}' ../450_lineage_designations.tsv)
  if awk -v a="$i" -v b="\$j" '\$1==a && \$3==b' presence.tsv | grep -q .; then
    echo -e "\$j\t\$lineage\tpresent"
  elif awk -v a="$i" -v b="\$j" '\$1==a && \$3==b' putative_noncoding.tsv | grep -q .; then
    echo -e "\$j\t\$lineage\tnoncoding"
  elif awk -v a="$i" -v b="\$j" '\$1==a && \$3==b' flank_presence.tsv | grep -q .; then
    echo -e "\$j\t\$lineage\tflanks"
  else
    echo -e "\$j\t\$lineage\tabsent"
  fi
done >> "${i}.matrix.input.tsv"
EOF
done

#Bring in the lineage ordering table (column 1 is used as the rank and column 2 as the lookup key below)
cp ../species_specific_ORFans_deepdive/step1_lineage.tsv .

#CONTINUE FROM HERE:

#Turn every "%" in the matrix files into a tab (edits the files in place)
sed -i "s/%/\t/g" *matrix.input.tsv

#One pass per gene; the gene id is the file name minus ".matrix.input.tsv"
for i in $(ls *matrix.input.tsv | rev | cut -f4- -d "." | rev)
do
#Prefix every row with the zero-padded rank of its third column, or 999 when that value is not in step1_lineage.tsv
awk -F'\t' 'NR==FNR { order[$2]=$1; next } { key=($3 in order)?order[$3]:999; printf "%03d\t%s\n", key, $0 }' step1_lineage.tsv "$i".matrix.input.tsv > "$i".step2_prefixed.tsv
#Sort by rank, drop the rank and the next two columns, then group what is left by its first field
#(in first-seen order) and join the second fields with commas. The result is a single line
#"<gene>,key(v1,v2,...),key(...)" appended to the csv.
#NOTE(review): the csv is appended to (>>), so running this loop twice duplicates every gene.
sort -k1,1n -k2 "$i".step2_prefixed.tsv | cut -f2- | cut -f3- | awk '{ if (!seen[$1]++) order[++n]=$1; vals[$1]=vals[$1]?vals[$1]","$2:$2 } END { for(i=1;i<=n;i++){k=order[i]; printf "%s(%s)%s", k, vals[k], (i<n?",":"\n") } }' | sed "s/^/"$i",/g" >> species_specific_distributions.allgenomes.transformed.csv
done

#Replace every "lineage(status,status,...)" entry by "lineage(present,noncoding,flanks)" counts.
#The seds first re-delimit each line so that commas only separate the gene name and the lineage
#entries, while the statuses inside the parentheses are joined by "%".
sed -e "s/,/\t/1" -e "s/),/)\t/g" -e "s/,/%/g" -e "s/\t/,/g" species_specific_distributions.allgenomes.transformed.csv |
awk -F ',' '
# half of a count, rounded up
function half_up(n) { return int((n + 1) / 2) }

{
  row = $1
  for (col = 2; col <= NF; col++) {
    entry = $col

    # leading digits of the entry = lineage number
    match(entry, /^[0-9]+/)
    lineage = substr(entry, RSTART, RLENGTH)

    # text between the first "(" and the last character of the entry
    lparen = index(entry, "(")
    statuses = substr(entry, lparen + 1, length(entry) - lparen - 1)

    # gsub returns the number of replacements, i.e. the number of occurrences
    present   = gsub(/present/,   "", statuses)
    noncoding = gsub(/noncoding/, "", statuses)
    flanks    = gsub(/flanks/,    "", statuses)

    # lineage 14 is halved (rounding up) and capped: above 10 in total, one noncoding is removed
    if (lineage == "14") {
      present   = half_up(present)
      noncoding = half_up(noncoding)
      flanks    = half_up(flanks)
      if (present + noncoding + flanks > 10) {
        noncoding--
        if (noncoding < 0) noncoding = 0
      }
    }

    row = row sprintf(",%s(%d,%d,%d)", lineage, present, noncoding, flanks)
  }
  print row
}' > species_specific_distributions.allgenomes.transformed.elegantcode.1.csv

#Append to every gene line the sum of its first counts (present), the sum of its second counts
#(noncoding) and the total of both, then sort by decreasing present sum.

#Append "\t<sum of the $1-th number of every parenthesised group>" to each input line
add_group_sum() {
  awk -F'[()]' -v pos="$1" '{total=0; for(c=2;c<=NF;c+=2){split($c,part,","); total+=part[pos]} print $0 "\t" total}'
}

add_group_sum 1 < species_specific_distributions.allgenomes.transformed.elegantcode.1.csv |
  add_group_sum 2 |
  awk '{print $0,$2+$3}' |
  sort -nrk2 > species_specific_distributions.allgenomes.transformed.elegantcode.2.geneprofilemapping.csv

#For every distinct profile (gene name removed, ordered by decreasing present sum) print, comma
#separated, the numbers captured by regular expression $1 from each lineage group
profile_counts() {
  cut -f2- -d "," species_specific_distributions.allgenomes.transformed.elegantcode.2.geneprofilemapping.csv |
    sort -u |
    sort -nrk2 |
    cut -f1 |
    awk '{out=""; while(match($0, /'"$1"'/, m)){ out = (out=="" ? m[1] : out","m[1]); $0 = substr($0, RSTART+RLENGTH) } print out }'
}

#first number of each group (present)
profile_counts '\(([0-9]+),' > allgreencounts.csv

#middle number of each group (noncoding)
profile_counts ',([0-9]+),' > allredcounts.csv

#last number of each group (flanks)
profile_counts ',([0-9]+)\)' > allyellowcounts.csv