Skip to content

Commit

Permalink
Merge pull request #392 from Gaius-Augustus/clamsa_OE
Browse files Browse the repository at this point in the history
Clamsa oe
  • Loading branch information
MarioStanke committed Jun 23, 2023
2 parents eac2a61 + fec66d3 commit 8c92452
Show file tree
Hide file tree
Showing 23 changed files with 1,706 additions and 627 deletions.
1 change: 1 addition & 0 deletions config/cgp/log_reg_parameters_default.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
/CompPred/exon_score13 -2.9764014 # conservation * diversity
/CompPred/exon_score14 -1.4184841 # omega * diversity
/CompPred/exon_score15 5.0224288 # number of species involved in this ortho exon divided by clade size
/CompPred/exon_score17 0 # ClamsaScore

# intron scores

Expand Down
62 changes: 62 additions & 0 deletions config/parameters/aug_cmdln_parameters.json
Original file line number Diff line number Diff line change
Expand Up @@ -1901,5 +1901,67 @@
"usage": "--UTR=true/false",
"default_value": "False",
"description": "Predict the untranslated regions in addition to the coding sequence. This works only for a subset of species for which an UTR model was trained."
},
{
"name": "/CompPred/clamsa",
"development": true,
"type": "bool",
"usage": "--/CompPred/clamsa=true/false",
"description": "Whether ClaMSA is used to comparatively score HECTs rather than dN/dS.",
"default_value": "3",
"exclude_apps": [
"etraining"
]
},
{
"name": "/CompPred/clamsa_numModels",
"development": true,
"type": "int",
"usage": "--/CompPred/clamsa_numModels=k",
"default_value": "3",
"description": "The number of codon rate matrices for ClaMSA.",
"exclude_apps": [
"etraining"
]
},
{
"name": "/CompPred/clamsa_Q",
"development": true,
"type": "string",
"usage": "--/CompPred/clamsa_Q=rates-Q.txt",
"description": "The path to the file that includes the rate matrices for ClaMSA.",
"exclude_apps": [
"etraining"
]
},
{
"name": "/CompPred/clamsa_pi",
"development": true,
"type": "string",
"usage": "--/CompPred/clamsa_pi=rates-pi.txt",
"description": "The path to the file that includes the codon equilibrium distribution for ClaMSA.",
"exclude_apps": [
"etraining"
]
},
{
"name": "/CompPred/clamsa_Theta",
"development": true,
"type": "string",
"usage": "--/CompPred/clamsa_Theta=Theta.txt",
"description": "The path to the file that includes the logistic regression weight matrix for ClaMSA.",
"exclude_apps": [
"etraining"
]
},
{
"name": "/CompPred/clamsa_biases",
"development": true,
"type": "string",
"usage": "--/CompPred/clamsa_biases=biases.txt",
"description": "The path to the file that includes the logistic regression biases for ClaMSA.",
"exclude_apps": [
"etraining"
]
}
]
12 changes: 6 additions & 6 deletions include/codonevo.hh
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,13 @@ public:
int &subst); // output variables
// store log likeliehoods for all omegas for one codon tuple

vector<double> loglikForCodonTuple(vector<string> &seqtuple, PhyloTree *ctree){
int i = 0;
return loglikForCodonTuple(seqtuple, ctree, NULL, i);
}
vector<double> loglikForCodonTuple(vector<string> &seqtuple, PhyloTree *ctree, PhyloTree *tree, int &subs);
vector<double> loglikForCodonTuple(vector<string> &seqtuple, PhyloTree *ctree){
int i = 0;
return loglikForCodonTuple(seqtuple, ctree, NULL, i);
}
vector<double> loglikForCodonTuple(vector<string> &seqtuple, PhyloTree *ctree, PhyloTree *tree, int &subs);
// for GRK proposal
double graphOmegaOnCodonAli(vector<string> &seqtuple, PhyloTree *tree, int refSpeciesIdx = 0);
double graphOmegaOnCodonAli(vector<string> &seqtuple, PhyloTree *tree, int refSpeciesIdx = 0);
/*
* add new branch length b
* this function is currently not needed, since the pruning algorithm does change the phylogenetic tree
Expand Down
101 changes: 101 additions & 0 deletions include/codonevodiscr.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*
* codonevo.hh
*
* License: Artistic License, see file LICENSE.TXT or
* https://opensource.org/licenses/artistic-license-1.0
*/

#ifndef _CODONEVODISCR_HH
#define _CODONEVODISCR_HH

#include "contTimeMC.hh"
#include "geneticcode.hh"

enum MatrixFormat {fmt_GSL, fmt_TXT};

//forward declarations
class PhyloTree;

/**
* @brief class CodonEvoDiscr
* @details Computes and stores 64x64 codon substitution probability matrices
* P(t,omega,kappa,pi) = exp(Q(omega,kappa,pi)*t)
* for a sample of values for t and omega (dN/dS, selection)
* and a fixed given value of kappa (transition/transversion ratio)
* and vector pi (codon usage).
* Matrices are precomputed and retrieval of P(t,omega) is then a constant time lookup
* as this is needed very often during estimation of omega for a given tree.
*
* @author Mario Stanke
* @author Stefanie König
* adapted by Giovanna Migliorelli
* */

// clamsa related code
class CodonEvoDiscr : public Evo {
public:
CodonEvoDiscr() : Evo(64), M(0), Theta(NULL), bias(NULL), w(NULL) {};
~CodonEvoDiscr();

/*
* in the discriminative settings, k stands for the number of models matrices were computed on the base of
*/
void setM();
int getM(){ return M;}
void writeRateMatrices(string filename); // serialize to file
void readMatrices(MatrixFormat fmt = fmt_TXT);

void computeLogPmatrices(); // precomputes and stores the array of matrices

/*
* Returns a pointer to the 64x64 substitution probability matrix
* for which t and omega come closest to scored values.
*/
gsl_matrix *getSubMatrixLogP(double omega, double t);

/*
* Computes the substitution probability
* P( X(t)=to | X(0)=from, omega), where 1 <= from,to <= 64 are codons
* If you want to call this for many values of 'from' and 'to', use rather getSubMatrixP above.
*/
double subLogProb(int from, int to, double omega, double t){
gsl_matrix *P = getSubMatrixLogP(omega, t);
return gsl_matrix_get(P, from, to);
}
/*
* log likelihood of exon candidate pair e1, e1 (required: equal number of codons, gapless)
* Used for testing. The pruning algorithm requires tuples of codons instead.
*/
double logLik(const char *e1, const char *e2, int u, double t); // output variables

// store log likeliehoods for all omegas for one codon tuple
vector<double> loglikForCodonTuple(vector<string> &seqtuple, PhyloTree *ctree){
int dummysubs = 0;
return loglikForCodonTuple(seqtuple, ctree, NULL, dummysubs);
}

/* loglikForCodonTuple
* @param[in] seqtuple input codon alignment with as many rows as ctree has leaves
* @param[in] ctree codon scaled tree
* @param[in] tree optionally a second tree for running the Fitch alg
* @return the a vector of k log likelihoods
*/
vector<double> loglikForCodonTuple(vector<string> &seqtuple, PhyloTree *ctree, PhyloTree *tree, int &subs);
/*
* add new branch length b
* this function is currently not needed, since the pruning algorithm does change the phylogenetic tree
*/
void addBranchLength(double b){}

void produceComb(vector<vector<int> >& config){}
double getProb(const vector<double> &lls);

private:
int M; // number of rate matrices for which P's are stored
vector<double*> piArr; // differently from the case of omega here each model M has its own pi
gsl_matrix *Theta; // M x 2 matrix from ClaMSA logreg layer
double *bias; // vector of 2 biases from ClaMSA logreg layer
double *w; // weight vector summarizing equivalently Theta and bias
};

#endif // _CODONEVODISCR_HH
28 changes: 13 additions & 15 deletions include/compgenepred.hh
Original file line number Diff line number Diff line change
Expand Up @@ -38,24 +38,22 @@ bool shiftGFF(string filename);
* @author Stefanie Koenig
*/
class CompGenePred {
public:
CompGenePred();
~CompGenePred() { delete rsa;}
public:
CompGenePred();
~CompGenePred() { delete rsa;}

void start();
void runPredictionOrTest();
void start();
void runPredictionOrTest();

#ifdef TESTING
// helpers for testing
void postprocTest();
bool readInterval(string filename, list<tuple<string,int,int> >& grlist);
#endif

RandSeqAccess *rsa;
PhyloTree tree;


#ifdef TESTING
// helpers for testing
void postprocTest();
bool readInterval(string filename, list<tuple<string,int,int> >& grlist);
#endif

RandSeqAccess *rsa;
PhyloTree tree;
void test_2(double ctree_scaling_factor);
};

#endif // _COMPGENEPRED_HH
Expand Down
89 changes: 60 additions & 29 deletions include/geneMSA.hh
Original file line number Diff line number Diff line change
Expand Up @@ -28,30 +28,6 @@
class OrthoGraph;
string printRFC(vector<int>);

/* store cumulative sums of log likelihoods for each omega (optional: cumulative sum of number of substitutions)
* we create an object of cumValues for each bit_vector and reading frame combination
*/
struct cumValues{
vector<double> logliks;
int numSubs;
int id;
cumValues(int i, int s=-1):numSubs(s), id(i){};
void addLogliks(vector<double>* ll){
if(logliks.size() == 0){
logliks.resize(ll->size(),0.0);
if(logliks.size() == 0){
cerr<<"logliks still empty!"<<endl;
}
}
for(int u = 0; u < ll->size(); u++){
logliks[u] += (*ll)[u];
}
}
void addNumSubs(int subs){
numSubs += subs;
}
};

/**
* @brief multiple sequence alignment of genomes for comparative gene prediction
*
Expand Down Expand Up @@ -107,6 +83,15 @@ public:
void printOrthoExons(list<OrthoExon> &orthoExonsList);
void computeOmegas(list<OrthoExon> &orthoExonsList, vector<AnnoSequence*> const &seqRanges, PhyloTree *ctree);
void computeOmegasEff(list<OrthoExon> &orthoExonsList, vector<AnnoSequence*> const &seqRanges, PhyloTree *ctree, ofstream *codonAli);
/**
* computeClamsaEff
* @param[in] orthoExonsList all orthoExons in gene range
* @param[in] seqRange the sequences that were aligned
* @param[in] ctree tree scaled for units of codon substitutions
* @see computeOmegasEff
*/
void computeClamsaEff(list<OrthoExon> &orthoExonsList, vector<AnnoSequence*> const &seqRanges, PhyloTree *ctree, ofstream *codonAli);
void computeClamsa(list<OrthoExon> &orthoExonsList, vector<AnnoSequence*> const &seqRanges, PhyloTree *ctree, ofstream *codonAli);
vector<string> pruneToBV(vector<string> *cs, bit_vector bv); // prune codon strings to bit_vector
vector<int> pruneToBV(vector<int> *rfc, bit_vector bv); // prune RFC to bit_vector
double omegaForCodonTuple(vector<double> *loglik);
Expand All @@ -124,6 +109,7 @@ public:
// static functions
static void setTree(PhyloTree *t){tree = t;}
static void setCodonEvo(CodonEvo *c){ codonevo = c; }
static void setCodonEvoDiscr(CodonEvoDiscr *c){ codonevodiscr = c; }
static int numSpecies(){ return tree->numSpecies(); }
static void openOutputFiles(string outdir);
static void closeOutputFiles();
Expand All @@ -139,7 +125,7 @@ public:
// map that stored all codon combinations on which fitch and pruning algorithm already have been run (for calculation of omega and number of substitutions)
static map<vector<string>, pair<vector<double>, int> > computedCumValues;

void printSingleOrthoExon(OrthoExon &oe, bool files = true);
void printSingleOrthoExon(OrthoExon const &oe, bool files = true);
void collect_features(int species, list<OrthoExon> *hects, SpeciesGraph *speciesgraph);

/**
Expand All @@ -165,20 +151,65 @@ public:
*/
StringAlignment getMsa(OrthoExon const &oe, vector<AnnoSequence*> const &seqRanges, size_t flanking = 0);
private:
vector<string> getCodonAlignment(OrthoExon const &oe, vector<AnnoSequence*> const &seqRanges, const vector<vector<fragment>::const_iterator > &froms, map<unsigned, vector<int> > *alignedCodons = NULL, bool generateString=true, vector<vector<int> > *posStoredCodons = NULL, ofstream *codonAli = NULL);
/**
* getCodonAlignment
* Retreive from a genomic (partial) MSA the columns of aligned
* codons. Codons are specified by the exon candidates and if the
* thee bases of a codon are mapped to the same the alignment
* columns as the based of a codon in another species, the codons
* are aligned.
*
* @return a vector of alignment row strings
* @param[in] oe the OrthoExon whose codon alignment is sought
* @param[in] seqRanges contains the alignment of the larger region tuple
* @param[in] froms iterators to the alignment fragments, the
* aligned regions must be to their rights
* @param[out,in] alignedCodons: key identifies triples of
* alignment columsn, values are genome positions of the first
* codon bases. This can be used to manage when aligned codons are
* shared between different OrthoExons.
* @see The .cc file contains a definition by example.
*/
vector<string> getCodonAlignment(OrthoExon const &oe, vector<AnnoSequence*> const &seqRanges,
const vector<vector<fragment>::const_iterator > &froms,
map<unsigned, vector<int> > *alignedCodons = NULL,
bool generateString = true,
ofstream *codonAli = NULL);
/**
* getCodonAlignment2
* Retreive from a genomic (partial) MSA the columns of aligned
* codons. Codons are specified by the exon candidates and if the
* thee bases of a codon are mapped to the same the alignment
* columns as the based of a codon in another species, the codons
* are aligned.
*
* @return a vector of alignment row strings
* @param[in] oe the OrthoExon whose codon alignment is sought
* @param[in] seqRanges contains the alignment of the larger region tuple
* @param[in] froms iterators to the alignment fragments, the
* aligned regions must be to their rights
* @param[out,in] alignedCodons: key identifies triples of
* alignment columsn, values are genome positions of the first
* codon bases. This can be used to manage when aligned codons are
* shared between different OrthoExons.
* @see The .cc file contains a definition by example.
*/
vector<string> getCodonAlignment2(OrthoExon const &oe, vector<AnnoSequence*> const &seqRanges,
const vector<vector<fragment>::const_iterator > &froms);
void cutIncompleteCodons(OrthoExon &oe);
cumValues* findCumValues(bit_vector bv, vector<int> rfc);
static PhyloTree *tree;
LocusTree *ltree;
static CodonEvo *codonevo;
static CodonEvoDiscr *codonevodiscr;
vector<int> starts, ends; // gene ranges for each species
vector<int> offsets; // this many bases are upstream from the region
RandSeqAccess *rsa;
Alignment* alignment; // alignment of regions which possibly belong to a gene
vector< list<ExonCandidate*>* > exoncands; // exon candidates found in the different species in a gene segment
//list<OrthoExon> orthoExonsList; // Steffi: due to multiple copying, I remove this as attribute of any class. Instead it can be passed from compgenepred.cc by reference, whenever necessary.
unordered_map<bit_vector, vector<pair<vector<int>, cumValues> >, boost::hash<bit_vector> > cumOmega; // stores cumulative omega values for every reading frame combination and every bitvector that exist
map<bit_vector, map<vector<int>, vector<double> > > codonOmega;
// store cumulative omega values for every reading frame combination and every bitvector that exist
unordered_map<bit_vector, vector<pair<vector<int>, cumValues> >, boost::hash<bit_vector> > cumOmega;
map<bit_vector, map<vector<int>, vector<double> > > codonOmega; // deprecated

#ifdef TESTING
/*
Expand Down
Loading

0 comments on commit 8c92452

Please sign in to comment.