# Context dependence

## Library imports

In [17]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from Bio import SeqIO
from Bio.Seq import Seq
from Bio import AlignIO
from scipy.stats import entropy

## Data

In [5]:
# Relative locations of the input data, resolved against the working directory.
# Folder expected to hold the DCA model files.
DCA_FOLDER = "../Data/DCA models/"
# Folder expected to hold the protein sequence files.
SEQUENCES_FOLDER = "../Data/Protein sequences/"

## MSA class

In [24]:
class MSA:
    """Multiple sequence alignment stored as an integer matrix.

    Each row of ``self.MSA`` is one record read from a FASTA file and each
    column is one alignment position. Residues are encoded by their index in
    ``alphabet``; any symbol that is not in ``alphabet`` is encoded like the
    gap ``'-'`` (the last index).
    """

    alphabet = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y','-']
    # Letter -> index table built once, so encoding a residue is a dict lookup
    # instead of two linear scans of ``alphabet``.
    _letter_index = {letter: index for index, letter in enumerate(alphabet)}

    def __init__(self, fasta_file, name=""):
        """Read ``fasta_file`` and encode it as an integer matrix.

        Args:
            fasta_file: Path or handle accepted by ``SeqIO.parse``.
            name: Optional label for this alignment.
        """
        self.name = name
        self.MSA = self.fasta2matrix(fasta_file)
        # Filled in by compute_CI_entropy().
        self.CI_entropy = None

    def letter2number(self, letter):
        """Return the index of ``letter`` in ``alphabet`` (gap index if unknown)."""
        return self._letter_index.get(letter, len(self.alphabet) - 1)

    def fasta2matrix(self, fasta_file):
        """Parse a FASTA file into a 2-D integer array (sequences x positions).

        Args:
            fasta_file: Path or handle accepted by ``SeqIO.parse``.

        Returns:
            Integer ``numpy`` array with one row per record. A file without
            records yields an empty ``(0, 0)`` array.

        Raises:
            ValueError: If the records do not all have the same length, i.e.
                the file is not an alignment.
        """
        rows = [
            [self.letter2number(letter) for letter in record.seq]
            for record in SeqIO.parse(fasta_file, "fasta")
        ]
        if not rows:
            return np.empty((0, 0), dtype=int)
        lengths = {len(row) for row in rows}
        if len(lengths) > 1:
            # Fail early with a clear message instead of letting numpy choke
            # on a ragged list.
            raise ValueError(
                "sequences in the alignment have different lengths: "
                + str(sorted(lengths))
            )
        return np.array(rows, dtype=int)

    def compute_CI_entropy(self):
        """Compute the per-column Shannon entropy (in bits) of the alignment.

        Gaps are excluded: the entropy of each column is taken over the
        non-gap residue counts, which ``scipy.stats.entropy`` renormalises to
        sum to one. The result is also stored in ``self.CI_entropy``.

        Returns:
            List with one float per column. A column that contains only gaps
            has no defined distribution and is reported as ``nan``.
        """
        n_states = len(self.alphabet)
        entropies = []
        for column in self.MSA.T:
            counts = np.bincount(column, minlength=n_states)
            # The gap is the last alphabet entry; keep only the residues.
            residue_counts = counts[:n_states - 1]
            if residue_counts.sum() == 0:
                entropies.append(float("nan"))
            else:
                entropies.append(float(entropy(residue_counts, base=2)))
        self.CI_entropy = entropies
        return entropies

## DCA class

In [None]:
class DCA:
    """DCA (Potts-like) model evaluated around a reference sequence.

    Sequences are encoded by residue index in ``alphabet``; the last entry
    (``'-'``) is the gap and carries no parameters. ``J`` and ``h`` are
    indexed with ``20 * position + residue`` (0-based), so for a sequence of
    length ``n`` they are expected to cover ``20 * n`` entries per axis.
    """

    alphabet = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y','-']
    # Letter -> index table built once, so encoding a residue is a dict lookup.
    _letter_index = {letter: index for index, letter in enumerate(alphabet)}

    def __init__(self, J, h, reference, protein_name=""):
        """Store the model parameters and encode the reference sequence.

        Args:
            J: Coupling matrix, indexable as ``J[20*i + a, 20*j + b]``.
            h: Field vector, indexable as ``h[20*i + a]`` once flattened.
            reference: Reference sequence, either a string of residue letters
                or an object exposing the letters through a ``.seq`` attribute.
            protein_name: Optional label for the model.
        """
        self.name = protein_name
        self.J = J  # TODO: a function reading J from its file will probably be needed
        self.h = h  # TODO: a function reading h from its file will probably be needed
        self.reference = self.sequence2array(reference)
        # Evaluate on the encoded reference, not on the raw input object.
        self.H_ref = self.H(self.reference)
        # Filled in by compute_CD_entropy().
        self.CD_entropy = None

    def letter2number(self, letter):
        """Return the index of ``letter`` in ``alphabet`` (gap index if unknown)."""
        return self._letter_index.get(letter, len(self.alphabet) - 1)

    def sequence2array(self, sequence):
        """Encode a sequence of residue letters as an integer array.

        Args:
            sequence: A string (or any iterable of letters), or an object
                whose ``.seq`` attribute is such an iterable.

        Returns:
            1-D integer ``numpy`` array of residue indices.
        """
        letters = getattr(sequence, "seq", sequence)
        return np.array([self.letter2number(letter) for letter in letters], dtype=int)

    def _as_state_array(self, sequence):
        """Return ``sequence`` as an integer index array, encoding letters if needed."""
        if isinstance(sequence, np.ndarray):
            return sequence.astype(int, copy=False)
        if isinstance(sequence, (list, tuple)) and not any(
            isinstance(item, str) for item in sequence
        ):
            return np.asarray(sequence, dtype=int)
        return self.sequence2array(sequence)

    def H(self, sequence):
        """Return ``exp(-0.5 * v^T J v - v^T h)`` for ``sequence``.

        ``v`` is the one-hot encoding of the non-gap residues, with position
        ``i`` and residue ``a`` mapped to entry ``20 * i + a``.

        Args:
            sequence: Letters (string or object with ``.seq``) or an
                already-encoded array/list of residue indices.

        Returns:
            The value as a Python float.
        """
        states = self._as_state_array(sequence)
        n_residues = len(self.alphabet) - 1
        # Gap positions contribute nothing; keep only real residues.
        positions = np.flatnonzero(states < n_residues)
        # 0-based flat index of each active (position, residue) pair.
        active = n_residues * positions + states[positions]
        J = np.asarray(self.J)
        h = np.asarray(self.h).reshape(-1)
        # Summing the selected sub-matrix / entries equals v^T J v and v^T h.
        coupling = J[np.ix_(active, active)].sum()
        field = h[active].sum()
        # TODO(review): check with Giancarlo whether the sum runs over i < j or
        # i != j, and whether the 0.5 factor is right.
        e = -0.5 * coupling - field
        # TODO(review): check that the exponential is always what we want here.
        return float(np.exp(e))

    def delta_H_sequence(self, sequence):
        """Return ``H(sequence) - H(reference)``."""
        return self.H(sequence) - self.H_ref

    def mutant(self, loci, residues):
        """Return a copy of the reference with the given substitutions applied.

        Args:
            loci: Position, or sequence of positions, to mutate (0-based).
            residues: Replacement residue(s), as alphabet indices or letters;
                one per locus.

        Returns:
            New integer array; the stored reference is left untouched.

        Raises:
            ValueError: If ``loci`` and ``residues`` differ in length.
        """
        if np.isscalar(loci):
            loci, residues = [loci], [residues]
        if len(loci) != len(residues):
            raise ValueError("loci and residues must have the same length")
        sequence = self.reference.copy()
        for locus, residue in zip(loci, residues):
            if isinstance(residue, str):
                residue = self.letter2number(residue)
            sequence[locus] = residue
        return sequence

    def single_mutant(self, locus, residue):
        """Return a copy of the reference with ``residue`` placed at ``locus``."""
        return self.mutant([locus], [residue])

    def delta_H_mutant(self, loci, residues):
        """Return ``H(mutant) - H(reference)`` for the given substitutions."""
        return self.H(self.mutant(loci, residues)) - self.H_ref

    # Greek-letter spellings of the original method names. The original used
    # U+2206 (increment sign), which Python rejects in identifiers; U+0394
    # (Greek capital delta) is accepted.
    ΔH_sequence = delta_H_sequence
    ΔH_mutant = delta_H_mutant

    def compute_CD_entropy(self):
        """Compute a per-position entropy (in bits) over single mutants.

        For each locus, ``H`` is evaluated for every single mutant of the
        reference at that locus. The gap and the reference residue itself are
        excluded. The values are normalised by ``scipy.stats.entropy`` and
        their Shannon entropy is taken. The result is also stored in
        ``self.CD_entropy``.

        Returns:
            List with one float per position of the reference.
        """
        # NOTE(review): the reference residue is left out of the distribution,
        # as in the original loop - confirm this is the intended definition.
        n_residues = len(self.alphabet) - 1
        entropies = []
        for locus, wild_type in enumerate(self.reference):
            weights = [
                self.H(self.single_mutant(locus, residue))
                for residue in range(n_residues)
                if residue != wild_type
            ]
            entropies.append(float(entropy(weights, base=2)))
        self.CD_entropy = entropies
        return entropies

## Questions à poser

- La bonne formule du DCA est-elle bien avec le i<j sur la somme (ou somme entière sur i!=j divisée par 2) ?
- Comment génère-t-on les single mutants, quelle loi de proba pour les mutations ? Equiprobables ? Atteignables par un SNP ?
