<a href="https://colab.research.google.com/github/MarySelifanova/Aminoacid-translocations/blob/main/130321_00_11_closer_look.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Add dictionaries to 00 and 11

In [1]:
# Mount Google Drive at /content/drive; the input and output files used in
# this notebook are addressed by paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### **Install and import modules**

In [None]:
!pip install bio
!pip install XlsxWriter
from Bio import SeqIO
from itertools import combinations
import numpy as np
import re
import pandas as pd
import xlsxwriter

**Load dataframes from tables**

In [31]:
# Open the workbook once, then read one worksheet per k-mer table.
xls = pd.ExcelFile('/content/drive/MyDrive/Translocations/H1N1_H3N2.1557.AA.aligned.1str_tuned.xlsx')
df1, df2, df3, df4 = (
    pd.read_excel(xls, sheet_name=sheet)
    for sheet in ('len=1', 'len=2', 'len=3', 'len=4')
)

In [32]:
# Display the table read from the 'len=4' worksheet.
df4

Unnamed: 0,kmer,pos1,pos2,corr,leap,00,01,10,11
0,NGKL,65,407,-0.992322,342,12,1552,1550,0
1,KLNR,409,525,-0.874857,116,207,1354,1553,0
2,VASS,135,551,-0.978399,416,34,1546,1534,0


In [33]:
def add_dictionaries(file, k, df):
  """Add per-row dictionaries of k-mer pairs observed at two alignment positions.

  For every row of `df`, each sequence of the FASTA file is sliced at the
  row's two positions (`pos1` and `pos2`, treated as 1-based) to obtain two
  substrings of length `k`. The pair is counted under the key
  '<substring at pos1>_<substring at pos2>':

    * in the row's "00" dictionary when neither substring equals the row's
      `kmer`;
    * in the row's "11" dictionary when both substrings equal the row's `kmer`.

  Sequences in which exactly one of the two positions matches are not counted.
  Rows whose '00' and '11' columns are both zero get two empty dictionaries
  without looking at the sequences.

  Args:
    file: path to the FASTA file; it is read at most once per call.
    k: length of the substrings taken at each position.
    df: DataFrame with columns 'kmer', 'pos1', 'pos2', '00' and '11'. It is
      modified in place: the columns '00_dict' and '11_dict' are assigned.

  Returns:
    None. The result is stored in `df`.
  """
  sequences = None  # filled on the first row that needs it, then reused

  d00_list = []
  d11_list = []

  for _, row in df.iterrows():

    d00 = {}
    d11 = {}

    if int(row['00']) != 0 or int(row['11']) != 0:

      if sequences is None:
        # Plain "r": the old "rU" flag raises ValueError since Python 3.11,
        # and universal newlines are already the default in text mode.
        with open(file, "r") as handle:
          sequences = [str(record.seq) for record in SeqIO.parse(handle, "fasta")]

      # Convert the 1-based positions to 0-based slice starts.
      start1 = int(row['pos1']) - 1
      start2 = int(row['pos2']) - 1
      kmer = row['kmer']

      for seq in sequences:
        pos1_kmer = seq[start1:start1 + k]
        pos2_kmer = seq[start2:start2 + k]

        match1 = pos1_kmer == kmer
        match2 = pos2_kmer == kmer
        if match1 != match2:
          # Exactly one position carries the k-mer: neither a 00 nor a 11 case.
          continue

        counts = d11 if match1 else d00
        pair = pos1_kmer + '_' + pos2_kmer
        counts[pair] = counts.get(pair, 0) + 1

    d00_list.append(d00)
    d11_list.append(d11)

  df['00_dict'] = d00_list
  df['11_dict'] = d11_list

In [40]:
# Annotate the k = 1 table. `df` is another name for `df1`, so the columns
# added by add_dictionaries appear on `df1` as well.
df = df1
k = 1
file = '/content/drive/MyDrive/Translocations/H1N1_H3N2.1557.AA.aligned.1str.fasta'

add_dictionaries(file=file, k=k, df=df)



In [41]:
# Display df1, which now carries the '00_dict' and '11_dict' columns
# assigned by add_dictionaries.
df1

Unnamed: 0,kmer,pos1,pos2,corr,leap,00,01,10,11,00_dict,11_dict
0,K,18,49,-0.978398,31,34,1551,1529,0,"{'-_R': 3, 'D_D': 8, 'R_D': 6, 'E_D': 10, 'N_D...",{}
1,K,18,67,-0.976941,49,27,1558,1520,9,"{'-_R': 4, 'R_E': 6, 'E_E': 10, 'N_E': 5, 'Q_E...",{'K_K': 9}
2,K,18,72,-0.972758,54,43,1542,1529,0,"{'-_R': 3, '-_G': 5, '-_E': 1, '-_N': 3, 'D_-'...",{}
3,K,18,103,-0.942600,85,92,1493,1529,0,"{'-_R': 17, '-_A': 15, '-_V': 14, '-_N': 9, '-...",{}
4,K,18,149,-0.977691,131,32,1553,1526,3,"{'-_Q': 1, 'D_T': 6, 'R_N': 6, 'E_N': 10, 'N_N...",{'K_K': 3}
...,...,...,...,...,...,...,...,...,...,...,...
131,Q,134,436,-0.996794,302,5,1560,1549,0,"{'H_E': 2, 'R_E': 1, 'X_E': 2}",{}
132,Q,134,463,-0.995514,329,7,1558,1549,0,"{'H_E': 2, 'R_E': 1, 'X_E': 2, 'L_X': 1, 'L_H'...",{}
133,Q,134,530,-0.991038,396,13,1552,1548,1,"{'H_K': 2, 'R_K': 1, 'X_K': 2, 'L_H': 7, 'L_K'...",{'Q_Q': 1}
134,Q,134,570,-0.994236,436,9,1556,1549,0,"{'H_S': 2, 'R_S': 1, 'X_S': 2, 'L_K': 2, 'L_L'...",{}


In [42]:
outfile = '/content/drive/MyDrive/Translocations/H1N1_H3N2.1557.AA.aligned.1str_dictionary.xlsx'

# Create a Pandas Excel writer using XlsxWriter as the engine. Used as a
# context manager, the writer saves and closes the workbook on exit, also when
# one of the to_excel calls raises. (ExcelWriter.save() was deprecated in
# pandas 1.5 and removed in pandas 2.0.)
with pd.ExcelWriter(outfile, engine='xlsxwriter') as writer:
    # Write each dataframe to a different worksheet
    df1.to_excel(writer, sheet_name='len=1')
    df2.to_excel(writer, sheet_name='len=2')
    df3.to_excel(writer, sheet_name='len=3')
    df4.to_excel(writer, sheet_name='len=4')

# Add PDB distance