In [2]:
# Perform the initialization and imports
import sys
import pickle
import re
import os
import csv
import argparse
import math
import pprint

from string import ascii_lowercase
from collections import Counter, defaultdict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import issparse

from Bio import SeqIO, AlignIO
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.Emboss.Applications import NeedleallCommandline

# Demand Python 3.
# The message is formatted *inside* the print() call: under the print function,
# ``print(...) % (...)`` would apply ``%`` to print's return value (None).
# This form behaves the same whether ``print`` is a statement or a function.
if sys.version_info[0] < 3:
    print("Python 3 is required, but you are using Python %i.%i.%i" % (
        sys.version_info[0], sys.version_info[1], sys.version_info[2]))
    sys.exit(1)

In [3]:
# Retrieve the specific functions from ind and proteins.py
# Make the InDelScanner checkout importable. The path is machine-specific
# (see the /PATH/TO/InDelScanner placeholder) and must be edited per user.
indels_path="/home/maya/InDelScanner"  # /PATH/TO/InDelScanner
if indels_path not in sys.path:
    sys.path.append(indels_path)
#from indels.ind import trim_read, findEnds, endMatch, findGap, gapAlign

# Import a helper defined in the sibling notebook `Library_diversity`
# (loaded through the `ipynb` package's definitions-only importer).
from ipynb.fs.defs.Library_diversity import convert_variant_to_dict

# All relative file names below are resolved against this results directory.
# NOTE(review): hard-coded absolute path; changes the working directory for
# every later cell as well.
os.chdir("/mnt/c/Users/Maya/Dropbox/mek_results")

# SECURITY: pickle.load executes arbitrary code from the file; only load
# pickles that were produced locally / come from a trusted source.
with open('Remkes_protein.p', 'rb') as f:
    all_ref = pickle.load(f)
with open('Remkes_protein_low.p', 'rb') as f:
    low = pickle.load(f)

# Graft the 'low-v2' entry from the second pickle into the main structure.
# presumably both pickles are nested dicts: gene -> fraction -> {variant: count}
# — TODO confirm against the code that wrote them.
all_ref['mek']['low-v2'] = low['mek']['low-v2']

# Per-fraction variant counts for 'mek'. 'low-t' is the sum of the two
# low-fraction entries; note that Counter addition keeps only entries whose
# summed count is positive.
mek = {}
for fraction in ['high', 'med']:
    mek[fraction] = Counter(all_ref['mek'][fraction])
mek['low-t'] = Counter(all_ref['mek']['low']) + Counter(all_ref['mek']['low-v2'])

In [4]:
# Variant x fraction count table. Variants missing from a fraction get a count
# of 0; rows are ordered by descending 'high', then 'med', then 'low-t'.
df_all = pd.DataFrame.from_dict(mek).fillna(0).sort_values(by=['high', 'med', 'low-t'], ascending=False).astype(int)

# Well-covered variants: at least 50 'high' reads and high+med more than twice
# the 'low-t' count.
df_50p = df_all.loc[(df_all['high'] >= 50) & ((df_all['high']+df_all['med']) > 2*df_all['low-t'])]

# Lower-coverage variants with a stricter enrichment requirement.
# NOTE(review): the name says "20to50" but the filter accepts 10 <= high <= 49;
# confirm which lower bound is intended.
df_20to50 = df_all.loc[(df_all['high'].isin(range(10,50))) &
                       (df_all['high'] > df_all['med']) &
                       ((df_all['high']+df_all['med']) > 5*df_all['low-t']) ]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
# (disjoint) selections in the same order.
df_pos = pd.concat([df_50p, df_20to50])

# Map each selected variant's short name to the dict returned by
# convert_variant_to_dict (indexed below by the position labels in valid_pos).
active_ls = df_pos.index.tolist()
active = {short : convert_variant_to_dict(short) for short in active_ls}

valid_pos = ['6', '7a', '8a', '9', '11', '13']

# Keep only variants whose dict has exactly as many entries as valid_pos, and
# lay their values out in valid_pos order (one list per variant).
data = {}
for short, m_to_pos in active.items():
    if len(m_to_pos) == len(valid_pos):
        data[short] = [m_to_pos[i] for i in valid_pos]

# One row per variant, one column per position; the variant name is moved from
# the index into a regular column called 'index'.
factors = pd.DataFrame.from_dict(data, orient='index', columns=valid_pos).reset_index(drop=False)

In [5]:
# Display the factor table: the variant name ('index') plus one column per
# position listed in valid_pos.
factors

Unnamed: 0,index,6,7a,8a,9,11,13
0,6L/7aI/8aA/9L/11F/13M,L,I,A,L,F,M
1,6F/7aP/9W/11L/13M,F,P,Δ,W,L,M
2,6L/7aF/9L/11I/13I,L,F,Δ,L,I,I
3,6A/7aI/8aA/9L/11L/13I,A,I,A,L,L,I
4,6W/7aI/9F/11L/13V,W,I,Δ,F,L,V
5,6A/7aP/8aA/9L/11V/13W,A,P,A,L,V,W
6,6L/7aI/8aA/9M/11W/13W,L,I,A,M,W,W
7,6V/7aP/8aA/9F/11F/13M,V,P,A,F,F,M
8,6A/7aK/9L/11L/13W,A,K,Δ,L,L,W
9,6A/7aI/8aA/9L/11V/13M,A,I,A,L,V,M


In [9]:
# Restrict the distance computation to the first 100 rows of `factors`.
# NOTE(review): `factors` still carries the variant-name column 'index'
# (created by reset_index(drop=False)); being non-numeric it will be treated as
# one more categorical feature by cross_variant_blosum_matrix. Confirm this is
# intended, otherwise select only the position columns here.
data_x = factors.iloc[:100]

In [None]:
def cross_variant_blosum_matrix(data_x, data_y=None, weight=None, cat_features=None):
    """Pairwise weighted mismatch distance between the rows of X and the rows of Y.

    Despite the name, no substitution matrix is consulted: a categorical
    feature contributes 0 when the two values are equal and 1 otherwise, and a
    numeric feature contributes its range-normalised absolute difference. The
    per-feature contributions are multiplied by ``weight`` and divided by the
    sum of all weights.

    Parameters
    ----------
    data_x : pandas.DataFrame or numpy.ndarray
        2-D table with one row per item and one column per feature.
    data_y : pandas.DataFrame or numpy.ndarray, optional
        Second table with the same columns. When omitted, ``data_x`` is
        compared against itself.
    weight : array-like of float, optional
        One weight per column. Defaults to 1 for every column.
    cat_features : array-like of bool, optional
        Mask marking the categorical columns. When omitted, every column whose
        dtype (DataFrame input) or first-row element type (ndarray input) is
        not a NumPy number is treated as categorical.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(data_x), len(data_y))``.

    Raises
    ------
    TypeError
        If either input is a SciPy sparse matrix, or the two inputs do not
        have the same columns / number of columns.
    """
    X = data_x
    # Only a self-comparison is guaranteed to be symmetric. Two *different*
    # tables that merely have the same number of rows must be computed in full.
    symmetric = data_y is None
    Y = data_x if symmetric else data_y

    # Reject sparse input first: sparse matrices have no ``.columns`` and would
    # otherwise fail with an unrelated AttributeError below.
    if issparse(X) or issparse(Y):
        raise TypeError("Sparse matrices are not supported!")

    if not isinstance(X, np.ndarray) and not isinstance(Y, np.ndarray):
        if not np.array_equal(X.columns, Y.columns):
            raise TypeError("X and Y must have same columns!")
    elif X.shape[1] != Y.shape[1]:
        raise TypeError("X and Y must have same y-dim!")

    x_n_rows, x_n_cols = X.shape
    y_n_rows = Y.shape[0]

    # Boolean mask of the categorical columns.
    if cat_features is None:
        if not isinstance(X, np.ndarray):
            cat_features = np.array(
                [not np.issubdtype(dtype, np.number) for dtype in X.dtypes],
                dtype=bool)
        else:
            cat_features = np.array(
                [not np.issubdtype(type(X[0, col]), np.number)
                 for col in range(x_n_cols)],
                dtype=bool)
    else:
        cat_features = np.array(cat_features)
    num_features = np.logical_not(cat_features)

    X = np.asarray(X)
    Y = np.asarray(Y)

    # Stack X on top of Y so column statistics are computed over both tables.
    Z = np.concatenate((X, Y))

    # Numeric part (may have zero columns). Converting to float up front keeps
    # the arithmetic below independent of the container dtype (object / str).
    Z_num = Z[:, num_features].astype(np.float64)

    num_cols = Z_num.shape[1]
    num_ranges = np.zeros(num_cols)
    num_max = np.zeros(num_cols)

    for col in range(num_cols):
        col_array = Z_num[:, col].astype(np.float32)
        col_max = np.nanmax(col_array)
        col_min = np.nanmin(col_array)

        # An all-NaN column yields NaN extrema; treat it as constant zero.
        if np.isnan(col_max):
            col_max = 0.0
        if np.isnan(col_min):
            col_min = 0.0
        num_max[col] = col_max
        # Range of the column after it has been divided by its maximum.
        num_ranges[col] = (1 - col_min / col_max) if (col_max != 0) else 0.0

    # Scale every numeric column by its maximum (columns with max 0 stay 0).
    Z_num = np.divide(Z_num, num_max, out=np.zeros_like(Z_num), where=num_max != 0)

    Z_cat = Z[:, cat_features]

    # Per-feature weights; accept any array-like so boolean indexing works.
    if weight is None:
        weight = np.ones(Z.shape[1])
    else:
        weight = np.asarray(weight, dtype=np.float64)

    weight_cat = weight[cat_features]
    weight_num = weight[num_features]
    weight_sum = weight.sum()

    out = np.zeros((x_n_rows, y_n_rows), dtype=np.float32)

    # The first x_n_rows rows of Z belong to X, the remainder to Y.
    X_cat, Y_cat = Z_cat[:x_n_rows], Z_cat[x_n_rows:]
    X_num, Y_num = Z_num[:x_n_rows], Z_num[x_n_rows:]

    for i in range(x_n_rows):
        # For a self-comparison only the upper triangle is computed and then
        # mirrored; otherwise the whole row is computed.
        j_start = i if symmetric else 0
        res = blosum_get(X_cat[i, :],
                         X_num[i, :],
                         Y_cat[j_start:y_n_rows, :],
                         Y_num[j_start:y_n_rows, :],
                         weight_cat,
                         weight_num,
                         weight_sum,
                         cat_features,
                         num_ranges,
                         num_max)
        out[i, j_start:] = res
        if symmetric:
            out[j_start:, i] = res

    return out


def blosum_get(xi_cat,xi_num,xj_cat,xj_num,feature_weight_cat,
              feature_weight_num,feature_weight_sum,categorical_features,
              ranges_of_numeric,max_of_numeric ):
    """Distance from one row to each row of a block.

    Parameters
    ----------
    xi_cat, xi_num : 1-D arrays
        Categorical and (already scaled) numeric values of the reference row.
    xj_cat, xj_num : 2-D arrays
        Categorical and numeric values of the rows to compare against, with
        the same columns as ``xi_cat`` / ``xi_num``.
    feature_weight_cat, feature_weight_num : 1-D arrays
        Weights of the categorical and numeric columns.
    feature_weight_sum : float
        Sum of all weights; the weighted total is divided by it.
    categorical_features, max_of_numeric
        Accepted for signature compatibility; not used in the computation.
    ranges_of_numeric : 1-D array
        Per-column range used to normalise numeric differences; columns with a
        range of 0 contribute 0.

    Returns
    -------
    numpy.ndarray
        1-D array with one distance per row of ``xj_cat`` / ``xj_num``.
    """
    # Categorical columns: 1 for a mismatch, 0 for a match. A boolean
    # comparison works for object arrays and native string arrays alike
    # (zeros_like / ones_like on string arrays would produce strings).
    sij_cat = (xi_cat != xj_cat).astype(np.float64)
    sum_cat = np.multiply(feature_weight_cat, sij_cat).sum(axis=1)

    # Numeric columns: absolute difference divided by the column range.
    xi_num = np.asarray(xi_num, dtype=np.float64)
    xj_num = np.asarray(xj_num, dtype=np.float64)
    abs_delta = np.absolute(xi_num - xj_num)
    sij_num = np.divide(abs_delta, ranges_of_numeric,
                        out=np.zeros_like(abs_delta), where=ranges_of_numeric != 0)
    sum_num = np.multiply(feature_weight_num, sij_num).sum(axis=1)

    # Weighted average over all features.
    sums = np.add(sum_cat, sum_num)
    sum_sij = np.divide(sums, feature_weight_sum)

    return sum_sij


In [28]:
# create the output array for everything
# NOTE(review): hidden state. x_n_rows, y_n_rows, Z_cat, Z_num, x_index and
# y_index are never assigned at notebook level in the visible cells; they only
# exist as locals inside cross_variant_blosum_matrix. On a fresh kernel
# (Restart & Run All) this cell raises NameError. It looks like a scratch copy
# of part of the function body used for inspection; confirm and remove, or
# define the inputs explicitly.
out = np.zeros((x_n_rows, y_n_rows), dtype=np.float32)

# Split the stacked categorical / numeric arrays back into their X and Y rows.
X_cat = Z_cat[x_index,]
X_num = Z_num[x_index,]
Y_cat = Z_cat[y_index,]
Y_num = Z_num[y_index,]

In [29]:
# Inspect the categorical block of X. The captured output shows the variant
# name string as the first column of every row.
X_cat

array([['6L/7aI/8aA/9L/11F/13M', 'L', 'I', 'A', 'L', 'F', 'M'],
       ['6F/7aP/9W/11L/13M', 'F', 'P', 'Δ', 'W', 'L', 'M'],
       ['6L/7aF/9L/11I/13I', 'L', 'F', 'Δ', 'L', 'I', 'I'],
       ['6A/7aI/8aA/9L/11L/13I', 'A', 'I', 'A', 'L', 'L', 'I'],
       ['6W/7aI/9F/11L/13V', 'W', 'I', 'Δ', 'F', 'L', 'V'],
       ['6A/7aP/8aA/9L/11V/13W', 'A', 'P', 'A', 'L', 'V', 'W'],
       ['6L/7aI/8aA/9M/11W/13W', 'L', 'I', 'A', 'M', 'W', 'W'],
       ['6V/7aP/8aA/9F/11F/13M', 'V', 'P', 'A', 'F', 'F', 'M'],
       ['6A/7aK/9L/11L/13W', 'A', 'K', 'Δ', 'L', 'L', 'W'],
       ['6A/7aI/8aA/9L/11V/13M', 'A', 'I', 'A', 'L', 'V', 'M'],
       ['6D/7aI/8aA/9L/11I/13M', 'D', 'I', 'A', 'L', 'I', 'M'],
       ['6M/7aP/9L/11L/13V', 'M', 'P', 'Δ', 'L', 'L', 'V'],
       ['6L/7aI/9K/11L/13M', 'L', 'I', 'Δ', 'K', 'L', 'M'],
       ['6L/7aL/9K/11L/13A', 'L', 'L', 'Δ', 'K', 'L', 'A'],
       ['6V/7aI/9D/11L/13W', 'V', 'I', 'Δ', 'D', 'L', 'W'],
       ['6L/7aI/8aA/9Y/11M/13V', 'L', 'I', 'A', 'Y', 'M', 'V'],
       [