<a href="https://colab.research.google.com/github/iceQHdrop/bioinformatic_data_mining/blob/main/WAM_for_function_finding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WAM_for_function_finding

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
! cp -r drive/MyDrive/Colab_Notebooks/bioinformatic_data_mining/dataset ./

## 导入所需库

In [4]:
import pandas as pd 
import numpy as np
import xarray as xr    # 处理三维数据
from tqdm import tqdm    # 进度条
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

## 设定路径，读取文件名

In [5]:
train_path = 'dataset/TrainingSet'
test_path = 'dataset/TestingSet'    
train_files = os.listdir(train_path)
test_file = os.listdir(test_path)

## WAM

### 处理训练数据

#### 碱基计数

In [42]:
site_base_count = pd.DataFrame(np.zeros((9, 4)),    # 记录位点周围碱基数
                               columns = ['a', 't', 'c', 'g'],
                               index = [i-3 for i in range(9)])

site_forward_count = xr.DataArray(np.zeros((9, 4, 4)),    # 记录位点周围当前一个碱基确定时碱基数
                                  dims = ['position', 'forward', 'self'],
                                  coords = [[i-3 for i in range(9)],
                        ['a', 't', 'c', 'g'],
                        ['a', 't', 'c', 'g']])

background_forward_count = pd.DataFrame(np.zeros((4, 4)),    # 记录背景当前一个碱基确定时碱基数
                                     columns = ['a', 't', 'c', 'g'],
                                     index = ['a', 't', 'c', 'g'])
site_seqs = []
whole_seq = ''

print('Loading data and counting base...')

for train_file in tqdm(train_files):
    with open(train_path + '/' + train_file, 'r') as f:
        text = f.readlines()
        site_positions = re.findall('(\d+)(?=,)', text[1])    # 提取位置
        seq = ''.join(text[2:]).replace('\n', '').lower()    # 仅保留小写序列

        for position in site_positions:
            site_seqs.append(seq[int(position) - 4:int(position) + 5])    # 提取位点序列

            for i in range(9):    # 统计位点周围9个位置的碱基
                if int(position) - 1 + (i - 3) > 0:    # 排除在基因前三个碱基的位点
                    base = seq[int(position) - 1 + (i - 3)]    
                    site_base_count.loc[i-3, base] += 1

            for i in range(8):    # 统计位点周围9个位置的碱基，当前一个碱基确定时的数目
                if int(position) - 1 + (i - 3) > 0:
                    base = seq[int(position) - 1 + (i - 2)]   
                    forward_base = seq[int(position) - 1 + (i - 3)]
                    site_forward_count.loc[i-2, forward_base, base] += 1
            
        # for i in range(len(seq)-1):    # 统计背景碱基当前一个碱基确定时的数目 
        #     base = seq[i+1]
        #     forward_base = seq[i]
        #     try:    # 越过'n'
        #         background_forward_count.loc[forward_base, base] += 1
        #     except:
        #         continue

        whole_seq += seq

Loading data and counting base...


100%|██████████| 462/462 [00:24<00:00, 18.80it/s]


#### 构建前景概率矩阵

构建不考虑前一个碱基的概率矩阵

In [43]:
matrix_pos = site_base_count/len(site_seqs)
matrix_pos

Unnamed: 0,a,t,c,g
-3,0.270055,0.183536,0.305334,0.241075
-2,0.328433,0.119278,0.363713,0.188576
-1,0.585468,0.143217,0.137757,0.133557
0,0.094078,0.083158,0.034019,0.788744
1,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0
3,0.49013,0.02436,0.027299,0.458211
4,0.713566,0.091558,0.077278,0.117598
5,0.065099,0.047039,0.049559,0.838303


构建考虑前一个碱基的概率矩阵

In [44]:
matrix_forward_pos = site_forward_count.copy()

for base in ['a', 't', 'c', 'g']:
    matrix_forward_pos.loc[:, base] = matrix_forward_pos.loc[:, base]/matrix_forward_pos.loc[:, base].sum(axis = 1)

matrix_forward_pos

#### 构建背景概率矩阵

构建不考虑前一个碱基的概率矩阵

In [14]:
whole_seq.replace('n', '')
p_neg = [whole_seq.count(base)/len(whole_seq) for base in ['a', 't', 'c', 'g']]  
matrix_neg = pd.DataFrame(np.tile(p_neg, (9, 1)),
                          columns = ['a', 't', 'c', 'g'],
                          index = [i-3 for i in range(9)])
matrix_neg

Unnamed: 0,a,t,c,g
-3,0.260085,0.271169,0.23217,0.236428
-2,0.260085,0.271169,0.23217,0.236428
-1,0.260085,0.271169,0.23217,0.236428
0,0.260085,0.271169,0.23217,0.236428
1,0.260085,0.271169,0.23217,0.236428
2,0.260085,0.271169,0.23217,0.236428
3,0.260085,0.271169,0.23217,0.236428
4,0.260085,0.271169,0.23217,0.236428
5,0.260085,0.271169,0.23217,0.236428


构建考虑前一个碱基的概率矩阵

In [11]:
matrix_forward_neg = background_forward_count.apply(lambda x: x/x.sum(), axis = 1)
matrix_forward_neg

Unnamed: 0,a,t,c,g
a,0.296923,0.234069,0.187147,0.281861
t,0.186917,0.308036,0.22369,0.281356
c,0.307009,0.32282,0.294622,0.075548
g,0.257592,0.219191,0.230213,0.293004


### 定义$S(X)$

In [53]:
def WAMDecisionFunction(seq):
    score = np.log(matrix_pos.loc[-3, seq[0]]/matrix_neg.loc[-3, seq[0]])

    for i in range(8):
        score += np.log(matrix_forward_pos.loc[i-2, seq[i], seq[i+1]]/matrix_forward_neg.loc[seq[i], seq[i+1]])

    return float(score)

### 寻找阈值 

计算每一条序列得分

In [54]:
print('Calculating scores for each sequence...')

score_list = [WAMDecisionFunction(seq) for seq in tqdm(site_seqs)]

Calculating scores for each sequence...


100%|██████████| 2381/2381 [00:16<00:00, 140.83it/s]


生成阈值序列

In [66]:
threshold = np.linspace(np.min(score_list), np.max(score_list), 100)

array([ True,  True,  True, ...,  True,  True,  True])

[1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0]