<a href="https://colab.research.google.com/github/iceQHdrop/bioinformatic_data_mining/blob/main/ML_for_donar_finding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ML for donar finding

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! cp -r drive/MyDrive/Colab_Notebooks/bioinformatic_data_mining/dataset ./

In [70]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import re
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import torch.nn as nn

## 数据读取

### 设定路径名

In [19]:
train_path = 'dataset/TrainingSet'
test_path = 'dataset/TestingSet'    
train_files = os.listdir(train_path)
test_files = os.listdir(test_path)


### 读取训练数据

#### 定义读取函数

In [20]:
def LoadData(files, file_path):
    site_seqs = []
    normal_seqs = []

    print(f'Loading data from {file_path}...')

    for file in tqdm(files):
        with open(file_path + '/' + file, 'r') as f:
            text = f.readlines()
            site_positions = re.findall('(\d+)(?=,)', text[1])    # 提取位置
            seq = ''.join(text[2:]).replace('\n', '').lower()
        
            for position in site_positions:
                site_seqs.append(seq[int(position) - 4:int(position) + 5])
        
            for num in range(4):    # 提取非位点序列
                normal_position = np.random.randint(len(seq) - 9)    # 采样至倒数第九位
                normal_seq = seq[normal_position - 4:normal_position + 5]

                while normal_position in site_positions or \
                    'n' in normal_seq or \
                    len(normal_seq) != 9:    # 排除donar位点与'n'

                    normal_position = np.random.randint(len(seq) - 9)
                    normal_seq = seq[normal_position - 4:normal_position + 5]

                normal_seqs.append(normal_seq)
    
    site_df = pd.DataFrame(list(zip(site_seqs, np.ones(len(site_seqs)))),    # 位点
                            columns = ['Seq', 'Donar'])

    normal_df = pd.DataFrame(list(zip(normal_seqs, np.zeros(len(normal_seqs)))),    # 非位点
                              columns = ['Seq', 'Donar'])

    df = pd.concat([site_df, normal_df]).reset_index(drop = True)

    return df
  

#### 读取数据   

In [48]:
train_df = LoadData(train_files, train_path)
test_df = LoadData(test_files, test_path)

Loading data from dataset/TrainingSet...


100%|██████████| 462/462 [00:00<00:00, 5306.47it/s]


Loading data from dataset/TestingSet...


100%|██████████| 570/570 [00:00<00:00, 11556.73it/s]


## 特征提取

### One-hot 编码

In [85]:
X_train = train_df['Seq'].str.split('', expand = True).iloc[:, 1:10]
X_train = pd.get_dummies(X_train)
y_train = train_df['Donar']
X_train.head()

Unnamed: 0,1_a,1_c,1_g,1_t,2_a,2_c,2_g,2_t,3_a,3_c,...,7_g,7_t,8_a,8_c,8_g,8_t,9_a,9_c,9_g,9_t
0,0,0,1,0,0,0,1,0,0,0,...,1,0,1,0,0,0,0,0,1,0
1,0,0,1,0,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
2,0,0,0,1,0,1,0,0,0,1,...,1,0,1,0,0,0,0,0,1,0
3,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [86]:
X_test = test_df['Seq'].str.split('', expand = True).iloc[:, 1:10]
X_test = pd.get_dummies(X_test)
y_test = test_df['Donar']
X_test.head()

Unnamed: 0,1_a,1_c,1_g,1_t,2_a,2_c,2_g,2_t,3_a,3_c,...,7_g,7_t,8_a,8_c,8_g,8_t,9_a,9_c,9_g,9_t
0,0,1,0,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
1,0,0,1,0,0,0,1,0,1,0,...,1,0,1,0,0,0,0,0,1,0
2,0,1,0,0,1,0,0,0,1,0,...,1,0,1,0,0,0,0,0,1,0
3,1,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
4,0,0,0,1,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0


## SVM

In [87]:
clf = SVC(probability = True)

In [94]:
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)
fpr, tpr, _ = roc_curve(y_test, y_pred[:, 1])
auc_roc = auc(fpr, tpr)
auc_roc

0.99817167075939