# CNN analysis

In [1]:
import pandas as pd
import numpy as np
import bq_helper
from bq_helper import BigQueryHelper
# Helper bound to the public cannabis genomics dataset on BigQuery.
# bq_helper introduction: https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package
genomes = BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="genomics_cannabis",
)

In [2]:
# Helper on the same project/dataset, used in this cell to list the
# tables available in it.
bq_assistant = BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="genomics_cannabis",
)
bq_assistant.list_tables()

['MNPR01_201703',
 'MNPR01_reference_201703',
 'MNPR01_transcriptome_201703',
 'cs10_gff',
 'cs3k_project_info',
 'cs3k_vcf_cs10_dv090',
 'sample_info_201703']

In [3]:
# Pull the variant id, the alternate allele(s), the call quality, the variant
# type and the remaining per-variant annotation columns for 100,000 rows.
# NOTE(review): LIMIT is used without ORDER BY, so which rows come back is
# presumably not guaranteed to be the same between runs -- confirm whether a
# deterministic sample is needed.
query = """SELECT
  variant_id, alternate_bases, quality, type, 
  AB, ABP, AC, AF, AN, AO, DP, DPB, DPRA, EPP, 
  EPPR, GTI, MQM, MQMR, NS, NUMALT, ODDS, PAO,
  PQA, PQR, PRO, QA, QR, RO, RPL, RPP, RPPR, 
  RPR, RUN, SAF, SAP, SAR, SRF, SRP, SRR
FROM
  `bigquery-public-data.genomics_cannabis.MNPR01_201703` v
limit 100000;"""

# `query_to_pandas_safe` only runs the query when its estimated scan size is
# within `max_gb_scanned`; otherwise it prints a message and returns None.
response = genomes.query_to_pandas_safe(query, max_gb_scanned=100)

# Fail loudly here instead of with an opaque AttributeError on `.head()` (or in
# a later cell) when the query was cancelled.
if response is None:
    raise RuntimeError(
        "BigQuery query was cancelled: estimated scan size exceeds max_gb_scanned"
    )

response.head(10)



Unnamed: 0,variant_id,alternate_bases,quality,type,AB,ABP,AC,AF,AN,AO,...,RPP,RPPR,RPR,RUN,SAF,SAP,SAR,SRF,SRP,SRR
0,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTQ0OTl8Z2J8TU5QUj...,[ATTTTTTTA],14.6305,[del],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[0],[1],[1],[5.18177],[0],0,0.0,0
1,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTU3Mjl8Z2J8TU5QUj...,[TGAAAAAAAT],22.9505,[del],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[0],[1],[1],[5.18177],[0],0,0.0,0
2,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTcwNjJ8Z2J8TU5QUj...,[ACTA],19.647,[snp],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[1],[1],[1],[5.18177],[0],0,0.0,0
3,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTU1MDZ8Z2J8TU5QUj...,[ACACTCT],4.75779,[snp],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[0],[1],[0],[5.18177],[1],0,0.0,0
4,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTUyNjd8Z2J8TU5QUj...,[ATTTTTAA],20.4892,[snp],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[0],[1],[1],[5.18177],[0],0,0.0,0
5,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTY4MDl8Z2J8TU5QUj...,[GCTC],16.4533,[snp],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[0],[1],[0],[5.18177],[1],0,0.0,0
6,CKXG8eKP9qOf8wESIGdpfDEwOTg0ODYyNjd8Z2J8TU5QUj...,[ACTGAATGAATA],9.1078,[complex],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[0],[1],[0],[5.18177],[1],0,0.0,0
7,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTU2NDR8Z2J8TU5QUj...,[GA],8.21977,[snp],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[0],[1],[0],[5.18177],[1],0,0.0,0
8,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTY3NzF8Z2J8TU5QUj...,[TTTC],14.1353,[mnp],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[1],[1],[1],[5.18177],[0],0,0.0,0
9,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTM5MDB8Z2J8TU5QUj...,[GT],21.2202,[snp],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[0],[1],[1],[5.18177],[0],0,0.0,0


In [4]:
# Count how often each nucleotide letter occurs in the first alternate allele
# of every row. `.str[0]` takes element 0 of each `alternate_bases` entry and
# `.str.count` counts matches of the given letter in that sequence.
first_alt_allele = response["alternate_bases"].str[0]

# Target column name -> nucleotide letter counted into it.
base_count_columns = {
    "DNA_A": 'A',
    "DNA_C": 'C',
    "DNA_G": 'G',
    "DNA_T": 'T',
}

for column_name, base in base_count_columns.items():
    response[column_name] = first_alt_allele.str.count(base)

In [5]:
# Columns whose cells arrive wrapped in a one-element list (see the `[0.0]`,
# `[snp]`, ... values in the preview above). Despite the variable name, these
# are annotation columns of the variant table; the name `proteins` is kept so
# that any other cell referring to it keeps working.
proteins = ["AB", "ABP", "AC", "AF", "AO", "DPRA", "EPP", "MQM", "PAO",
            "PQA", "QA", "RPL", "RPP", "type",
            "RPR", "RUN", "SAF", "SAP", "SAR",
]

# Replace each wrapped cell by its first element. Only the first entry is kept,
# so any additional entries in a cell are dropped.
#
# The unwrap is guarded so the cell can be re-run safely: once a column holds
# plain scalars, `.str[0]` would raise AttributeError on numeric columns and
# would cut the string column `type` down to its first character.
for field in proteins:
    column = response[field]
    is_wrapped = column.map(
        lambda value: isinstance(value, (list, tuple, np.ndarray))
    )
    if is_wrapped.any():
        response[field] = column.str[0]

In [6]:
# One-hot encode the variant type: one integer 0/1 column per distinct value
# found in `response["type"]`.
dummies = pd.get_dummies(data=response["type"], dtype=int)

In [7]:
# Annotation columns of `response` used as predictors.
numeric_feature_columns = [
    "AB", "ABP", "AC", "AF", "AN", "AO", "DP", "DPB", "DPRA", "EPP",
    "EPPR", "GTI", "MQM", "MQMR", "NS", "NUMALT", "ODDS", "PAO",
    "PQA", "PQR", "PRO", "QA", "QR", "RO", "RPL", "RPP", "RPPR",
    "RPR", "RUN", "SAF", "SAP", "SAR", "SRF", "SRP", "SRR",
]

# Feature matrix: the predictor columns placed side by side with the one-hot
# encoded variant type.
X = pd.concat([response[numeric_feature_columns], dummies], axis="columns")

# Targets: the four per-nucleotide count columns.
target_columns = ["DNA_A", "DNA_C", "DNA_G", "DNA_T"]
y = response[target_columns]

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import torch
import torch.nn as nn

from tqdm.notebook import tnrange, tqdm_notebook

In [9]:
# Hold out 33% of the rows as a test set; the fixed random_state makes the
# split repeatable for the same input.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1337,
)

In [11]:
# Display the dimensions of the feature matrix as (rows, columns).
X.shape

(100000, 40)