# Neural net

In [14]:
import pandas as pd
import numpy as np
import bq_helper
from bq_helper import BigQueryHelper
# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Layer, Dense, Dropout, Bidirectional, Embedding, LSTM, GRU, BatchNormalization, TextVectorization, Flatten
from tensorflow.keras.optimizers import Adam
genomes = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                   dataset_name="genomics_cannabis")

To swiftly go through the dataset:

In [15]:
bq_assistant = BigQueryHelper("bigquery-public-data", "genomics_cannabis")
bq_assistant.list_tables()

['MNPR01_201703',
 'MNPR01_reference_201703',
 'MNPR01_transcriptome_201703',
 'cs10_gff',
 'cs3k_project_info',
 'cs3k_vcf_cs10_dv090',
 'sample_info_201703']

In [16]:
query = """SELECT
  variant_id, alternate_bases, quality, type, 
  AB, ABP, AC, AF, AN, AO, DP, DPB, DPRA, EPP, 
  EPPR, GTI, MQM, MQMR, NS, NUMALT, ODDS, PAO,
  PQA, PQR, PRO, QA, QR, RO, RPL, RPP, RPPR, 
  RPR, RUN, SAF, SAP, SAR, SRF, SRP, SRR
FROM
  `bigquery-public-data.genomics_cannabis.MNPR01_201703` v
limit 1000000;"""
response = genomes.query_to_pandas_safe(query, max_gb_scanned=100)
response.head(10)



Unnamed: 0,variant_id,alternate_bases,quality,type,AB,ABP,AC,AF,AN,AO,...,RPP,RPPR,RPR,RUN,SAF,SAP,SAR,SRF,SRP,SRR
0,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTY4MDF8Z2J8TU5QUj...,[TCGAAG],24.7787,[snp],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[0],[1],[0],[5.18177],[1],0,0.0,0
1,CKXG8eKP9qOf8wESIGdpfDEwOTg0ODk3ODZ8Z2J8TU5QUj...,[TTTTGC],17.8768,[mnp],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[0],[1],[1],[5.18177],[0],0,0.0,0
2,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTY3OTJ8Z2J8TU5QUj...,[TTCAATTTGT],29.5205,[complex],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[0],[1],[1],[5.18177],[0],0,0.0,0
3,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTU3NTR8Z2J8TU5QUj...,[ATAG],3.77924,[snp],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[1],[1],[0],[5.18177],[1],0,0.0,0
4,CKXG8eKP9qOf8wESIGdpfDEwOTg0ODk1NjR8Z2J8TU5QUj...,[TAAA],24.8904,[snp],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[0],[1],[0],[5.18177],[1],0,0.0,0
5,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTMwNDB8Z2J8TU5QUj...,[GAATTA],21.5174,[complex],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[1],[1],[0],[5.18177],[1],0,0.0,0
6,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTQ5MTB8Z2J8TU5QUj...,[TAAAAAAT],28.7716,[del],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[0],[1],[0],[5.18177],[1],0,0.0,0
7,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTYwNTl8Z2J8TU5QUj...,[GAAAA],13.9416,[complex],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[0],[1],[0],[5.18177],[1],0,0.0,0
8,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTcwNzZ8Z2J8TU5QUj...,[TGGCT],10.7259,[complex],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[0],[1],[0],[5.18177],[1],0,0.0,0
9,CKXG8eKP9qOf8wESIGdpfDEwOTg0ODQ3ODB8Z2J8TU5QUj...,[GA],13.9337,[snp],[0.0],[0.0],[2],[1.0],2,[1],...,[5.18177],0.0,[1],[1],[1],[5.18177],[0],0,0.0,0


To process the DNA strands:

In [17]:
response["DNA_A"] = response["alternate_bases"].str[0].str.count('A')
response["DNA_C"] = response["alternate_bases"].str[0].str.count('C')
response["DNA_G"] = response["alternate_bases"].str[0].str.count('G')
response["DNA_T"] = response["alternate_bases"].str[0].str.count('T')

To correct protein data:

In [18]:
proteins = ["AB", "ABP", "AC", "AF", "AO", "DPRA", "EPP", "MQM", "PAO",
            "PQA", "QA", "RPL", "RPP", "type",
            "RPR", "RUN", "SAF", "SAP", "SAR", 
]

for protein in proteins:
    response[protein] = response[protein].str[0]

To dummify the types:

In [19]:
dummies = pd.get_dummies(response["type"], dtype="int")

Here, I shall define my X and y. 

In [20]:
X = pd.concat([
    response[["AB", "ABP", "AC", "AF", "AN", "AO", "DP", "DPB", "DPRA", "EPP", 
              "EPPR", "GTI", "MQM", "MQMR", "NS", "NUMALT", "ODDS", "PAO",
              "PQA", "PQR", "PRO", "QA", "QR", "RO", "RPL", "RPP", "RPPR", 
              "RPR", "RUN", "SAF", "SAP", "SAR", "SRF", "SRP", "SRR"
    ]], 
    dummies
], axis=1)
y = response[["DNA_A", "DNA_C", "DNA_G", "DNA_T"]]

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import torch
import torch.nn as nn

from tqdm.notebook import tnrange, tqdm_notebook

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=1337)

In [23]:
X.shape

(1000000, 40)

In [24]:
X_train

Unnamed: 0,AB,ABP,AC,AF,AN,AO,DP,DPB,DPRA,EPP,...,SAP,SAR,SRF,SRP,SRR,complex,del,ins,mnp,snp
999016,0.0,0.0,2,1.0,2,2,2,2.000,0,7.35324,...,7.35324,0,0,0.00000,0,0,0,0,0,1
581017,0.0,0.0,2,1.0,2,2,2,2.000,0,3.01030,...,7.35324,0,0,0.00000,0,0,0,0,0,1
46168,0.0,0.0,2,1.0,2,8,9,9.000,0,7.35324,...,20.38210,8,0,0.00000,0,0,0,0,1,0
770720,0.0,0.0,0,0.0,2,2,6,6.000,0,7.35324,...,7.35324,2,4,11.69620,0,0,0,0,0,1
5842,0.0,0.0,2,1.0,2,2,2,2.000,0,7.35324,...,3.01030,1,0,0.00000,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480729,0.0,0.0,2,1.0,2,2,2,2.000,0,7.35324,...,7.35324,2,0,0.00000,0,0,0,0,0,1
216231,0.0,0.0,2,1.0,2,7,10,10.000,0,3.32051,...,18.21060,7,0,0.00000,0,0,0,0,1,0
795880,0.0,0.0,2,1.0,2,2,6,7.375,0,3.01030,...,7.35324,2,0,5.18177,1,1,0,0,0,0
975037,0.0,0.0,2,1.0,2,2,2,2.000,0,3.01030,...,3.01030,1,0,0.00000,0,0,0,0,0,1


In [25]:
X_train = torch.tensor(X_train.to_numpy(dtype='float32'), dtype=torch.float32)
X_test = torch.tensor(X_test.to_numpy(dtype='float32'), dtype=torch.float32)

y_train = torch.tensor(y_train.to_numpy(dtype='float32'), dtype=torch.long)
y_test = torch.tensor(y_test.to_numpy(dtype='float32'), dtype=torch.long)

In [26]:
x = X_train[0:8, :]
target = y_train[0:8].to(torch.float32)

neural_net = nn.Sequential(nn.Linear(40, 4),
                           nn.ReLU(),
                           nn.Linear(4, 1))
mse_loss = nn.MSELoss()

optimizer = torch.optim.SGD(neural_net.parameters(), lr=0.1)

output_values = neural_net(x)
loss = mse_loss(output_values, target)

for _ in range(0, 1000):
    loss.backward()
    optimizer.step()

    new_output = neural_net(x)
    new_loss = mse_loss(new_output, target)
    print(f"Old loss: {loss}; New loss: {new_loss}")
    loss = new_loss

  return F.mse_loss(input, target, reduction=self.reduction)


Old loss: 1.543215036392212; New loss: 1.4304893016815186
Old loss: 1.4304893016815186; New loss: 1.320070743560791
Old loss: 1.320070743560791; New loss: 1.2568469047546387
Old loss: 1.2568469047546387; New loss: 1.2979817390441895
Old loss: 1.2979817390441895; New loss: 1.413811445236206
Old loss: 1.413811445236206; New loss: 1.509568214416504
Old loss: 1.509568214416504; New loss: 1.5124766826629639
Old loss: 1.5124766826629639; New loss: 1.4203264713287354
Old loss: 1.4203264713287354; New loss: 1.3031518459320068
Old loss: 1.3031518459320068; New loss: 1.2500052452087402
Old loss: 1.2500052452087402; New loss: 1.3012783527374268
Old loss: 1.3012783527374268; New loss: 1.4180033206939697
Old loss: 1.4180033206939697; New loss: 1.511469841003418
Old loss: 1.511469841003418; New loss: 1.510642409324646
Old loss: 1.510642409324646; New loss: 1.4161503314971924
Old loss: 1.4161503314971924; New loss: 1.2998077869415283
Old loss: 1.2998077869415283; New loss: 1.2500346899032593
Old loss

This is a low amount of loss. 