# Regression

In [1]:
import pandas as pd
import numpy as np
import bq_helper
from bq_helper import BigQueryHelper
# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package
# Helper bound to the public `genomics_cannabis` dataset; used below to run the query.
genomes = BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="genomics_cannabis",
)

In [2]:
# A second helper on the same project/dataset, used here only to list its tables.
bq_assistant = BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="genomics_cannabis",
)
bq_assistant.list_tables()

['MNPR01_201703',
 'MNPR01_reference_201703',
 'MNPR01_transcriptome_201703',
 'cs10_gff',
 'cs3k_project_info',
 'cs3k_vcf_cs10_dv090',
 'sample_info_201703']

In [3]:
# Select the variant id, alternate allele(s), quality, variant type and the listed
# annotation columns from the MNPR01_201703 table, capped at 10,000,000 rows.
# NOTE(review): the LIMIT has no ORDER BY, so the returned row set is presumably not
# guaranteed to be the same between runs — confirm whether that matters here.
query = """SELECT
  variant_id, alternate_bases, quality, type, 
  AB, ABP, AC, AF, AN, AO, DP, DPB, DPRA, EPP, 
  EPPR, GTI, MQM, MQMR, NS, NUMALT, ODDS, PAO,
  PQA, PQR, PRO, QA, QR, RO, RPL, RPP, RPPR, 
  RPR, RUN, SAF, SAP, SAR, SRF, SRP, SRR
FROM
  `bigquery-public-data.genomics_cannabis.MNPR01_201703` v
limit 10000000;"""
# max_gb_scanned=100 is passed as the scan budget; presumably the helper refuses to
# run a query estimated to scan more than that — confirm against the bq_helper docs.
response = genomes.query_to_pandas_safe(query, max_gb_scanned=100)
# Preview the first ten rows of the result.
response.head(10)



Unnamed: 0,variant_id,alternate_bases,quality,type,AB,ABP,AC,AF,AN,AO,...,RPP,RPPR,RPR,RUN,SAF,SAP,SAR,SRF,SRP,SRR
0,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTAwNDZ8Z2J8TU5QUj...,[TTTGG],59.165,[complex],[0.0],[0.0],[2],[1.0],2,[2],...,[7.35324],0.0,[0],[1],[1],[3.0103],[1],0,0.0,0
1,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTY0Mjh8Z2J8TU5QUj...,[CA],35.2293,[mnp],[0.0],[0.0],[2],[1.0],2,[2],...,[3.0103],0.0,[1],[1],[1],[3.0103],[1],0,0.0,0
2,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTY4NTZ8Z2J8TU5QUj...,[G],59.1733,[snp],[0.0],[0.0],[2],[1.0],2,[2],...,[7.35324],0.0,[0],[1],[1],[3.0103],[1],0,0.0,0
3,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTIwNjR8Z2J8TU5QUj...,[TAA],49.6968,[complex],[0.0],[0.0],[2],[1.0],2,[2],...,[3.0103],0.0,[1],[1],[1],[3.0103],[1],0,0.0,0
4,CKXG8eKP9qOf8wESIGdpfDEwOTg0ODg3OTF8Z2J8TU5QUj...,[C],61.9962,[snp],[0.0],[0.0],[2],[1.0],2,[2],...,[7.35324],0.0,[2],[1],[1],[3.0103],[1],0,0.0,0
5,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTE3NTB8Z2J8TU5QUj...,[TTTTGT],35.7113,[complex],[0.0],[0.0],[2],[1.0],2,[2],...,[7.35324],0.0,[2],[1],[2],[7.35324],[0],0,0.0,0
6,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTcwNTF8Z2J8TU5QUj...,[A],6.2269,[snp],[0.0],[0.0],[2],[1.0],2,[2],...,[3.0103],0.0,[1],[1],[1],[3.0103],[1],0,0.0,0
7,CKXG8eKP9qOf8wESIGdpfDEwOTg0ODQ0OTJ8Z2J8TU5QUj...,[C],57.2772,[snp],[0.0],[0.0],[2],[1.0],2,[2],...,[7.35324],0.0,[0],[1],[1],[3.0103],[1],0,0.0,0
8,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTU5MTN8Z2J8TU5QUj...,[C],57.1536,[snp],[0.0],[0.0],[2],[1.0],2,[2],...,[7.35324],0.0,[2],[1],[1],[3.0103],[1],0,0.0,0
9,CKXG8eKP9qOf8wESIGdpfDEwOTg0OTQ2Mzd8Z2J8TU5QUj...,[A],58.6606,[snp],[0.0],[0.0],[2],[1.0],2,[2],...,[7.35324],0.0,[0],[1],[1],[3.0103],[1],0,0.0,0


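I shall run a regression to predict the DNA base composition (the counts of A, C, G, and T) of each variant's first alternate allele from the remaining annotation columns of the table, which this notebook refers to as "proteins".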

In [4]:
# Number of 'A' bases in the first alternate allele of each variant.
response["DNA_A"] = response["alternate_bases"].str.get(0).str.count("A")

In [5]:
# Number of 'C' bases in the first alternate allele of each variant.
response["DNA_C"] = response["alternate_bases"].str.get(0).str.count("C")

In [6]:
# Number of 'G' bases in the first alternate allele of each variant.
response["DNA_G"] = response["alternate_bases"].str.get(0).str.count("G")

In [7]:
# Number of 'T' bases in the first alternate allele of each variant.
response["DNA_T"] = response["alternate_bases"].str.get(0).str.count("T")

Now I shall adjust the proteins for easy analysis: several of these columns are list-valued (e.g. `[0.0]`), so I keep only the first element of each.

In [8]:
# Columns to unwrap: each one is replaced by the first element of its per-row value.
proteins = [
    "AB", "ABP", "AC", "AF", "AO", "DPRA", "EPP", "MQM", "PAO",
    "PQA", "QA", "RPL", "RPP", "RPR", "RUN", "SAF", "SAP", "SAR",
]

for protein in proteins:
    # Echo the column name so progress is visible while the loop runs.
    print(protein)
    response[protein] = response[protein].str.get(0)

AB
ABP
AC
AF
AO
DPRA
EPP
MQM
PAO
PQA
QA
RPL
RPP
RPR
RUN
SAF
SAP
SAR


In [9]:
# Keep only the first entry of each row's `type` value.
response["type"] = response["type"].str.get(0)

In [10]:
# Tally how many rows fall into each `type` category.
response["type"].value_counts()

type
snp        7206678
complex    1517891
mnp         562705
ins         418708
del         294018
Name: count, dtype: int64

`type` is a categorical variable, so I shall need to one-hot encode it (create one 0/1 dummy column per category).

In [11]:
# One 0/1 integer indicator column per distinct value of `type`.
dummies = pd.get_dummies(data=response["type"], dtype=int)
dummies

Unnamed: 0,complex,del,ins,mnp,snp
0,1,0,0,0,0
1,0,0,0,1,0
2,0,0,0,0,1
3,1,0,0,0,0
4,0,0,0,0,1
...,...,...,...,...,...
9999995,0,0,0,0,1
9999996,0,0,0,0,1
9999997,0,0,0,1,0
9999998,0,0,1,0,0


Now I shall define what I am regressing upon: the targets `y` (the four base counts) and the features `X`.

In [12]:
# Regression targets: the per-base count columns DNA_A, DNA_C, DNA_G, DNA_T (in that order).
y = response[[f"DNA_{base}" for base in "ACGT"]]

In [13]:
# Annotation columns used as predictors, in the order they will appear in X.
feature_columns = [
    "AB", "ABP", "AC", "AF", "AN", "AO", "DP", "DPB", "DPRA", "EPP",
    "EPPR", "GTI", "MQM", "MQMR", "NS", "NUMALT", "ODDS", "PAO",
    "PQA", "PQR", "PRO", "QA", "QR", "RO", "RPL", "RPP", "RPPR",
    "RPR", "RUN", "SAF", "SAP", "SAR", "SRF", "SRP", "SRR",
]

# Feature matrix: the annotation columns followed by the `type` indicator columns.
X = pd.concat([response[feature_columns], dummies], axis="columns")

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [15]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1337
)

# Fit one linear model on all four targets at once (fit() returns the estimator itself).
LR = LinearRegression().fit(X_train, y_train)

# score() is scikit-learn's R^2 for regressors; compare it on the training and held-out rows.
train_score, test_score = LR.score(X_train, y_train), LR.score(X_test, y_test)
print(f"Train score {train_score}, test score {test_score}")

Train score 0.17688309627908458, test score 0.1715368029537846


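I've achieved a test score of about 0.172. Note that this score is the coefficient of determination (R²), not an accuracy: the model explains roughly 17% of the variance in the base counts on the held-out data.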

In [16]:
# Show the feature names in column order; the next cells pair them with the columns of LR.coef_.
X.columns

Index(['AB', 'ABP', 'AC', 'AF', 'AN', 'AO', 'DP', 'DPB', 'DPRA', 'EPP', 'EPPR',
       'GTI', 'MQM', 'MQMR', 'NS', 'NUMALT', 'ODDS', 'PAO', 'PQA', 'PQR',
       'PRO', 'QA', 'QR', 'RO', 'RPL', 'RPP', 'RPPR', 'RPR', 'RUN', 'SAF',
       'SAP', 'SAR', 'SRF', 'SRP', 'SRR', 'complex', 'del', 'ins', 'mnp',
       'snp'],
      dtype='object')

In [17]:
# Raw coefficient array of the fitted model; it is labelled below with y's columns as rows
# and X's columns as columns.
LR.coef_

array([[ 1.19606821e-01,  2.39259210e-04, -3.40445774e+07,
         6.80891549e+07, -5.11617497e+06,  5.62218980e+07,
         1.36947306e-03,  5.55778300e-04,  8.52737537e+05,
        -6.22331614e-05, -1.14092982e-04,  4.38978236e-01,
        -9.91065754e-05, -1.45386589e-03,  1.74872673e+05,
         3.73331512e-02, -4.88673177e-05,  4.56050591e-02,
        -1.31547307e-03,  1.66410987e-03, -4.91242789e-02,
        -3.33295613e-04,  3.34068018e-04,  7.41586337e+05,
        -2.58394426e+07, -5.22700123e-04, -1.07153541e-03,
        -2.58394426e+07, -5.80990498e+01, -3.03824554e+07,
         4.65287364e-04, -3.03824554e+07, -7.41586349e+05,
         5.34175498e-04, -7.41586353e+05, -4.73667288e+03,
        -4.73688813e+03, -4.73616949e+03, -4.73776676e+03,
        -4.73826150e+03],
       [-2.12299488e-01, -3.12614980e-04,  3.60790484e+07,
        -7.21580968e+07,  5.37755707e+06, -7.19035780e+06,
        -1.07668576e-04,  7.00616389e-04, -8.97975720e+05,
        -3.04856825e-06, -1.07

In [18]:
# Label the coefficient matrix: one row per target (y's columns), one column per feature (X's columns).
coeffs = pd.DataFrame(LR.coef_, index=y.columns, columns=X.columns)

In [19]:
# Flip the table so features become rows and each target is a column.
coeffs.T

Unnamed: 0,DNA_A,DNA_C,DNA_G,DNA_T
AB,0.1196068,-0.2122995,-0.2135807,0.1141504
ABP,0.0002392592,-0.000312615,-0.0002761949,-0.0003003659
AC,-34044580.0,36079050.0,29341090.0,-382280.8
AF,68089150.0,-72158100.0,-58682190.0,764561.6
AN,-5116175.0,5377557.0,4377782.0,-24742.35
AO,56221900.0,-7190358.0,-11177880.0,-38000300.0
DP,0.001369473,-0.0001076686,4.457776e-05,0.001050271
DPB,0.0005557783,0.0007006164,0.0005778221,0.0005928221
DPRA,852737.5,-897975.7,-730856.0,5357.076
EPP,-6.223316e-05,-3.048568e-06,2.34647e-05,0.0007089252


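The strongest coefficients by magnitude are the proteins AO, RPL, RPR, SAF, and SAR (AC and AF are of comparable size). These values are on the order of 10^7, and RPL/RPR and SAF/SAR receive identical coefficients, which suggests that several features are highly collinear; the magnitudes should therefore be interpreted with caution rather than read as feature importance.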

Notably, the protein coefficients are also strongly correlated across all four of the DNA base targets.