### module

In [2]:
import logging
import numpy as np
import pickle
import os
import gzip
import ntpath
import metax.WeightDBUtilities as WeightDBUtilities
import metax.PrediXcanFormatUtilities as PrediXcanFormatUtilities
import metax.ThousandGenomesUtilities as ThousandGenomesUtilities
import metax.Logging as Logging
import metax.Utilities as Utilities
import metax.Formats as Formats
import readline
import sys
import os
import numpy as np
import pandas as pd
import sqlite3
from fileinput import filename
from sys import argv




### Functions 

In [13]:
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by the db_file
    :param db_file: database file
    :return: Connection object or None
    """
    try:
        conn = sqlite3.connect(db_file)
        return conn
    except sqlite3.Error as e:
        print(e)
        sys.exit(1)
    return None

# function of matching two lists
def match_list(a, b):
    """ find the indices of matching element 
    :param  a: list a 
            b: list b
    :return: The indices of elements in list a in list b 
    """
    return np.array([ b.index(x) if x in b else -1 for x in a ])

# install r packages
def r_requirement():
    """ Install the required R packages
    """
    utils = rpackages.importr('utils')
    # select a mirror for R packages
    utils.chooseCRANmirror(ind=1) # select the first mirror in the list

    # install R packages     
    importr_try("GBJ")
    # multiple packages installation
    # from rpy2.robjects.vectors import StrVector 
    # if len(package_name) > 0:
    #     utils.install_packages(StrVector(package_name))

def importr_try(pkgname):
    """ Load the R packages
    :param  pkgname: the name of the package
    """
    utils = rpackages.importr('utils')
    try:
        rpack = importr(pkgname)
    except RRuntimeError:
        utils.install_packages(pkgname)
        rpack = importr(pkgname)
    return rpack 


### args

In [None]:
nstart = int(args.start_gene_index)
    
# index of ending task
nend = int(args.end_gene_index)

# single mask dir
single_mask_dir = args.input_folder

# info file
info_file = args.gene_info

# database dir
db_dir = args.weight_db

# covariance dir 
cov_dir = args.cov_dir

# output dir 
out_dir = args.output_dir

# read list of genes
gene_info = pd.read_table(info_file)

# output name
output_name = args.output_name

In [11]:
nstart = 1
    
nend = 15605

# single mask dir
single_mask_dir = "/Users/jerome/Projects/UTMOST/database/mask/SCZ/"

# info file
info_file = "/Users/jerome/Projects/UTMOST/database/gene_info/gene_info.txt" 

# database dir
db_dir = "/Users/jerome/Projects/UTMOST/database/weight_db/"

# covariance dir 
cov_dir = "/Users/jerome/Projects/UTMOST/database/cov/1226/"

# output dir 
out_dir = "/Users/jerome/Projects/UTMOST/results/"

# read list of genes
gene_info = pd.read_table(info_file)

# output name
output_name = "scz"


### main function

In [16]:
P = nend - nstart + 1
gene_ensg = gene_info["gene_ensg"].copy()
gene_id = gene_info["gene_ensg"].copy()
gene_name = gene_info["gene_ensg"].copy()
gene_ensg = sorted(list(gene_ensg))
gene_id = sorted(list(gene_id))
gene_name = sorted(list(gene_name))

# read z-score file
logging.info("Read in z-score files")

# directory of z-score
os.chdir(single_mask_dir) 

# search for files ending with .csv
fi = []

for file in sorted(os.listdir(single_mask_dir)):
    if file.endswith(".csv"):
        fi.append(file)
logging.info(str(len(fi)) + " files in total.")
N = len(fi)    
zscore_dict = {}
for i in range(N):
    nam = "zscore_" + str(i+1)
    zscore_dict[nam] = pd.read_csv(fi[i], header = "infer")


# output file: list of test score and p-value
logging.info("compute p-value for genes")
#directory of db
os.chdir(db_dir) 
# initialize the outcome matrix
outcome = pd.DataFrame(np.zeros(shape =(P,48)))
outcome.loc[:,0] = gene_id[(nstart-1):nend]
outcome.loc[:,1] = gene_name[(nstart-1):nend]
outcome = outcome.rename(columns={0:"gene_id",1:"gene_name"})

# read the database 
fi = []
for file in sorted(os.listdir(db_dir)):
    if file.endswith(".db"):
        fi.append(file)

# calculation        
for k in range(P):
    logging.info("Gene: " + str(k + nstart))
    gene = gene_ensg[k + nstart -1]
    if k % 150 ==0:
        print("Processed:  %" + str(round(float(k)/float(P),2) * 100))
    print(gene)
    #read snp list
    #snp_rsid

    try:
        filename = cov_dir + "/"+ gene + ".snplist"
        snp_rsid = pd.read_table(filename, header = None)
    except:
        continue
    snp_rsid = list(snp_rsid.loc[:,0])

    #matrix of weights
    M = len(snp_rsid) #number of snps
    logging.info("Number of SNPs: " + str(M))
    weights = np.zeros(shape = (M, N))
    for i in range(N):
        #logging.info("Database: " + str(i+1))
        dbname = fi[i]
        conn = create_connection(dbname)
        cur = conn.cursor()  
        sql_q = 'select * from weights where gene = "' + gene + '"'
        tmp_query = cur.execute(sql_q).fetchall()
        rsid_in_db = list(map(lambda x: str(x[0]), tmp_query))
        #rsid_in_db = map(lambda x: str(x[0]), tmp_query)
        index = match_list(rsid_in_db, snp_rsid)
        indi = index[index > -1]
        # extract the weight
        sql_q = 'select * from weights where gene = "' + gene + '"'
        tmp_query = cur.execute(sql_q).fetchall()
        tmp_weights = np.array(list(map(lambda x: str(x[2]), tmp_query)))
        #tmp_weights = np.array(map(lambda x: str(x[2]), tmp_query))
        if sum(index > -1) > 0:
            weights[indi,i] = tmp_weights[index > -1]

    # covariance matrix of snps
    cov_file = cov_dir + "/" + gene_id[k + nstart - 1] + ".cov"
    cov_matrix = np.loadtxt(cov_file)

    # covariance matrix of gene in different tissue
    cov_gene = np.mat(weights.T) * np.mat(cov_matrix) * np.mat(weights)
    cov_gene = np.array(cov_gene)

    # normalization
    for i in range(N):
        if cov_gene[i,i] != 0:
            cov_gene[i,:] = cov_gene[i,:] / np.sqrt(cov_gene[i,i])
            cov_gene[:,i] = cov_gene[:,i] / cov_gene[i,i]

    #z-score of gene in different tissue
    zscore_gene = np.full([N, 1], np.nan)   
    for i in range(N):
        nam = "zscore_" + str(i+1)
        index = zscore_dict[nam]["gene"] == gene
        if sum(index) > 0:
            zscore_gene[i] = zscore_dict[nam]["zscore"][index].values[0]
            #p-value
            outcome.loc[k, (i+4)] = float(zscore_dict[nam]["pvalue"][index].values[0])

    #only keep tissues with prediction model for gene
    index = np.isnan(zscore_gene) == False
    indext = index.T[0]
    if sum(index) > 0:
        zscore_gene = zscore_gene[index]
        cov_gene = cov_gene[indext,:][:,indext]
    else:
        # test cannot be done
        continue
    if not np.allclose(cov_gene,cov_gene.T):
        print(gene)

Processed:  %0.0
A1BG
A1CF
A2M
A2ML1
A3GALT2
A4GALT
A4GNT
AAAS
AADAC
AADACL2
AADACL4
AADAT
AAED1
AAGAB
AAK1
AAMDC
AAMP
AANAT
AAR2
AARD
AARS
AARS2
AASDH
AASDHPPT
AASS
AATF
AATK
ABAT
ABCA1
ABCA10
ABCA12
ABCA13
ABCA3
ABCA4
ABCA5
ABCA6
ABCA7
ABCA8
ABCA9
ABCB1
ABCB10
ABCB11
ABCB4
ABCB5
ABCB8
ABCB9
ABCC1
ABCC10
ABCC12
ABCC2
ABCC3
ABCC4
ABCC5
ABCC6
ABCC8
ABCC9
ABCD2
ABCD3
ABCD4
ABCE1
ABCF2
ABCF3
ABCG2
ABCG4
ABCG5
ABCG8
ABHD1
ABHD10
ABHD11
ABHD12
ABHD12B
ABHD14B
ABHD15
ABHD16B
ABHD17A
ABHD17B
ABHD2
ABHD3
ABHD4
ABHD6
ABHD8
ABI1
ABI2
ABI3
ABI3BP
ABL2
ABLIM1
ABLIM3
ABRA
ABRACL
ABT1
ABTB1
ABTB2
ACAA1
ACAA2
ACACB
ACAD10
ACAD8
ACAD9
ACADL
ACADM
ACADS
ACADSB
ACADVL
ACAN
ACAP1
ACAP2
ACAP3
ACAT1
ACAT2
ACBD3
ACBD4
ACBD5
ACBD6
ACCS
ACCSL
ACD
ACE
ACER1
ACER3
ACHE
ACIN1
ACKR2
ACKR3
ACKR4
ACLY
ACMSD
ACN9
ACO1
ACO2
ACOT1
ACOT11
ACOT12
ACOT13
ACOT2
ACOT4
ACOT6
ACOT7
ACOT8
ACOX3
ACOXL
ACP1
ACP2
ACP5
ACP6
ACPL2
ACPP
ACPT
ACR
ACRBP
Processed:  %1.0
ACRV1
ACSBG1
ACSBG2
ACSF2
ACSF3
ACSL1
ACSL3
ACSL6
ACSM1
ACSM2A
A

BFSP2
BGLAP
BHLHA15
BHLHE22
BHLHE23
BHLHE40
BHMT
BHMT2
BICC1
BICD1
BICD2
BID
BIK
BIN1
BIN3
BIRC2
BIRC3
BIRC5
BIRC6
BIRC7
BIRC8
BIVM
BLACE
BLCAP
BLK
BLM
BLNK
BLOC1S1
BLOC1S2
BLOC1S4
BLOC1S5
BLOC1S6
BLVRA
BLVRB
BLZF1
BMF
BMP10
BMP2K
BMP3
BMP4
BMP5
BMP7
BMP8B
BMPER
BMPR1A
BMPR1B
BMPR2
BMS1
BNC1
BNC2
BNIP1
BNIP2
BNIP3
BNIP3L
BNIPL
BOC
BOD1
BOD1L1
BOD1L2
BOK
BOLA1
BOLA2
BOLA2B
BOLA3
BOLL
BOP1
BORA
BPGM
BPHL
BPI
BPIFA1
BPIFA2
BPIFA3
BPIFB1
BPIFB2
BPIFB3
BPIFB4
BPIFB6
BPIFC
BPNT1
BPTF
BRAF
BRAP
BRAT1
BRCA1
BRCA2
BRD1
BRD3
BRD4
BRD7
BRD8
BRDT
BRE
BRF1
BRF2
BRI3
BRI3BP
BRICD5
BRINP1
BRINP2
BRINP3
BRIP1
BRIX1
BRK1
BRMS1
BRMS1L
BROX
BRPF1
BRPF3
BRSK1
BRSK2
BRWD1
BSCL2
Processed:  %9.0
BSG
BSN
BSND
BSPRY
BST1
BST2
BTAF1
BTBD1
BTBD10
BTBD11
BTBD16
BTBD17
BTBD18
BTBD19
BTBD2
BTBD3
BTBD6
BTBD7
BTBD8
BTBD9
BTC
BTD
BTF3
BTG1
BTG3
BTLA
BTN1A1
BTN2A1
BTN2A2
BTN3A1
BTN3A2
BTNL3
BTNL8
BTNL9
BTRC
BUB1
BUB1B
BUB3
BVES
BYSL
BZRAP1
BZW1
C10orf105
C10orf107
C10orf11
C10orf111
C10orf113
C10orf115
C10orf118
C10or

CCDC9
CCDC90B
CCDC91
CCDC92
CCDC93
CCDC94
CCDC96
CCER1
CCER2
CCIN
CCKBR
CCL1
CCL11
CCL13
CCL14
CCL15
CCL16
CCL17
CCL19
CCL20
CCL22
CCL23
CCL24
CCL25
CCL26
CCL27
CCL28
CCL3
CCL3L1
CCL3L3
CCL4
CCL4L1
CCL4L2
CCL5
CCL7
CCL8
CCM2
CCM2L
CCNA1
CCNA2
CCNB1IP1
CCNB2
CCNC
CCND1
CCND3
CCNDBP1
CCNE1
CCNE2
CCNG1
CCNG2
CCNH
CCNI
CCNI2
CCNJ
CCNJL
CCNK
CCNL1
CCNL2
CCNO
CCNT2
CCP110
CCPG1
CCR1
CCR2
CCR3
CCR4
CCR5
CCR6
CCR7
CCR8
CCR9
CCRL2
CCRN4L
CCS
CCSAP
CCSER1
CCSER2
CCT2
CCT3
CCT5
CCT6B
CCT7
CCT8
CCT8L2
CCZ1
CCZ1B
CD101
CD109
CD14
CD151
CD160
CD163
CD163L1
CD164
CD164L2
CD180
CD19
CD1A
CD1B
CD1C
Processed:  %15.0
CD1D
CD1E
CD200
CD200R1
CD200R1L
CD207
CD22
CD226
CD244
CD27
CD274
CD276
CD28
CD2AP
CD300A
CD300C
CD300E
CD300LB
CD300LD
CD300LF
CD300LG
CD302
CD320
CD33
CD34
CD36
CD37
CD38
CD3D
CD3E
CD3EAP
CD3G
CD4
CD40
CD44
CD46
CD47
CD48
CD52
CD53
CD55
CD58
CD59
CD5L
CD6
CD63
CD68
CD69
CD7
CD79B
CD80
CD82
CD83
CD84
CD86
CD8A
CD8B
CD9
CD93
CD96
CD97
CDA
CDADC1
CDAN1
CDC123
CDC14A
CDC16
CDC20
CDC20B
CDC25

DDX25
DDX27
DDX31
DDX39A
DDX4
DDX42
DDX43
DDX47
DDX49
DDX5
DDX50
DDX51
DDX52
DDX55
DDX56
DDX58
DDX59
DDX6
DDX60L
DEAF1
DEC1
DECR1
DECR2
DEDD2
DEF6
DEF8
DEFA1
DEFA1B
DEFA3
DEFA5
DEFB1
DEFB118
DEFB119
DEFB121
DEFB123
DEFB124
DEFB125
DEFB126
DEFB129
DEFB132
DEFB134
DEFB135
DEFB4A
DEFB4B
DEGS1
DEGS2
DEK
DENND1A
DENND1B
DENND2A
DENND2C
DENND2D
DENND3
DENND4A
DENND5A
DENND5B
DENND6A
DENND6B
DENR
DEPDC1
DEPDC1B
DEPDC4
DEPDC5
DEPDC7
DEPTOR
DERA
DERL1
DERL3
DES
DESI1
DET1
DEXI
DFFA
DFFB
DFNA5
DFNB31
DFNB59
DGAT1
DGAT2
DGCR14
DGCR2
DGCR6
DGCR6L
DGCR8
DGKA
DGKB
DGKD
DGKE
DGKG
DGKH
DGKQ
DGKZ
DHCR24
DHCR7
DHDDS
DHDH
DHFR
DHFRL1
DHH
DHODH
DHRS1
DHRS11
DHRS12
DHRS13
DHRS2
DHRS4
DHRS4L2
DHRS7
DHRS7B
DHRS7C
DHRS9
DHTKD1
DHX15
DHX30
DHX32
DHX33
DHX34
DHX35
DHX36
DHX37
DHX38
DHX40
DHX58
DHX8
DIAPH3
DICER1
DIDO1
DIEXF
DIMT1
DIO1
DIO2
DIO3
DIP2A
DIP2B
DIP2C
DIRAS1
DIRAS2
DIRAS3
DIRC1
DIRC3
DIS3L
DIS3L2
DISC1
DISP1
DISP2
DKK3
DKK4
DKKL1
Processed:  %24.0
DLAT
DLC1
DLD
DLEC1
DLEU1
DLEU2L
DLEU7
DLG1
DLG4
DLG5

FCF1
FCGBP
FCGR1A
FCGR1B
FCGR2B
FCGR3A
FCGR3B
FCGRT
FCHO1
FCHO2
FCHSD1
FCHSD2
FCN1
FCN2
FCN3
FCRL3
FCRL5
FCRL6
FCRLA
FCRLB
FDCSP
FDPS
FDX1
FDX1L
FDXACB1
FDXR
FECH
FEM1A
FEM1C
FEN1
FER
FERMT1
FERMT2
FERMT3
FES
FETUB
FEV
FEZ1
FEZ2
FEZF1
FFAR1
FFAR2
FFAR3
FFAR4
FGA
FGB
FGD2
FGD3
FGD4
FGF1
FGF10
FGF11
FGF12
FGF14
FGF17
FGF18
FGF19
FGF2
FGF20
FGF21
FGF22
FGF23
FGF3
FGF4
FGF6
FGF7
FGF8
FGF9
FGFBP1
FGFBP2
FGFBP3
FGFR1
FGFR1OP
FGFR1OP2
FGFR2
FGFR3
FGFR4
FGFRL1
FGG
FGGY
FGL1
FGL2
FGR
FH
FHAD1
FHIT
FHL2
FHL3
FHL5
FHOD1
FHOD3
FIBCD1
FIBIN
FIBP
FICD
FIG4
FIGLA
FIGN
Processed:  %32.0
FIGNL1
FILIP1
FILIP1L
FIS1
FITM1
FITM2
FIZ1
FJX1
FKBP10
FKBP11
FKBP14
FKBP15
FKBP1A
FKBP1B
FKBP1C
FKBP3
FKBP4
FKBP7
FKBP8
FKBP9
FKRP
FLAD1
FLCN
FLG
FLG2
FLII
FLNB
FLNC
FLRT1
FLRT2
FLRT3
FLT3
FLT4
FLVCR1
FLVCR2
FLYWCH1
FLYWCH2
FMN1
FMN2
FMNL2
FMO1
FMO2
FMO3
FMO4
FMO6P
FMOD
FN1
FN3K
FN3KRP
FNBP1
FNBP4
FNDC1
FNDC3B
FNDC4
FNDC5
FNDC7
FNDC8
FNDC9
FNIP1
FNIP2
FNTA
FNTB
FOCAD
FOLH1
FOLR1
FOLR2
FOLR3
FOLR4
FOPNL
FOSB
FOSL1
FOS

HOXB8
HOXB9
HOXC10
HOXC13
HOXC6
HOXC8
HOXC9
HOXD1
HOXD10
HOXD11
HOXD12
HOXD13
HOXD3
HOXD4
HOXD8
HOXD9
HP
HP1BP3
HPCA
HPCAL1
HPCAL4
HPD
HPDL
HPGD
HPGDS
HPR
HPS1
HPS3
HPS4
HPS5
HPS6
HPSE
HPSE2
HPX
HR
HRAS
HRASLS
HRASLS2
HRASLS5
HRC
HRCT1
HRG
HRH2
HRH3
HRH4
HRK
HRNR
HRSP12
HS1BP3
HS2ST1
HS3ST3A1
HS3ST3B1
HS3ST5
HS3ST6
HS6ST1
HS6ST3
HSBP1
HSBP1L1
HSCB
HSD11B1
HSD11B2
HSD17B1
HSD17B11
HSD17B12
HSD17B13
HSD17B14
HSD17B2
HSD17B3
HSD17B4
HSD17B6
HSD3B1
HSD3B2
HSD3B7
HSDL2
HSF1
HSF2
HSF2BP
HSPA12A
HSPA12B
HSPA13
HSPA14
HSPA4
HSPA4L
HSPA8
HSPA9
HSPB11
HSPB3
HSPB8
HSPB9
HSPBAP1
HSPBP1
HSPD1
HSPE1
HSPG2
HSPH1
HTATIP2
HTN3
HTR1A
HTR1B
HTR1D
HTR1E
HTR1F
HTR2A
HTR2B
HTR3B
HTR3C
HTR3E
HTR4
HTR5A
HTR5A-AS1
HTR6
HTR7
HTRA1
HTRA4
HTT
HUNK
HUS1
HUS1B
HVCN1
HYAL1
HYAL3
HYDIN
HYI
HYKK
HYLS1
HYOU1
HYPK
IAH1
IAPP
IBA57
IBA57-AS1
IBSP
IBTK
ICA1
ICA1L
ICAM1
ICAM2
ICAM3
ICAM5
ICK
ICOS
ICOSLG
ID1
ID4
IDE
IDH1
IDH2
IDH3A
Processed:  %40.0
IDH3B
IDI1
IDI2
IDNK
IDO1
IDO2
IDUA
IER2
IER3IP1
IER5
IFFO1
IFI16
IFI27
IFI2

LIMA1
LIMCH1
LIMD1
LIMD2
LIME1
LIMK1
LIMS1
LIMS2
LIMS3
LIN28A
LIN28B
LIN37
LIN52
LIN54
LIN7A
LIN7B
LIN7C
LIN9
LINC00346
LINC00692
LINC00696
LINC00908
LINC00923
LINC00935
LINC00955
LINC00998
LINC00999
LINC01098
LINC01100
LINC01101
LINC01118
LINC01119
LINC01124
LINGO1
LINGO3
LINGO4
LINS
LIPA
LIPC
LIPE
LIPG
LIPH
LIPI
LIPJ
LIPK
LIPM
LIPN
LIPT1
LIPT2
LITAF
LIX1
LIX1L
LLGL1
LLGL2
LLPH
LMAN1
LMAN1L
LMAN2
LMAN2L
LMBR1
LMBR1L
LMBRD1
LMBRD2
LMCD1
LMF1
LMF2
LMLN
LMNB1
LMNB2
LMO1
LMO2
LMO4
LMO7
LMOD1
LMOD3
LMX1A
LNP1
LNPEP
LNX1
LNX2
LOH12CR1
LONP1
LONP2
LONRF1
LONRF2
LOR
LOX
LOXHD1
LOXL1
LOXL2
LOXL3
LOXL4
LPA
LPAR1
LPAR2
LPAR6
LPCAT1
LPCAT2
LPCAT3
LPCAT4
LPGAT1
LPHN1
Processed:  %48.0
LPHN2
LPHN3
LPIN1
LPIN2
LPIN3
LPL
LPO
LPXN
LRAT
LRBA
LRFN1
LRFN2
LRFN3
LRFN4
LRFN5
LRG1
LRGUK
LRIF1
LRIG1
LRIG3
LRIT1
LRIT2
LRIT3
LRMP
LRP1
LRP10
LRP11
LRP12
LRP1B
LRP2
LRP2BP
LRP3
LRP4
LRP5
LRP5L
LRP6
LRP8
LRPAP1
LRPPRC
LRR1
LRRC1
LRRC10
LRRC10B
LRRC14
LRRC14B
LRRC16A
LRRC16B
LRRC17
LRRC18
LRRC19
LRRC2
LRRC20
LRRC23

NDUFA5
NDUFA6
NDUFA7
NDUFA8
NDUFA9
NDUFAF1
NDUFAF2
NDUFAF3
NDUFAF4
NDUFAF5
NDUFAF6
NDUFAF7
NDUFB1
NDUFB10
NDUFB4
NDUFB5
NDUFB8
NDUFC1
NDUFC2
NDUFS1
NDUFS2
NDUFS3
NDUFS4
NDUFS5
NDUFS6
NDUFS7
NDUFS8
NDUFV1
NDUFV3
NEB
NEBL
NECAB1
NECAB2
NECAB3
NECAP1
NECAP2
NEDD1
NEDD4L
NEDD8
NEDD9
NEFH
NEFM
NEGR1
NEIL1
NEIL2
NEIL3
NEK1
NEK10
NEK11
NEK2
NEK3
NEK4
NEK5
NEK7
NEK8
NEK9
NELFA
NELFB
NELFCD
NELL1
NEMF
NENF
NEO1
NES
NET1
NETO1
NETO2
NEU2
NEU3
NEU4
NEURL1
NEURL1B
NEURL2
NEURL4
NEUROD2
NEUROD4
NEUROG1
NEUROG2
NEUROG3
NEXN
NF1
NF2
NFASC
NFAT5
NFATC1
NFATC2IP
NFATC3
NFATC4
NFE2
NFE2L1
NFE2L2
NFE2L3
NFIA
NFIB
NFIC
NFIL3
NFIX
NFKB2
NFKBIA
NFKBIB
NFKBIE
NFKBIZ
NFRKB
NFS1
NFU1
NFX1
NFXL1
NFYA
NFYB
NFYC
NGB
NGEF
NGF
NGLY1
NGRN
NHEJ1
NHLH1
NHLH2
NHLRC1
NHLRC4
NHP2L1
NHSL1
NICN1
NID1
NID2
NIF3L1
NIFK
NIN
NINJ1
NINJ2
NINL
NIP7
NIPA1
NIPA2
NIPAL1
NIPAL2
NIPAL4
NIPSNAP1
NIPSNAP3A
NIPSNAP3B
NISCH
NIT2
NKAIN1
NKAIN2
NKAIN3
Processed:  %57.0
NKAIN4
NKAPL
NKD1
NKD2
NKG7
NKIRAS1
NKIRAS2
NKPD1
NKTR
NKX1-1
NKX1-2
NK

PIK3R2
PIK3R3
PIK3R4
PIK3R6
PILRA
PILRB
PIM1
PIM3
PIN1
PINK1
PINLYP
PINX1
PIP
PIP4K2A
PIP4K2B
PIP4K2C
PIP5K1A
PIP5K1B
PIP5K1C
PIP5KL1
PIPOX
PIRT
PISD
PITHD1
PITPNA
PITPNB
PITPNM3
PITRM1
PITX1
PITX3
PIWIL1
PIWIL2
PIWIL3
PIWIL4
PJA2
PKD1
PKD1L1
PKD2
PKD2L1
PKD2L2
PKDCC
PKDREJ
Processed:  %64.0
PKHD1L1
PKIB
PKIG
PKLR
PKM
PKMYT1
PKN2
PKN3
PKNOX1
PKNOX2
PKP1
PKP2
PKP4
PLA1A
PLA2G10
PLA2G12A
PLA2G12B
PLA2G15
PLA2G16
PLA2G1B
PLA2G2A
PLA2G2C
PLA2G2D
PLA2G2E
PLA2G3
PLA2G4A
PLA2G4B
PLA2G4C
PLA2G4D
PLA2G4E
PLA2G4F
PLA2G6
PLA2G7
PLA2R1
PLAC8
PLAC8L1
PLAC9
PLAGL1
PLAGL2
PLAT
PLAU
PLAUR
PLB1
PLBD1
PLBD2
PLCB1
PLCB2
PLCB3
PLCB4
PLCD1
PLCD3
PLCD4
PLCE1
PLCG1
PLCG2
PLCH1
PLCH2
PLCL1
PLCL2
PLCXD2
PLCXD3
PLCZ1
PLD1
PLD2
PLD3
PLD4
PLD5
PLD6
PLEC
PLEK2
PLEKHA1
PLEKHA2
PLEKHA3
PLEKHA4
PLEKHA5
PLEKHA6
PLEKHA7
PLEKHA8
PLEKHB1
PLEKHB2
PLEKHF1
PLEKHF2
PLEKHG1
PLEKHG2
PLEKHG3
PLEKHG4
PLEKHG4B
PLEKHG6
PLEKHG7
PLEKHH1
PLEKHH2
PLEKHH3
PLEKHJ1
PLEKHM1
PLEKHM2
PLEKHM3
PLEKHN1
PLEKHO1
PLEKHO2
PLEKHS1
PLET1
PLG
PLGLB1


RIMKLB
RIMS1
RIMS2
RIMS3
RIMS4
RIN1
RIN2
RINL
RINT1
RIOK1
RIOK2
RIOK3
RIPK1
RIPK2
RIPK3
RIPPLY2
RIT1
RIT2
RITA1
RLBP1
RLF
RLN1
RLN2
RLN3
RLTPR
RMDN1
RMDN2
RMI1
RMI2
RMND1
RMND5A
RMND5B
RNASE1
RNASE10
RNASE11
RNASE12
RNASE2
RNASE3
RNASE4
RNASE6
RNASE7
RNASE8
RNASEH1
RNASEH2A
RNASEH2B
RNASEH2C
RNASEK
RNASEL
RNASET2
RND1
RND2
RNF10
RNF103
RNF11
RNF112
RNF113B
RNF114
RNF115
RNF121
RNF122
RNF123
RNF125
RNF126
RNF13
RNF130
RNF133
RNF135
RNF138
RNF139
RNF14
RNF141
RNF144A
RNF144B
RNF146
RNF148
RNF149
RNF150
RNF151
RNF152
RNF157
RNF165
RNF166
RNF168
RNF169
RNF17
RNF170
RNF175
RNF180
RNF182
RNF183
RNF186
RNF187
RNF19A
RNF19B
RNF2
RNF207
RNF208
RNF212
RNF213
RNF214
RNF215
RNF217
RNF219
RNF220
RNF222
RNF223
RNF224
RNF24
RNF25
RNF26
RNF32
RNF38
RNF4
RNF40
RNF41
RNF43
RNF44
RNF6
RNF7
RNF8
RNFT1
RNFT2
RNGTT
RNH1
RNLS
RNMT
RNMTL1
RNPC3
RNPEP
RNPEPL1
RNPS1
ROBO1
ROBO3
ROBO4
ROCK1
ROCK2
Processed:  %73.0
ROGDI
ROM1
ROPN1
ROPN1B
ROPN1L
ROR2
RORA
RORB
RORC
ROS1
RP1
RP1L1
RP9
RPA1
RPA2
RPA3
RPA3-AS1
RPAIN

SLCO2B1
SLCO3A1
SLCO4A1
SLCO4C1
SLCO5A1
SLCO6A1
SLFN11
SLFN12
Processed:  %80.0
SLFN12L
SLFN13
SLFN14
SLFN5
SLFNL1
SLIRP
SLIT1
SLIT2
SLITRK3
SLITRK6
SLK
SLMAP
SLMO1
SLMO2
SLN
SLPI
SLU7
SLURP1
SLX1B
SLX4
SLX4IP
SMAD2
SMAD3
SMAD5
SMAD7
SMAD9
SMAGP
SMAP1
SMAP2
SMARCA2
SMARCA4
SMARCAD1
SMARCAL1
SMARCB1
SMARCC1
SMARCC2
SMARCD1
SMARCD3
SMARCE1
SMC1B
SMC2
SMC3
SMC4
SMC6
SMCHD1
SMCO1
SMCO2
SMCO4
SMCP
SMCR8
SMDT1
SMEK2
SMG1
SMG5
SMG6
SMG7
SMG8
SMG9
SMIM1
SMIM11
SMIM12
SMIM14
SMIM15
SMIM17
SMIM18
SMIM19
SMIM2
SMIM20
SMIM21
SMIM22
SMIM23
SMIM3
SMIM5
SMIM6
SMIM7
SMIM8
SMKR1
SMLR1
SMN1
SMN2
SMO
SMOC1
SMOC2
SMOX
SMPD2
SMPD4
SMPDL3A
SMPDL3B
SMR3A
SMR3B
SMTNL1
SMU1
SMUG1
SMURF2
SMYD1
SMYD2
SMYD3
SMYD4
SMYD5
SNAI1
SNAI3
SNAP25
SNAP29
SNAP47
SNAP91
SNAPC1
SNAPC2
SNAPC3
SNAPC4
SNAPC5
SNAPIN
SNCA
SNCAIP
SNCB
SND1
SNED1
SNF8
SNIP1
SNN
SNPH
SNRK
SNRNP200
SNRNP25
SNRNP27
SNRNP35
SNRNP48
SNRNP70
SNRPA
SNRPA1
SNRPB
SNRPB2
SNRPC
SNRPD1
SNRPD2
SNRPE
SNRPF
SNRPG
SNRPN
SNTA1
SNTB1
SNTB2
SNTG1
SNTG2
SNUPN
SNW1
SNX1

TMEM184B
TMEM184C
TMEM185B
TMEM186
TMEM189
TMEM19
TMEM190
TMEM192
TMEM194A
TMEM194B
TMEM196
TMEM198
TMEM199
TMEM2
TMEM200A
TMEM200B
TMEM200C
TMEM202
TMEM203
TMEM204
TMEM205
TMEM206
TMEM208
TMEM209
TMEM210
TMEM211
TMEM212
TMEM213
TMEM215
TMEM216
TMEM217
TMEM218
TMEM219
TMEM220
TMEM221
TMEM223
TMEM225
TMEM229A
TMEM229B
TMEM230
TMEM231
TMEM232
TMEM233
TMEM234
TMEM235
TMEM236
TMEM237
TMEM238
TMEM239
TMEM240
TMEM241
TMEM242
TMEM243
TMEM244
TMEM245
TMEM247
TMEM248
TMEM249
TMEM25
TMEM251
TMEM252
TMEM253
TMEM254
TMEM255B
TMEM256
TMEM256-PLSCR3
TMEM258
TMEM259
TMEM26
TMEM260
TMEM262
TMEM30B
TMEM30C
TMEM33
TMEM37
TMEM38A
TMEM38B
TMEM39A
TMEM40
TMEM41A
TMEM41B
TMEM42
TMEM43
TMEM44
TMEM45A
TMEM45B
TMEM5
TMEM50A
TMEM50B
TMEM51
TMEM52
TMEM52B
TMEM53
TMEM54
TMEM55A
TMEM55B
TMEM56
TMEM57
TMEM59L
TMEM60
TMEM61
TMEM63A
TMEM63B
TMEM63C
TMEM64
TMEM65
TMEM67
Processed:  %88.0
TMEM68
TMEM69
TMEM70
TMEM71
TMEM72
TMEM74
TMEM74B
TMEM75
TMEM78
TMEM79
TMEM80
TMEM81
TMEM82
TMEM86A
TMEM86B
TMEM87A
TMEM87B
TMEM88
T

YTHDF2
YTHDF3
YWHAB
YWHAE
YWHAH
YWHAQ
YWHAZ
YY1
YY1AP1
ZACN
ZADH2
ZAP70
ZAR1L
ZBBX
ZBED2
ZBED3
ZBED4
ZBED6
ZBED6CL
ZBP1
ZBTB1
ZBTB10
ZBTB11
ZBTB16
ZBTB18
ZBTB20
ZBTB21
ZBTB24
ZBTB25
ZBTB26
ZBTB3
ZBTB34
ZBTB37
ZBTB39
ZBTB4
ZBTB40
ZBTB41
ZBTB42
ZBTB43
ZBTB44
ZBTB45
ZBTB46
ZBTB47
ZBTB48
ZBTB6
ZBTB7A
ZBTB7B
ZBTB7C
ZBTB8OS
ZC2HC1A
ZC2HC1B
ZC2HC1C
ZC3H11A
ZC3H12A
ZC3H12C
ZC3H12D
ZC3H13
ZC3H14
ZC3H15
Processed:  %96.0
ZC3H18
ZC3H3
ZC3H6
ZC3H7A
ZC3H7B
ZC3H8
ZC3HAV1
ZC3HAV1L
ZC3HC1
ZCCHC11
ZCCHC2
ZCCHC24
ZCCHC3
ZCCHC4
ZCCHC6
ZCCHC7
ZCCHC8
ZCCHC9
ZCRB1
ZCWPW1
ZDBF2
ZDHHC11
ZDHHC11B
ZDHHC12
ZDHHC13
ZDHHC14
ZDHHC16
ZDHHC17
ZDHHC18
ZDHHC19
ZDHHC2
ZDHHC20
ZDHHC22
ZDHHC23
ZDHHC24
ZDHHC4
ZDHHC5
ZDHHC6
ZDHHC7
ZDHHC8
ZEB2
ZER1
ZFAND1
ZFAND2A
ZFAND2B
ZFAND4
ZFAND5
ZFAND6
ZFC3H1
ZFHX2
ZFHX3
ZFHX4
ZFP1
ZFP14
ZFP2
ZFP28
ZFP30
ZFP36
ZFP36L1
ZFP36L2
ZFP37
ZFP41
ZFP42
ZFP62
ZFP64
ZFP69
ZFP69B
ZFP82
ZFP90
ZFP91
ZFPL1
ZFPM2
ZFR
ZFR2
ZFYVE1
ZFYVE16
ZFYVE19
ZFYVE20
ZFYVE21
ZFYVE26
ZFYVE27
ZFYVE28
ZG16B
ZGLP1
ZGPAT

In [19]:
gene_ensg.index("KRTAP16-1")


7164

In [20]:
len(gene_ensg)

15605

In [21]:
gene = "KRTAP16-1"

In [29]:
gene = "KRTAP16-1"
t = gene_ensg.index("KRTAP16-1")
filename = cov_dir + "/"+ gene + ".snplist"
snp_rsid = pd.read_table(filename, header = None)
snp_rsid = list(snp_rsid.loc[:,0])

#matrix of weights
M = len(snp_rsid) #number of snps
logging.info("Number of SNPs: " + str(M))
weights = np.zeros(shape = (M, N))
for i in range(N):
    #logging.info("Database: " + str(i+1))
    dbname = fi[i]
    conn = create_connection(dbname)
    cur = conn.cursor()  
    sql_q = 'select * from weights where gene = "' + gene + '"'
    tmp_query = cur.execute(sql_q).fetchall()
    rsid_in_db = list(map(lambda x: str(x[0]), tmp_query))
    #rsid_in_db = map(lambda x: str(x[0]), tmp_query)
    index = match_list(rsid_in_db, snp_rsid)
    indi = index[index > -1]
    # extract the weight
    sql_q = 'select * from weights where gene = "' + gene + '"'
    tmp_query = cur.execute(sql_q).fetchall()
    tmp_weights = np.array(list(map(lambda x: str(x[2]), tmp_query)))
    #tmp_weights = np.array(map(lambda x: str(x[2]), tmp_query))
    if sum(index > -1) > 0:
        weights[indi,i] = tmp_weights[index > -1]

# covariance matrix of snps
cov_file = cov_dir + "/" + gene + ".cov"
cov_matrix = np.loadtxt(cov_file)

# covariance matrix of gene in different tissue
cov_gene = np.mat(weights.T) * np.mat(cov_matrix) * np.mat(weights)
cov_gene = np.array(cov_gene)

# normalization
for i in range(N):
    if cov_gene[i,i] != 0:
        cov_gene[i,:] = cov_gene[i,:] / np.sqrt(cov_gene[i,i])
        cov_gene[:,i] = cov_gene[:,i] / cov_gene[i,i]

#z-score of gene in different tissue
zscore_gene = np.full([N, 1], np.nan)   
for i in range(N):
    nam = "zscore_" + str(i+1)
    index = zscore_dict[nam]["gene"] == gene
    if sum(index) > 0:
        zscore_gene[i] = zscore_dict[nam]["zscore"][index].values[0]
        #p-value
        outcome.loc[k, (i+4)] = float(zscore_dict[nam]["pvalue"][index].values[0])

#only keep tissues with prediction model for gene
index = np.isnan(zscore_gene) == False
indext = index.T[0]
if sum(index) > 0:
    zscore_gene = zscore_gene[index]
    cov_gene = cov_gene[indext,:][:,indext]
if not np.allclose(cov_gene,cov_gene.T):
    print(gene)

ValueError: shapes (44,357) and (34,34) not aligned: 357 (dim 1) != 34 (dim 0)