## [ Indonesian Rice SNPs - Information Content Pre-analysis ]
`Prepared by` <b>Nicholas Dominic, S.Kom., M.T.I.</b>

### Define libraries

In [1]:
import re
from os import listdir
from os.path import basename

from pandas import read_csv, DataFrame

### Load raw SNPs data

In [2]:
# Load the SNP table; the first CSV column is used as the row index.
raw_snps = read_csv("data/ind-rg.csv", index_col=0)
raw_snps  # bare last expression -> rendered as the cell output

Unnamed: 0,chr,id,posg,pos,ref,alt
1,1,id1000223,0,422620,T,G
2,1,id1001073,0,1174585,G,A
3,1,id1001973,0,2568541,T,C
4,1,id1002158,0,2725468,T,C
5,1,id1002308,0,2899077,C,T
...,...,...,...,...,...,...
1393,12,id12009153,0,25558936,A,G
1394,12,id12009407,0,26057332,C,T
1395,12,id12009654,0,26360367,A,G
1396,12,id12009820,0,26873949,T,C


### Get SVIF result path and files

In [3]:
# Directory holding the SVIF result CSVs. Keep the trailing "/": file names
# are appended to this string by plain concatenation when the files are listed.
RESULT_PATH = "results/svif/"

In [4]:
# os.listdir() returns entries in arbitrary, platform-dependent order. Sort the
# names so that the file_num -> file mapping displayed below (and used by the
# later cells to pick a file by index) is identical on every machine and run.
# NOTE: this is a plain lexicographic sort, so "chr-10_" .. "chr-12_" come
# before "chr-1_" ('0' sorts before '_').
result = [RESULT_PATH+x for x in sorted(listdir(RESULT_PATH)) if x.endswith(".csv")]
dict(enumerate(result))

{0: 'results/svif/chr-10_(thresh-2.5).csv',
 1: 'results/svif/chr-11_(thresh-2.5).csv',
 2: 'results/svif/chr-12_(thresh-2.5).csv',
 3: 'results/svif/chr-1_(thresh-2.5).csv',
 4: 'results/svif/chr-2_(thresh-2.5).csv',
 5: 'results/svif/chr-3_(thresh-2.5).csv',
 6: 'results/svif/chr-4_(thresh-2.5).csv',
 7: 'results/svif/chr-5_(thresh-2.5).csv',
 8: 'results/svif/chr-6_(thresh-2.5).csv',
 9: 'results/svif/chr-7_(thresh-2.5).csv',
 10: 'results/svif/chr-8_(thresh-2.5).csv',
 11: 'results/svif/chr-9_(thresh-2.5).csv'}

### Declare necessary function

In [5]:
def analyze_pos(select_file_num : int = 0, *args, files=None, snps=None, **kwargs) -> DataFrame:
    """Attach genomic positions to the SNPs listed in one SVIF result file.

    The chromosome number is parsed from the selected file's name
    (``chr-<N>...``). The SNP table is filtered to that chromosome, an
    ``id_ref`` key of the form ``"<id>_<ref>"`` is built for every remaining
    row, and the SVIF result is joined to it on ``SNP == id_ref``.

    Parameters
    ----------
    select_file_num : int, default 0
        Index of the SVIF result file within ``files``.
    *args, **kwargs
        Accepted for backward compatibility and ignored.
    files : list of str, optional
        Paths of the SVIF result CSVs. Defaults to the notebook-level
        ``result`` list.
    snps : DataFrame, optional
        SNP table providing the ``chr``, ``id``, ``ref`` and ``pos`` columns.
        Defaults to the notebook-level ``raw_snps`` frame.

    Returns
    -------
    DataFrame
        Columns ``SNP``, ``agg_count`` and ``pos`` for the SVIF rows whose
        ``SNP`` value matches an ``id_ref`` key of the selected chromosome.

    Raises
    ------
    ValueError
        If the selected file name carries no ``chr-<N>`` tag.
    """
    # Fall back to the notebook globals only when no explicit input is given.
    files = result if files is None else files
    snps = raw_snps if snps is None else snps

    svif_path = files[select_file_num]
    # Parse the tag from the file name only, so a "chr-" appearing elsewhere
    # in the directory path cannot be mistaken for the chromosome number.
    chr_tag = re.search(r"chr-(\d+)", basename(svif_path))
    if chr_tag is None:
        raise ValueError("Cannot find a 'chr-<N>' tag in file name: {}".format(svif_path))
    chr_num = int(chr_tag.group(1))
    print("Analyzing SVIF result from chromosome {} ...".format(chr_num))

    # Build the join key with .assign(), which returns a new frame instead of
    # modifying the filtered selection in place.
    selected_chr = snps[snps["chr"] == chr_num]
    selected_chr = selected_chr.assign(
        id_ref=selected_chr["id"].astype(str) + "_" + selected_chr["ref"].astype(str)
    )

    svif_result = read_csv(svif_path)
    return svif_result.merge(selected_chr, left_on="SNP", right_on="id_ref")[["SNP", "agg_count", "pos"]]

### Get SNP position

In [6]:
# Position in the `result` list built above (not the chromosome number).
select_file_num = 3 # change here, must be INT
# Show only the first 10 matched SNPs.
analyze_pos(select_file_num).head(10)

Analyzing SVIF result from chromosome 1 ...


Unnamed: 0,SNP,agg_count,pos
0,TBGI040340_T,5,24597159
1,TBGI052052_G,5,31851695
2,TBGI049455_G,5,30423222
3,TBGI048143_G,5,29839335
4,TBGI048670_T,5,30094526
5,id1015931_A,5,29313930
6,TBGI044811_A,5,28000988
7,TBGI044294_A,5,27505528
8,id1012666_A,5,23995074
9,TBGI046808_C,5,28971365


In [7]:
# Position in the `result` list built above (not the chromosome number).
select_file_num = 0 # change here, must be INT
analyze_pos(select_file_num)

Analyzing SVIF result from chromosome 10 ...


Unnamed: 0,SNP,agg_count,pos
0,id10002180_C,5,6543844
1,id10002406_A,5,7254303
2,TBGI406425_A,5,14570693
3,TBGI406289_G,5,14507149
4,id10002660_G,5,9918841
5,id10002329_A,5,7002473
6,id10003462_G,5,13539364
7,TBGI405706_A,5,13960740
8,TBGI405262_C,5,13379366
9,id10000350_G,4,1657153


### Copyright 2022 Authors
* Do <b>NOT</b> use or redistribute this file <b>EXCEPT</b> with official permission from the authors.
* Please cite our paper whenever you reference this research.
* Any questions can be addressed to `nicholas.dominic@binus.ac.id` or `bdsrc@binus.edu`.

Bioinformatics and Data Science Research Center [(visit page)](https://research.binus.ac.id/bdsrc/)
<br>NVIDIA - BINUS Artificial Intelligence Research and Development Center [(visit page)](https://research.binus.ac.id/airdc/)
<br><b>BINUS Higher Education</b>, part of BINUS Group [(visit page)](https://www.binus.edu/) [(visit QS page)](https://www.topuniversities.com/universities/bina-nusantara-university-binus)