# Предварительный этап.

### Шаг 1. Подготовить файл с набором SNPs (FP_SNPs_10k_GB38_twoAllelsFormat.tsv).

1. Скачиваем и извлекаем из архива файл **FP_SNPs.txt** по схеме, представленной в пояснительной части тестового задания.
2. Создаём шаблон финального файла **FP_SNPs_10k_GB38_twoAllelsFormat.tsv** и добавляем в него правильные заголовки:

In [None]:
# Create (or truncate) the final table and write its TAB-separated header row.
# printf is used instead of `echo -e`: it interprets \t the same way in every
# shell, whereas `echo -e` prints a literal "-e" under POSIX sh implementations.
printf '#CHROM\tPOS\tID\tallele1\tallele2\n' > FP_SNPs_10k_GB38_twoAllelsFormat.tsv

3. Преобразуем оригинальный файл FP_SNPs.txt и записываем скорректированные данные в финальный файл:

In [None]:
# Append the data rows of FP_SNPs.txt to the final table in a single awk pass:
#   - NR > 1 skips the first line (the original column names);
#   - $2 != "23" skips the chromosome-23 rows. The earlier version dropped them
#     with `head -n -1000`, which is a GNU-only option and silently depends on
#     those rows being exactly the last 1000 lines of the input;
#     NOTE(review): assumes chromosome 23 is encoded as "23" in column 2 — confirm;
#   - columns are reordered to chromosome ($2), position ($4), ID ($1),
#     allele1 ($5), allele2 ($6), with the "chr" and "rs" prefixes added;
#   - OFS makes the output TAB-separated, so no extra tr/sed stages are needed.
awk 'BEGIN { OFS = "\t" } NR > 1 && $2 != "23" { print "chr" $2, $4, "rs" $1, $5, $6 }' FP_SNPs.txt \
  >> FP_SNPs_10k_GB38_twoAllelsFormat.tsv


### Шаг  2. Подготовить отдельные референсы для каждой хромосомы.

In [None]:
# pseudocode
for i in (1..22, X, Y, M); do
	go to GRCh38.d1.vd1.fa
	select lines related to header (chr$i) and up to the next chromosome
	write lines into a new file 'chr$i.fai'
	do indexing of 'chr$i.fai'
done

# Основной этап.

## Запуск скрипта:

In [None]:
findAlternative.py -i input_tsv_table -o output_filename 

## Содержимое скрипта:

### Проверка аргументов командной строки:

In [None]:
print "[$current_time_and_date]: Run the script." -> add to log file

In [None]:
print "[$current_time_and_date]: Check arguments." -> add to log file
1. input and output arguments are not empty.
    if $input_tsv_table OR $output_filename is empty:
        print "Error: missing argument for Input or Output. Please, add tsv table as an input argument and write output file name"
        add error message in log file
        return help description
        break
    else
        add to log file: "Test 1/4: OK."
        continue

2. $input_tsv_table has the right extension (tsv)
    if $input_tsv_table has another extension:
        print "Error: wrong extension for Input table. Please, change it to tsv or use another file"
        add error message in log file
        return help description
        break
    else
        add to log file: "Test 2/4: OK."
        continue

3. $input_tsv_table file structure is correct (it has correct header)
    if header is not correct:
        print "Error: $input_tsv_table seems wrong. Please, change it."
        add error message in log file
        return help description
        break
    else
        add to log file: "Test 3/4: OK."
        continue

4. Check that reference genomes (and indexes) are available (by default all references are in the path: /ref/GRCh38.d1.vd1_mainChr/sepChrs/)
    for i in (1..22); do
    	str ChromosomeNumber = 'chr' + $i # make a string like 'chr1'
        check $ChromosomeNumber.fai is available in the path
            if not: 
                print: "Error: couldn't find reference genome or index files for $ChromosomeNumber. Please, check references and run the script again"
                add error message in log file
                return help description
                break
            if yes: 
                continue
    done
    add to log file: "Test 4/4: OK."

print "[$current_time_and_date]: Arguments and references are correct! Continue analysis." -> add to log file

### Выполнение основной части скрипта:

In [None]:
Create an OutputTable: $output_filename + '.tsv' extension
for i in (1..22); do
    print "[$current_time_and_date]: Work with the chromosome $i." -> add to log file
	str ChromosomeNumber = 'chr' + $i # make a string like 'chr1'
	FP_SNPs_$ChromosomeNumber.tsv # create a temporary table 'FP_SNPs_$ChromosomeNumber.tsv' for one chromosome and copy rows related to $ChromosomeNumber from $input_tsv_table to 'FP_SNPs_$ChromosomeNumber.tsv'
	int RowsNumber = count number of rows in FP_SNPs_$ChromosomeNumber.tsv
	int ErrorRowsNumber = 0 # will be used to count SNPs where we couldn't define reference allele
    str RefGenome = .../$ChromosomeNumber.fai # go to /ref/GRCh38.d1.vd1_mainChr/sepChrs/ (by default) and select one reference related to $ChromosomeNumber
    
	for line in (1..$RowsNumber); do
		int POS = ... # select POS on $line from FP_SNPs_$ChromosomeNumber.tsv
		str Allele1 = ... # select allele1 on $line from FP_SNPs_$ChromosomeNumber.tsv
		str Allele2 = ... # select allele2 on $line from FP_SNPs_$ChromosomeNumber.tsv
		str ID = ... # select ID on $line from FP_SNPs_$ChromosomeNumber.tsv

        str RefNucl = ... # select $RefGenome nucleotide on position $POS
		
		# Для каждой позиции $POS из FP_SNPs_$ChromosomeNumber.tsv
        # последовательно сравниваем нуклеотид на этой позиции в референсе $RefNucl с $Allele1 и $Allele2
		
		if $RefNucl == $Allele1 
    		str REF = $Allele1 
    		str ALT = $Allele2
    		Записываем в $OutputTable строку, содержащую $ChromosomeNumber, $POS, $ID, $REF, $ALT
		elif $RefNucl == $Allele2
    		str REF = $Allele2
    		str ALT = $Allele1
    		Записываем в $OutputTable строку, содержащую $ChromosomeNumber, $POS, $ID, $REF, $ALT
		else # $RefNucl doesn't match any of alleles from the table
    		Add to log: "ERROR: reference nucleotide on the $ChromosomeNumber position $POS differs from the input table ($input_tsv_table) \
                            Nucleotide on the reference: $RefNucl; in the table: allele1 - $Allele1, allele2 -  $Allele2"
            $ErrorRowsNumber += 1
	done

	print "[$current_time_and_date]: $ChromosomeNumber: ($RowsNumber - $ErrorRowsNumber) out of $RowsNumber SNP positions were recognised. \
                Some problems occurred for $ErrorRowsNumber SNP positions." -> add to log file
	rm FP_SNPs_$ChromosomeNumber.tsv

done

### Сообщение, выдаваемое скриптом для описания себя и своих аргументов по запросу (-h, --help)

In [None]:
findAlternative.py -i input_table.tsv -o output_filename [-r path/to/refs]

##### Options:

In [None]:
-h, --help          show this help message and exit
-i INPUT, --input INPUT
                    input table with FP SNPs. Should be in tsv format.
-o OUTPUT, --output OUTPUT
                    name for output file. No extension needed!
-r path/to/refs, --ref path/to/refs
                    path to folder with reference genomes (default: /ref/GRCh38.d1.vd1_mainChr/sepChrs/)

By default, the script uses individual per-chromosome references that were prepared in advance. These references and their indexes are located in the folder:
/ref/GRCh38.d1.vd1_mainChr/sepChrs/