# Genome anchoring using synteny and linkage map

* ms figure 1. Distribution of SNP markers from the phased parental linkage map (Montanari et al., 2022) on the four V06.A002-186 contig sets of chromosome 2.
  
## 1. mapping haplotype contigs to reference genome

In [2]:
WDR=002.phasingFromGeneticMap

In [1]:
REF=VcaeV1.3.p0.fa

In [29]:
ml samtools/1.16
samtools faidx $REF

In [5]:
HAPS=/workspace/hraijc/Blueberry/Blueberry_trio/Assembly/postcontamremoval/

In [19]:
CONTIG_SETS="classified_M7_plus_unclassified_hap1 classified_M7_plus_unclassified_hap2 classified_Nui_plus_unclassified_hap1 classified_Nui_plus_unclassified_hap2"

for i in $CONTIG_SETS; do

sbatch << EOF
#!/bin/bash
#SBATCH -J mapping
#SBATCH -o ${WDR}/mapping_per_hap.out
#SBATCH -e ${WDR}/mapping_per_hap.err
#SBATCH --cpus-per-task=16
#SBATCH --mem=20G
#SBATCH --time=10:00:00

module load minimap2/2.22
ml samtools/1.16

minimap2 -t 16 -ax asm10 $REF $HAPS/$i.clean.fa > $WDR/$i.clean.sam
samtools view -@ 16 -b $WDR/$i.clean.sam > $WDR/$i.clean.bam
samtools sort -@ 16 $WDR/$i.clean.bam > $WDR/$i.clean.sorted.bam
samtools index -@ 16 $WDR/$i.clean.sorted.bam

EOF

done

Submitted batch job 3709745
Submitted batch job 3709746
Submitted batch job 3709747
Submitted batch job 3709748


In [20]:
ml bcftools/1.16


About:   SNP/indel variant calling from VCF/BCF. To be used in conjunction with bcftools mpileup.
         This command replaces the former "bcftools view" caller. Some of the original
         functionality has been temporarily lost in the process of transition to htslib,
         but will be added back on popular demand. The original calling model can be
         invoked with the -c option.
Usage:   bcftools call [options] <in.vcf.gz>

File format options:
       --no-version                Do not append version and command line to the header
   -o, --output FILE               Write output to a file [standard output]
   -O, --output-type b|u|z|v       Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]
   -O, --output-type u|b|v|z[0-9]  u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]
       --ploidy ASSEMBLY[?]        Predefined ploidy, 'list' to print available settings, append '?' for details [2]
       --p

: 255

In [16]:
## variant calling from samtools

CONTIG_SETS="classified_M7_plus_unclassified_hap1 classified_M7_plus_unclassified_hap2 classified_Nui_plus_unclassified_hap1 classified_Nui_plus_unclassified_hap2"

for i in $CONTIG_SETS; do

sbatch << EOF
#!/bin/bash
#SBATCH -J calling
#SBATCH -o ${WDR}/calling_per_hap.allcalls.out
#SBATCH -e ${WDR}/calling_per_hap.allcalls.err
#SBATCH --cpus-per-task=1
#SBATCH --mem=20G
#SBATCH --time=10:00:00

ml bcftools/1.16

bcftools mpileup -Ou -f $WDR/VcaeV1.3.p0.fa $WDR/$i.clean.sorted.bam | bcftools call --ploidy 1 -V -c -Ov -o $WDR/$i.clean.allcalls.sorted.vcf

EOF

done

Submitted batch job 3772207
Submitted batch job 3772208
Submitted batch job 3772209
Submitted batch job 3772210


In [34]:
CONTIG_SETS="classified_M7_plus_unclassified_hap1 classified_M7_plus_unclassified_hap2 classified_Nui_plus_unclassified_hap1 classified_Nui_plus_unclassified_hap2"

for i in $CONTIG_SETS; do

sbatch << EOF
#!/bin/bash
#SBATCH -J calling
#SBATCH -o ${WDR}/calling_bgzip.out
#SBATCH -e ${WDR}/calling_bgzip.err
#SBATCH --cpus-per-task=1
#SBATCH --mem=20G
#SBATCH --time=10:00:00

ml bcftools/1.16

bcftools view $WDR/$i.clean.sorted.vcf -Oz -o $WDR/$i.clean.sorted.vcf.gz
bcftools index $WDR/$i.clean.sorted.vcf.gz

EOF

done

Submitted batch job 3710041
Submitted batch job 3710042
Submitted batch job 3710043
Submitted batch job 3710044


In [35]:
sbatch << EOF
#!/bin/bash
#SBATCH -J calling
#SBATCH -o ${WDR}/calling_merge.out
#SBATCH -e ${WDR}/calling_merge.err
#SBATCH --cpus-per-task=1
#SBATCH --mem=20G
#SBATCH --time=10:00:00

ml bcftools/1.16

bcftools merge -Ov $WDR/classified_M7_plus_unclassified_hap1.clean.sorted.vcf.gz $WDR/classified_M7_plus_unclassified_hap2.clean.sorted.vcf.gz $WDR/classified_Nui_plus_unclassified_hap1.clean.sorted.vcf.gz $WDR/classified_Nui_plus_unclassified_hap2.clean.sorted.vcf.gz > $WDR/merged.vcf


EOF

Submitted batch job 3710045


## 2. extract variants called from genetic map positions from merged.vcf

In [5]:
# genetic map positions obtained from previous study: "Montanari S, Thomson S, Cordiner S, Günther CS, Miller P, Deng CH, McGhie T, Knäbel M, Foster T, Turner J, Chagné D, Espley R. High-density linkage map construction in an autotetraploid blueberry population and detection of quantitative trait loci for anthocyanin content. Front Plant Sci. 2022 Sep 23;13:965397. doi: 10.3389/fpls.2022.965397. PMID: 36247546; PMCID: PMC9555082."

head $WDR/geneticMapPositions.chr2.txt

seq-0-002_1002949
seq-0-002_10212141
seq-0-002_10212213
seq-0-002_10212217
seq-0-002_10248216
seq-0-002_10260806
seq-0-002_10264525
seq-0-002_10328926
seq-0-002_10348643
seq-0-002_10348661


In [3]:
ml pfr-python3/3.9.13

In [14]:
python << EOF

with open('geneticMapPositions.chr2.txt','r') as f:
	pos_list = [line.strip() for line in f]

with open('merged.vcf', 'r') as f_vcf:
	header_lines = sum(1 for line in f_vcf if line.startswith('#'))
	f_vcf.seek(0)  # Return to the beginning of the file
	vcf_lines = [line.strip().split('\t') for line in f_vcf if '#' not in line]

poscalls_list = [f'{line[0]}_{line[1]}' for line in vcf_lines]
geno_lists = [[line[i] for line in vcf_lines] for i in range(9, 13)]

with open('progeny_phasingTable.chr2.txt', 'w') as f_table:
    for pos in pos_list:
        if pos in poscalls_list:
            indexcalls = poscalls_list.index(pos)
            geno_values = '\t'.join(geno_lists[i][indexcalls] for i in range(4))
            f_table.write(f'{pos}\t{geno_values}\n')
        else:
            f_table.write(f'{pos}\tNaN\tNaN\tNaN\tNaN\n')


EOF

with open('geneticMapPositions.chr1.txt','r') as f:
	pos_list = [line.strip() for line in f]

with open('merged.vcf', 'r') as f_vcf:
	header_lines = sum(1 for line in f_vcf if line.startswith('#'))
	f_vcf.seek(0)  # Return to the beginning of the file
	vcf_lines = [line.strip().split('\t') for line in f_vcf if '#' not in line]

poscalls_list = [f'{line[0]}_{line[1]}' for line in vcf_lines]
geno_lists = [[line[i] for line in vcf_lines] for i in range(9, 13)]

with open('progeny_phasingTable.txt', 'w') as f_table:
    for pos in pos_list:
        if pos in poscalls_list:
            indexcalls = poscalls_list.index(pos)
            geno_values = '\t'.join(geno_lists[i][indexcalls] for i in range(4))
            f_table.write(f'{pos}\t{geno_values}\n')
        else:
            f_table.write(f'{pos}\tNaN\tNaN\tNaN\tNaN\n')


In [5]:
# submit script to cluster as vcf is large, it takes long time to run

sbatch << EOF
#!/bin/bash
#SBATCH -J tableGenerate
#SBATCH -o ${WDR}/table_generate.chr2.out
#SBATCH -e ${WDR}/table_generate.chr2.err
#SBATCH --cpus-per-task=1
#SBATCH --mem=2G
#SBATCH --time=1:00:00

ml pfr-python3

cd $WDR

python phasingTableFromProgeny.py

EOF

Submitted batch job 4256342


In [6]:
# four columns are M7_hA, M7_hB, Nui_hA and Nui_hB

head $WDR/progeny_phasingTable.chr2.txt

seq-0-002_1002949	.:.	1:60,0	.:.	1:60,0
seq-0-002_10212141	NaN	NaN	NaN	NaN
seq-0-002_10212213	1:60,0	1:60,0	.:.	.:.
seq-0-002_10212217	.:.	.:.	1:60,0	1:60,0
seq-0-002_10248216	NaN	NaN	NaN	NaN
seq-0-002_10260806	NaN	NaN	NaN	NaN
seq-0-002_10264525	NaN	NaN	NaN	NaN
seq-0-002_10328926	1:60,0	.:.	.:.	.:.
seq-0-002_10348643	NaN	NaN	NaN	NaN
seq-0-002_10348661	1:60,0	.:.	.:.	.:.


* '.:.' could indicate 'missing data' or 'homozygous to reference allele'

## 3. extracting contig id with variants called at genetic map positions from 'bam' files

In [10]:
ml samtools/1.12
ml

Currently Loaded Modulefiles:
 1) [46mpowerPlant/core[0m    5) [46mSlurm/21.08.8-2[0m      9) pfr-python3/3.9.13  
 2) texlive/20230914   6) perlbrew/0.76       10) samtools/1.12       
 3) pandoc/1.19.2      7) perl/5.36.0         
 4) git/2.21.0         8) slurm-utils/latest  

Key:
[46msticky[0m  


In [11]:
cd $WDR

In [None]:
python << EOF

import subprocess


species = ['M7', 'Nui']
haps = ['hap1', 'hap2']

for spec in species:
	for hap in haps:
	# Assuming geneticMapPositions.txt has the chromosome and position information
		with open('geneticMapPositions.chr2.txt', 'r') as positions_file:
			for line in positions_file:
				chrom, pos = line.strip().split('_')
				command = f"samtools view classified_{spec}_plus_unclassified_{hap}.clean.sorted.bam {chrom}:{pos}-{pos} | cut -f1 | head -n 1"
				result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, text=True)
				
				# Append chrom, pos, and read IDs to 'read_ids.M7h2.txt'
				with open('mappingPositions.' + spec + '.' + hap + '.chr2.txt', 'a') as output_file:
					if result.stdout != '':
						output_file.write(f"{chrom}\t{pos}\t{result.stdout}")
					else:
						output_file.write(f"{chrom}\t{pos}\tNaN\n")


EOF

In [7]:
head $WDR/mappingPositions.M7.hap1.chr2.txt

seq-0-002	1002949	h1tg000119l_1
seq-0-002	10212141	h1tg000296l_1
seq-0-002	10212213	h1tg000296l_1
seq-0-002	10212217	h1tg000296l_1
seq-0-002	10248216	h1tg000296l_1
seq-0-002	10260806	h1tg000296l_1
seq-0-002	10264525	NaN
seq-0-002	10328926	h1tg000296l_1
seq-0-002	10348643	h1tg000296l_1
seq-0-002	10348661	h1tg000296l_1


* 'NaN' means 'missing data', using this information to distinguish missing data and alleles homozygous to reference

* sorting out to above outputs to generate a table containing allele information

In [12]:
# collumn infor: M7_h1,M7_h2,M7_h3,M7_h4,Nui_h1,Nui_h2,Nui_h3,Nui_h4,M7_hA,M7_hB,Nui_hA,Nui_hB

head $WDR/test/phasingFile.chr2.txt

seq-0-002_229781	0	0	0	0	0	0	1	0	0	0	0	1	NaN	h2tg000447l_1	h1tg000203l_1	h2tg000544l_1
seq-0-002_229801	0	0	0	0	0	0	1	0	0	0	0	1	NaN	h2tg000447l_1	h1tg000203l_1	h2tg000544l_1
seq-0-002_229820	0	0	0	0	0	0	1	0	0	0	0	1	NaN	h2tg000447l_1	h1tg000203l_1	h2tg000544l_1
seq-0-002_229830	0	0	0	0	0	0	1	0	0	0	0	1	NaN	h2tg000447l_1	h1tg000203l_1	h2tg000544l_1
seq-0-002_247149	0	0	0	0	0	0	1	0	0	0	0	1	h1tg000119l_1	h2tg000447l_1	h1tg000203l_1	h2tg000544l_1
seq-0-002_247160	0	1	0	0	0	0	0	0	0	1	0	0	h1tg000119l_1	h2tg000447l_1	h1tg000203l_1	h2tg000544l_1
seq-0-002_252157	0	1	0	0	0	0	0	0	0	1	0	0	h1tg000119l_1	h2tg000447l_1	h1tg000203l_1	h2tg000544l_1
seq-0-002_268324	0	1	0	0	0	0	1	0	0	1	0	1	h1tg000119l_1	h2tg000447l_1	h1tg000203l_1	h2tg000544l_1
seq-0-002_297604	0	0	0	0	0	0	1	0	0	0	0	1	h1tg000119l_1	h2tg000447l_1	h1tg000203l_1	h2tg000544l_1
seq-0-002_299184	0	0	0	0	1	1	0	0	1	1	0	1	h1tg000119l_1	h2tg000447l_1	h1tg000203l_1	h2tg000544l_1


## 4. generate simplex alleles that inherit to progeny

In [None]:
python << EOF

import numpy as np
from collections import Counter

file_path = 'phasingFile.chr2.txt'  # Replace with the actual file path

# Specify the column index (0-based) or name
column_index = 1  # Replace with the actual column index or name

# Read the specified column into a list
try:
    with open(file_path, 'r') as file:
        lines = [line.strip().split('\t') for line in file]
        marker_id = [line[0] if line[0] != '' else np.nan for line in lines]
        M7h1_contigs_values = [line[13] if line[13] != '' else np.nan for line in lines]
        M7h2_contigs_values = [line[14] if line[14] != '' else np.nan for line in lines]
        M7_h1_values = [line[1] if line[1] != '' else np.nan for line in lines]
        M7_h2_values = [line[2] if line[2] != '' else np.nan for line in lines]
        M7_h3_values = [line[3] if line[3] != '' else np.nan for line in lines]
        M7_h4_values = [line[4] if line[4] != '' else np.nan for line in lines]
        M7h1_values = [line[9] if line[9] != '' else np.nan for line in lines]
        M7h2_values = [line[10] if line[10] != '' else np.nan for line in lines]
    M7h1_contigs_values_uniq = list(set(M7h1_contigs_values))
    M7h2_contigs_values_uniq = list(set(M7h2_contigs_values))
except FileNotFoundError:
    print(f"File not found: {file_path}")
except IndexError:
    print(f"Column index out of range: {column_index}")


for unique_value in M7h1_contigs_values_uniq:
    matching_indices = [index for index, value in enumerate(M7h1_contigs_values) if value == unique_value]
    print(f"Indices of values equal to {unique_value}: {matching_indices}")
    corresponding_values_M7_h1 = [M7_h1_values[index] for index in matching_indices]
    print(f"Corresponding M7_h1 Values: {corresponding_values_M7_h1}\n")
    corresponding_values_M7_h2 = [M7_h2_values[index] for index in matching_indices]
    print(f"Corresponding M7_h2 Values: {corresponding_values_M7_h2}\n")
    corresponding_values_M7_h3 = [M7_h3_values[index] for index in matching_indices]
    print(f"Corresponding M7_h3 Values: {corresponding_values_M7_h3}\n")
    corresponding_values_M7_h4 = [M7_h4_values[index] for index in matching_indices]
    print(f"Corresponding M7_h4 Values: {corresponding_values_M7_h4}\n")
    corresponding_values_M7h1 = [M7h1_values[index] for index in matching_indices]
    print(f"Corresponding M7h1 Values: {corresponding_values_M7h1}\n")
    corresponding_values_M7h2 = [M7h2_values[index] for index in matching_indices]
    print(f"Corresponding M7h2 Values: {corresponding_values_M7h2}\n")
    
    total_values_num = len(corresponding_values_M7h1)
#    print(total_values_num)

    print(len(corresponding_values_M7h1))

    frequency_counts = Counter(corresponding_values_M7h1)
    for value, count in frequency_counts.items():
        print(f"M7h1 value: {value}, Frequency: {count}")

     # Check if M7h1 value is '1' before printing corresponding values
    for index in matching_indices:
        output_file_path = 'test_M7h1.chr2.txt'
        if M7h1_values[index] == '1' and M7h2_values[index] != '1':
            print(f"Index: {index}")
            corresponding_values_M7_h1 = M7_h1_values[index]
            print(f"Corresponding M7_h1 Value: {corresponding_values_M7_h1}")
            corresponding_values_M7_h2 = M7_h2_values[index]
            print(f"Corresponding M7_h2 Value: {corresponding_values_M7_h2}")
            corresponding_values_M7_h3 = M7_h3_values[index]
            print(f"Corresponding M7_h3 Value: {corresponding_values_M7_h3}")
            corresponding_values_M7_h4 = M7_h4_values[index]
            print(f"Corresponding M7_h4 Value: {corresponding_values_M7_h4}")
            corresponding_values_M7h1 = M7h1_values[index]
            print(f"Corresponding M7h1 Value: {corresponding_values_M7h1}")
            corresponding_values_M7h2 = M7h2_values[index]
            print(f"Corresponding M7h2 Value: {corresponding_values_M7h2}\n")
            
            corresponding_values_dict = {
                "corresponding_values_M7_h1": corresponding_values_M7_h1,
                "corresponding_values_M7_h2": corresponding_values_M7_h2,
                "corresponding_values_M7_h3": corresponding_values_M7_h3,
                "corresponding_values_M7_h4": corresponding_values_M7_h4,
            }

            print(f"corresponding values dict: {corresponding_values_dict}\n")

            largest_value = max(corresponding_values_dict.values())
            print(f"largest value: {largest_value}\n")
        # Find all columns with the largest value
            largest_columns = [column for column, variant in corresponding_values_dict.items() if variant == largest_value]
            print(f"largest columns: {largest_columns}\n")

            if largest_value == '1' and len(largest_columns) < 2:
                with open(output_file_path, 'a') as output_file:
                    output_file.write(f"{unique_value}\t{marker_id[index]}\t{largest_columns[0].split('_')[-1].split(']')[0]}\n")
            
            else:
                with open(output_file_path, 'a') as output_file:
                    output_file.write(f"{unique_value}\t{marker_id[index]}\tNaN\n")

        else:
            with open(output_file_path, 'a') as output_file:
                output_file.write(f"{unique_value}\t{marker_id[index]}\tNaN\n")



for unique_value in M7h2_contigs_values_uniq:
    matching_indices = [index for index, value in enumerate(M7h2_contigs_values) if value == unique_value]
    print(f"Indices of values equal to {unique_value}: {matching_indices}")
    corresponding_values_M7_h1 = [M7_h1_values[index] for index in matching_indices]
    print(f"Corresponding M7_h1 Values: {corresponding_values_M7_h1}\n")
    corresponding_values_M7_h2 = [M7_h2_values[index] for index in matching_indices]
    print(f"Corresponding M7_h2 Values: {corresponding_values_M7_h2}\n")
    corresponding_values_M7_h3 = [M7_h3_values[index] for index in matching_indices]
    print(f"Corresponding M7_h3 Values: {corresponding_values_M7_h3}\n")
    corresponding_values_M7_h4 = [M7_h4_values[index] for index in matching_indices]
    print(f"Corresponding M7_h4 Values: {corresponding_values_M7_h4}\n")
    corresponding_values_M7h1 = [M7h1_values[index] for index in matching_indices]
    print(f"Corresponding M7h1 Values: {corresponding_values_M7h1}\n")
    corresponding_values_M7h2 = [M7h2_values[index] for index in matching_indices]
    print(f"Corresponding M7h2 Values: {corresponding_values_M7h2}\n")
    
    total_values_num = len(corresponding_values_M7h2)

    print(len(corresponding_values_M7h2))

    frequency_counts = Counter(corresponding_values_M7h2)
    for value, count in frequency_counts.items():
        print(f"M7h2 value: {value}, Frequency: {count}")

    for index in matching_indices:
        output_file_path = 'test_M7h2.chr2.txt'
        if M7h2_values[index] == '1' and M7h1_values[index] != '1':
            print(f"Index: {index}")
            corresponding_values_M7_h1 = M7_h1_values[index]
            print(f"Corresponding M7_h1 Value: {corresponding_values_M7_h1}")
            corresponding_values_M7_h2 = M7_h2_values[index]
            print(f"Corresponding M7_h2 Value: {corresponding_values_M7_h2}")
            corresponding_values_M7_h3 = M7_h3_values[index]
            print(f"Corresponding M7_h3 Value: {corresponding_values_M7_h3}")
            corresponding_values_M7_h4 = M7_h4_values[index]
            print(f"Corresponding M7_h4 Value: {corresponding_values_M7_h4}")
            corresponding_values_M7h1 = M7h1_values[index]
            print(f"Corresponding M7h1 Value: {corresponding_values_M7h1}")
            corresponding_values_M7h2 = M7h2_values[index]
            print(f"Corresponding M7h2 Value: {corresponding_values_M7h2}\n")
            
            corresponding_values_dict = {
                "corresponding_values_M7_h1": corresponding_values_M7_h1,
                "corresponding_values_M7_h2": corresponding_values_M7_h2,
                "corresponding_values_M7_h3": corresponding_values_M7_h3,
                "corresponding_values_M7_h4": corresponding_values_M7_h4,
            }

            print(f"corresponding values dict: {corresponding_values_dict}\n")

            largest_value = max(corresponding_values_dict.values())
            print(f"largest value: {largest_value}\n")
        # Find all columns with the largest value
            largest_columns = [column for column, variant in corresponding_values_dict.items() if variant == largest_value]
            print(f"largest columns: {largest_columns}\n")

            if largest_value == '1' and len(largest_columns) < 2:
                with open(output_file_path, 'a') as output_file:
                    output_file.write(f"{unique_value}\t{marker_id[index]}\t{largest_columns[0].split('_')[-1].split(']')[0]}\n")
            else:
                with open(output_file_path, 'a') as output_file:
                    output_file.write(f"{unique_value}\t{marker_id[index]}\tNaN\n")
        else:
            with open(output_file_path, 'a') as output_file:
                output_file.write(f"{unique_value}\t{marker_id[index]}\tNaN\n")


EOF

In [17]:
# h1-h4 are M7 parental genetic map

head -n 20 $WDR/test/test_M7h1.chr2.txt

h1tg000403l_1	seq-0-002_52803798	NaN
h1tg000403l_1	seq-0-002_52803856	NaN
h1tg000403l_1	seq-0-002_52915496	NaN
h1tg000403l_1	seq-0-002_52915504	NaN
h1tg000403l_1	seq-0-002_52915512	NaN
h1tg000403l_1	seq-0-002_52915572	NaN
h1tg000403l_1	seq-0-002_52915584	NaN
h1tg000403l_1	seq-0-002_52940536	NaN
h1tg000403l_1	seq-0-002_52944993	NaN
h1tg000403l_1	seq-0-002_52945000	NaN
h1tg000403l_1	seq-0-002_52956657	NaN
h1tg000403l_1	seq-0-002_53009574	NaN
h1tg000487l_1	seq-0-002_66525617	h4
h1tg000487l_1	seq-0-002_66529476	h4
h1tg000459l_1	seq-0-002_27568333	h1
h1tg000459l_1	seq-0-002_27698231	NaN
h1tg000459l_1	seq-0-002_27710359	NaN
h1tg000459l_1	seq-0-002_27766344	NaN
h1tg000459l_1	seq-0-002_27766380	h1
h1tg000459l_1	seq-0-002_27972595	NaN


* same script was run on Nui file

In [None]:
python << EOF


import numpy as np
from collections import Counter

file_path = 'phasingFile.chr2.txt'  # Replace with the actual file path

# Specify the column index (0-based) or name
column_index = 1  # Replace with the actual column index or name

# Read the specified column into a list
try:
    with open(file_path, 'r') as file:
        lines = [line.strip().split('\t') for line in file]
        marker_id = [line[0] if line[0] != '' else np.nan for line in lines]
        Nuih1_contigs_values = [line[15] if line[15] != '' else np.nan for line in lines]
        Nuih2_contigs_values = [line[16] if line[16] != '' else np.nan for line in lines]
        Nui_h1_values = [line[5] if line[5] != '' else np.nan for line in lines]
        Nui_h2_values = [line[6] if line[6] != '' else np.nan for line in lines]
        Nui_h3_values = [line[7] if line[7] != '' else np.nan for line in lines]
        Nui_h4_values = [line[8] if line[8] != '' else np.nan for line in lines]
        Nuih1_values = [line[11] if line[11] != '' else np.nan for line in lines]
        Nuih2_values = [line[12] if line[12] != '' else np.nan for line in lines]
    Nuih1_contigs_values_uniq = list(set(Nuih1_contigs_values))
    Nuih2_contigs_values_uniq = list(set(Nuih2_contigs_values))
except FileNotFoundError:
    print(f"File not found: {file_path}")
except IndexError:
    print(f"Column index out of range: {column_index}")


for unique_value in Nuih1_contigs_values_uniq:
    matching_indices = [index for index, value in enumerate(Nuih1_contigs_values) if value == unique_value]
    print(f"Indices of values equal to {unique_value}: {matching_indices}")
    corresponding_values_Nui_h1 = [Nui_h1_values[index] for index in matching_indices]
    print(f"Corresponding Nui_h1 Values: {corresponding_values_Nui_h1}\n")
    corresponding_values_Nui_h2 = [Nui_h2_values[index] for index in matching_indices]
    print(f"Corresponding Nui_h2 Values: {corresponding_values_Nui_h2}\n")
    corresponding_values_Nui_h3 = [Nui_h3_values[index] for index in matching_indices]
    print(f"Corresponding Nui_h3 Values: {corresponding_values_Nui_h3}\n")
    corresponding_values_Nui_h4 = [Nui_h4_values[index] for index in matching_indices]
    print(f"Corresponding Nui_h4 Values: {corresponding_values_Nui_h4}\n")
    corresponding_values_Nuih1 = [Nuih1_values[index] for index in matching_indices]
    print(f"Corresponding Nuih1 Values: {corresponding_values_Nuih1}\n")
    corresponding_values_Nuih2 = [Nuih2_values[index] for index in matching_indices]
    print(f"Corresponding Nuih2 Values: {corresponding_values_Nuih2}\n")
    
    total_values_num = len(corresponding_values_Nuih1)

    print(len(corresponding_values_Nuih1))

    frequency_counts = Counter(corresponding_values_Nuih1)
    for value, count in frequency_counts.items():
        print(f"Nuih1 value: {value}, Frequency: {count}")

     # Check if Nuih1 value is '1' before printing corresponding values
    for index in matching_indices:
        output_file_path = 'test_Nuih1.chr2.txt'
        if Nuih1_values[index] == '1' and Nuih2_values[index] != '1':
            print(f"Index: {index}")
            corresponding_values_Nui_h1 = Nui_h1_values[index]
            print(f"Corresponding Nui_h1 Value: {corresponding_values_Nui_h1}")
            corresponding_values_Nui_h2 = Nui_h2_values[index]
            print(f"Corresponding Nui_h2 Value: {corresponding_values_Nui_h2}")
            corresponding_values_Nui_h3 = Nui_h3_values[index]
            print(f"Corresponding Nui_h3 Value: {corresponding_values_Nui_h3}")
            corresponding_values_Nui_h4 = Nui_h4_values[index]
            print(f"Corresponding Nui_h4 Value: {corresponding_values_Nui_h4}")
            corresponding_values_Nuih1 = Nuih1_values[index]
            print(f"Corresponding Nuih1 Value: {corresponding_values_Nuih1}")
            corresponding_values_Nuih2 = Nuih2_values[index]
            print(f"Corresponding Nuih2 Value: {corresponding_values_Nuih2}\n")
            
            corresponding_values_dict = {
                "corresponding_values_Nui_h1": corresponding_values_Nui_h1,
                "corresponding_values_Nui_h2": corresponding_values_Nui_h2,
                "corresponding_values_Nui_h3": corresponding_values_Nui_h3,
                "corresponding_values_Nui_h4": corresponding_values_Nui_h4,
            }

            print(f"corresponding values dict: {corresponding_values_dict}\n")

            largest_value = max(corresponding_values_dict.values())
            print(f"largest value: {largest_value}\n")
        # Find all columns with the largest value
            largest_columns = [column for column, variant in corresponding_values_dict.items() if variant == largest_value]
            print(f"largest columns: {largest_columns}\n")

            if largest_value == '1' and len(largest_columns) < 2:
                with open(output_file_path, 'a') as output_file:
                    output_file.write(f"{unique_value}\t{marker_id[index]}\t{largest_columns[0].split('_')[-1].split(']')[0]}\n")
            else:
                with open(output_file_path, 'a') as output_file:
                    output_file.write(f"{unique_value}\t{marker_id[index]}\tNaN\n")

        else:
            with open(output_file_path, 'a') as output_file:
                output_file.write(f"{unique_value}\t{marker_id[index]}\tNaN\n")      


for unique_value in Nuih2_contigs_values_uniq:
    matching_indices = [index for index, value in enumerate(Nuih2_contigs_values) if value == unique_value]
    print(f"Indices of values equal to {unique_value}: {matching_indices}")
    corresponding_values_Nui_h1 = [Nui_h1_values[index] for index in matching_indices]
    print(f"Corresponding Nui_h1 Values: {corresponding_values_Nui_h1}\n")
    corresponding_values_Nui_h2 = [Nui_h2_values[index] for index in matching_indices]
    print(f"Corresponding Nui_h2 Values: {corresponding_values_Nui_h2}\n")
    corresponding_values_Nui_h3 = [Nui_h3_values[index] for index in matching_indices]
    print(f"Corresponding Nui_h3 Values: {corresponding_values_Nui_h3}\n")
    corresponding_values_Nui_h4 = [Nui_h4_values[index] for index in matching_indices]
    print(f"Corresponding Nui_h4 Values: {corresponding_values_Nui_h4}\n")
    corresponding_values_Nuih1 = [Nuih1_values[index] for index in matching_indices]
    print(f"Corresponding Nuih1 Values: {corresponding_values_Nuih1}\n")
    corresponding_values_Nuih2 = [Nuih2_values[index] for index in matching_indices]
    print(f"Corresponding Nuih2 Values: {corresponding_values_Nuih2}\n")
    
    total_values_num = len(corresponding_values_Nuih2)

    print(len(corresponding_values_Nuih2))

    frequency_counts = Counter(corresponding_values_Nuih2)
    for value, count in frequency_counts.items():
        print(f"Nuih2 value: {value}, Frequency: {count}")

     # Check if Nuih2 value is '1' before printing corresponding values
    for index in matching_indices:
        output_file_path = 'test_Nuih2.chr2.txt'
        if Nuih2_values[index] == '1' and Nuih1_values[index] != '1':
            print(f"Index: {index}")
            corresponding_values_Nui_h1 = Nui_h1_values[index]
            print(f"Corresponding Nui_h1 Value: {corresponding_values_Nui_h1}")
            corresponding_values_Nui_h2 = Nui_h2_values[index]
            print(f"Corresponding Nui_h2 Value: {corresponding_values_Nui_h2}")
            corresponding_values_Nui_h3 = Nui_h3_values[index]
            print(f"Corresponding Nui_h3 Value: {corresponding_values_Nui_h3}")
            corresponding_values_Nui_h4 = Nui_h4_values[index]
            print(f"Corresponding Nui_h4 Value: {corresponding_values_Nui_h4}")
            corresponding_values_Nuih1 = Nuih1_values[index]
            print(f"Corresponding Nuih1 Value: {corresponding_values_Nuih1}")
            corresponding_values_Nuih2 = Nuih2_values[index]
            print(f"Corresponding Nuih2 Value: {corresponding_values_Nuih2}\n")
            
            corresponding_values_dict = {
                "corresponding_values_Nui_h1": corresponding_values_Nui_h1,
                "corresponding_values_Nui_h2": corresponding_values_Nui_h2,
                "corresponding_values_Nui_h3": corresponding_values_Nui_h3,
                "corresponding_values_Nui_h4": corresponding_values_Nui_h4,
            }

            print(f"corresponding values dict: {corresponding_values_dict}\n")

            largest_value = max(corresponding_values_dict.values())
            print(f"largest value: {largest_value}\n")
        # Find all columns with the largest value
            largest_columns = [column for column, variant in corresponding_values_dict.items() if variant == largest_value]
            print(f"largest columns: {largest_columns}\n")

            if largest_value == '1' and len(largest_columns) < 2:
               
                with open(output_file_path, 'a') as output_file:
                    output_file.write(f"{unique_value}\t{marker_id[index]}\t{largest_columns[0].split('_')[-1].split(']')[0]}\n")
            else:
                with open(output_file_path, 'a') as output_file:
                    output_file.write(f"{unique_value}\t{marker_id[index]}\tNaN\n")

        else:
            with open(output_file_path, 'a') as output_file:
                output_file.write(f"{unique_value}\t{marker_id[index]}\tNaN\n")

EOF

* formating the output files for plotting 

In [18]:
# giving colours to those variants able to be assigned with parental haplotype

head -n 20 003.chr2PhasedLocus/chr2.markers_M7h1.txt

HP_hA	229781	lightgrey
HP_hA	229801	lightgrey
HP_hA	229820	lightgrey
HP_hA	229830	lightgrey
HP_hA	247149	lightgrey
HP_hA	247160	lightgrey
HP_hA	252157	lightgrey
HP_hA	268324	lightgrey
HP_hA	297604	lightgrey
HP_hA	299184	lightgrey
HP_hA	372975	lightgrey
HP_hA	373014	lightgrey
HP_hA	373016	lightgrey
HP_hA	373057	lightgrey
HP_hA	386399	lightgrey
HP_hA	428025	blue
HP_hA	439038	lightgrey
HP_hA	530031	lightgrey
HP_hA	700408	lightgrey
HP_hA	736734	lightgrey
