# Location data Python script  

## Aim
Append location data to a nexus file for use in PopART (Population Analysis with Reticulate Trees)

## needed files
`locations.csv` - location data provided in supplement of Zhong et al.
`polymorphic_sites.nex` - nexus output file generated using DnaSP from `Haplotypes.fasta`


In [None]:
import pandas as pd

## Load location data
Read in location data from `locations.csv` and trim unneeded information

In [226]:
# Load the full supplementary table from the CSV file.
Zhong_locations_df = pd.read_csv("locations.csv")

# Keep only the accession column followed by the per-location columns;
# the accession column comes first because add_location() treats the
# first column as the sequence name.
Zhong_locations_trimmed_df = Zhong_locations_df[
    [
        'GenBank access no.',
        'GZ', 'XM', 'JS', 'TW', 'JP', 'SG',
        'IT', 'LA01', 'LA11', 'NJ', 'TX', 'HW',
    ]
]

## `add_location` function
<b>Description:</b> Extracts and appends location data to a nexus sequence polymorphism file in a format readable by PopART  
  
<b>Arguments:</b>  
`nexus_file`: the input nexus file containing sequence polymorphism data  
`location_df`: a data frame containing location data. The first column should contain names and subsequent columns should contain the number of samples for each location.  
`output_file`: the name of the output file you would like to create. If it does not exist, it will be created.
  
<b>Return:</b> nothing is returned; the function writes an output file that is a copy of the input file with location data appended in a format readable by PopART

In [None]:
def add_location(nexus_file, location_df, output_file):
    """Copy a nexus file and append a PopART-style TRAITS block to the copy.

    Args:
        nexus_file: Path of the input nexus file. Its contents are copied
            verbatim to the output before the traits block is appended.
        location_df: Data frame whose first column holds the sequence names
            and whose remaining columns hold the values for each location.
            The headers of the remaining columns become the trait labels.
        output_file: Path of the file to write. It is created if missing
            and overwritten if it already exists.

    Returns:
        None. The result is written to ``output_file``.
    """
    # One trait per location column; the first column holds the names.
    trait_labels = location_df.columns.values[1:]
    ntraits = len(location_df.columns) - 1

    # Context managers guarantee both handles are closed (and the output is
    # flushed to disk) before returning, so the file can safely be re-read
    # straight after this call.
    with open(nexus_file, 'r') as source, open(output_file, 'w') as out:
        # Copy the existing nexus content unchanged.
        out.writelines(source)

        # Write the traits header in the format readable by PopART.
        out.write('\n\nBEGIN TRAITS;\n\n')
        out.write(f'Dimensions NTRAITS={ntraits};\n')
        out.write('Format labels=yes missing=? separator=Comma;\n')
        labels = ''.join(f'{label} ' for label in trait_labels)
        out.write(f'TraitLabels {labels};\n\nMATRIX\n')

        # One matrix row per sequence: quoted name, two spaces, then the
        # comma-terminated values for each location.
        for row in location_df.values:
            counts = ''.join(str(value) + ',' for value in row[1:])
            out.write(f"\n'{row[0]}'  {counts}")

        # NOTE(review): the TRAITS block is left without a closing ';' and
        # 'END;' so the output stays identical to the previous version --
        # confirm that PopART accepts an unterminated block.

## run `add_location()` on Zhong et al. data to append location data

In [None]:
# Append the trimmed location table to a copy of the polymorphic-sites nexus file.
add_location(
    nexus_file="polymorphic_sites.nex",
    location_df=Zhong_locations_trimmed_df,
    output_file='polymorphic_sites_locations.nex',
)

## For readability and visualization, replace GenBank accession numbers with haplotype numbers used by Zhong et al.

In [229]:
# Pair every haplotype label with its GenBank accession number
Zhong_labels_df = Zhong_locations_df[['Haplotype','GenBank access no.']]

# Read the whole nexus file into memory so the labels can be swapped in place
with open('polymorphic_sites_locations.nex','r') as nexus_in:
    data = nexus_in.read()

for haplotype, accession in Zhong_labels_df.values:
    # Replace the versioned form (accession + ".1", as used by GenBank) first,
    # then the bare accession as given in the supplement CSV; doing it in the
    # other order would leave a dangling ".1" after the haplotype label.
    data = data.replace(accession + ".1", haplotype).replace(accession, haplotype)

# Overwrite the same file with the relabelled content
with open(r'polymorphic_sites_locations.nex','w') as nexus_out:
    nexus_out.write(data)