In [1]:
import sys
print("Python Executable Path:", sys.executable) #String that contains absolute path to Python interpreter
import os #imports operating system (desktop)
print("Operating System Desktop Path:", os.getcwd())


Python Executable Path: /Users/mmandig/miniconda3/bin/python
Operating System Desktop Path: /Users/mmandig/Desktop


In [2]:
from Bio import SeqIO # Import Biopython's Sequence Input/Output interface
from Bio.Seq import Seq   # Import Biopython's Seq class
from Bio.SeqIO.FastaIO import SimpleFastaParser #Imports Parser for Large Files   
import pandas as pd       # Import Pandas for data manipulation
import numpy as np 
import math as math       # Import scientific and mathematic calculations
import matplotlib.pyplot as plt #Import matplotlib for visualizations
from itertools import compress

In [3]:
file_path = "HIV1_ALL_2022_env_PRO.fasta"

#The SeqIO allows to parse the data frame

sequence_records = []
with open(file_path) as fasta_file:
        for record in SeqIO.parse(fasta_file , "fasta"):
            sequence_records.append({
                "Header": str(record.id),
                "Sequence": (record.seq)  
            })

seq_df = pd.DataFrame.from_records(sequence_records)


In [4]:
#This line creates a seperate table for the sequence data and converts it into a string 
seq_data = seq_df["Sequence"].astype(str)

#This line splits the string into five categories and creates 5 columns
id = seq_df['Header'].str.split(".", n = 4 ,expand=True)
id.rename(columns={0: "Subtype", 1: "Country", 2: "Year", 3: "Sequence ID", 4: "Sequence Accession Number"}, inplace= 'TRUE')
id.insert(5, "Sequence Data", seq_data) 
id
id = id[~id["Year"].isin(["x"]) & ~id["Country"].isin(["x"])]
id["Year"] = pd.to_numeric(id["Year"], errors="coerce")

#Drop rows where Year could not be converted
id = id.dropna(subset=["Year"])

#Convert the values to integer
id["Year"] = id["Year"].astype(int)

unique_num = id['Year'].drop_duplicates()
unique_num_sort = unique_num.sort_values()
if id["Year"].max() < 3000:  
    id.loc[id["Year"] < 75, "Year"] += 2000
    id.loc[(id["Year"] > 75) & (id["Year"] < 2000), "Year"] += 1900


In [5]:

USMHRP_RV144_query = id.query('Subtype in ("01_AE") & Country in ("TH") & Year >= 2003 & Year <= 2006')

print(USMHRP_RV144_query)

     Subtype Country  Year    Sequence ID Sequence Accession Number  \
7273   01_AE      TH  2003         TH7229                  KU168309   
7274   01_AE      TH  2004     04TH107542                  JN248318   
7275   01_AE      TH  2004     04TH328531                  JN248324   
7276   01_AE      TH  2004     04TH427990                  JN248327   
7277   01_AE      TH  2004     04TH505841                  JN248328   
...      ...     ...   ...            ...                       ...   
7388   01_AE      TH  2006      AA127a02R                  JX448279   
7389   01_AE      TH  2006      AA129a02R                  JX448289   
7390   01_AE      TH  2006      AA130a07R                  JX448301   
7391   01_AE      TH  2006  T501602_sga01                  JF297225   
7392   01_AE      TH  2006  T614109_sga02                  HQ691082   

                                          Sequence Data  
7273  MRVR--ETQ--M----N-W-------P-NL---W------------...  
7274  MRVK--ETQ--M----N-W------

In [6]:
def shannon_entropy (invar):

    # if the input variable is a string, then split the string into a list
    symbol_set = invar
    if isinstance (symbol_set, str):
        symbol_set = list (symbol_set)

    # create a dictionary of symbol frequencies
    setLen = len (symbol_set)
    symbolDict = {}
    for symbol in symbol_set:
        if symbol in symbolDict:
            symbolDict[symbol] += 1
        else:
            symbolDict[symbol] = 1

    # and finally, sum the entropy contributions for each symbol and return
    # the result
    entropy = 0.0
    for symbol in symbolDict:
        fraction = float (symbolDict[symbol]) / setLen
        entropy += fraction * math.log (fraction, 2)
    return (entropy * -1)

In [7]:
n_positions = len(seq_data[0])
entropyResults = [0] * n_positions #This empty list stores the amino acids characters found at that position across all sequences 

for pos in range(n_positions):
    aaChars = [] #initialize a list for storing our characters as position "pos"
    for seq in seq_data:
       aaChars.append(seq[pos]) #Append the character at position "pos" in sequence "seq" to the "aaChars" list
    entropyResults[pos] = shannon_entropy (aaChars)

# Reference Sequences RV 144

In [8]:
#This code calculates the Shannon entropy of the USMHRP RV144 reference sequence data

USMHRP_RV144_seq_data = USMHRP_RV144_query["Sequence Data"]

#print(USMHRP_RV144_seq_data.count()) - 120 sequences

USMHRP_RV144_entropy = [0] * n_positions
for pos in range(n_positions):
    aaChars = [] #initialize a list for storing our characters as position "pos"
    for USMHRP_RV144 in USMHRP_RV144_seq_data: 
       aaChars.append(USMHRP_RV144[pos]) #Append the character at position "pos" in sequence "seq" to the "aaChars" list
    USMHRP_RV144_entropy[pos] = shannon_entropy (aaChars)

USMHRP_RV144_entropy_10 = USMHRP_RV144_entropy[:10]
print(USMHRP_RV144_entropy_10)

print(USMHRP_RV144_entropy)
print(n_positions)


[0.06952964699480783, 0.4822958260288636, 0.21084230031853213, 1.241094823127904, -0.0, -0.0, 0.9796969663540716, 0.16866093149667025, 0.34897841548534736, -0.0]
[0.06952964699480783, 0.4822958260288636, 0.21084230031853213, 1.241094823127904, -0.0, -0.0, 0.9796969663540716, 0.16866093149667025, 0.34897841548534736, -0.0, -0.0, 0.7238386951152931, -0.0, -0.0, -0.0, -0.0, 0.9102255026120049, -0.0, 0.1222915970693747, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.23788490446716992, -0.0, 0.4399456644011146, 0.6593378079029247, -0.0, -0.0, -0.0, 0.13895826373604137, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 1.0665408054991348, 0.1916183273480325, -0.0, -0.0, -0.0, 0.1222915970693747, -0.0, 0.28639695711595625, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.06952964699480783, -0.0, -0.0, -0.0, 0.4822958260288636, -0.0, -0.0, -0.0, 0.06952964

In [9]:
lanl_map = pd.read_csv("/Users/mmandig/Downloads/Fred Hutch HIV maps/lanl_env_aa_2022.map", sep="|")



lanl_map['hxb2Pos'] = pd.to_numeric(lanl_map['hxb2Pos'], errors='coerce')
lanl_map = lanl_map.dropna()
lanl_map['hxb2Pos'] = lanl_map['hxb2Pos'].astype(int) #Store conversion of type back into Dataframe
lanl_map = lanl_map.drop(lanl_map[
    lanl_map['hxb2Pos'].between(132, 152) |
    lanl_map['hxb2Pos'].between(185, 190) |
    lanl_map['hxb2Pos'].between(396, 410) |
    lanl_map['hxb2Pos'].between(460, 467)
    
    ].index)

print(lanl_map)

      posNum  hxb2Pos hxb2aa
0          1        1      M
1          2        2      R
2          3        3      V
3          4        4      K
6          7        5      E
...      ...      ...    ...
2162    2163      853      R
2163    2164      854      I
2164    2165      855      L
2165    2166      856      L
2168    2169      857      *

[807 rows x 3 columns]


In [10]:
#Create a data frame that filters out the Lanl_map based on hxb2pos to link it with the RV144 reference sequences
exclude_positions = (
    lanl_map['hxb2Pos'].between(132, 152) |
    lanl_map['hxb2Pos'].between(185, 190) |
    lanl_map['hxb2Pos'].between(396, 410) |
    lanl_map['hxb2Pos'].between(460, 467)
)

#Replaces the values of the rows where condition evaluates to true
mask = ~exclude_positions

#Applies mask to hxb2Pos
filtered_positions = list(compress(lanl_map['hxb2Pos'], mask))

#Applies mask to entropy values
filtered_entropy = list(compress(USMHRP_RV144_entropy, mask))

#Creates a dataframe with filtered results with reference sequences
filtered_rv144_reference = pd.DataFrame({'Entropy': filtered_entropy})

print(filtered_rv144_reference)

      Entropy
0    0.069530
1    0.482296
2    0.210842
3    1.241095
4   -0.000000
..        ...
802 -0.000000
803  0.398978
804 -0.000000
805 -0.000000
806 -0.000000

[807 rows x 1 columns]


# Study Sequences RV144

In [11]:
usmhrp_rv144_env_map = pd.read_csv("/Users/mmandig/Downloads/Fred Hutch HIV maps/training_studies/rv144/map/env.map", sep="|")

usmhrp_rv144_env_map['hxb2Pos'] = pd.to_numeric(usmhrp_rv144_env_map['hxb2Pos'], errors='coerce')
usmhrp_rv144_env_map = usmhrp_rv144_env_map.dropna()
usmhrp_rv144_env_map['hxb2Pos'] = usmhrp_rv144_env_map['hxb2Pos'].astype(int) #Store conversion of type back into Dataframe
usmhrp_rv144_env_map = usmhrp_rv144_env_map.drop(usmhrp_rv144_env_map[
    usmhrp_rv144_env_map['hxb2Pos'].between(132, 152) |
    usmhrp_rv144_env_map['hxb2Pos'].between(185, 190) |
    usmhrp_rv144_env_map['hxb2Pos'].between(396, 410) |
    usmhrp_rv144_env_map['hxb2Pos'].between(460, 467)
    
    ].index)



In [12]:
usmhrp_rv144_study_seq_map = pd.read_csv("/Users/mmandig/Downloads/Fred Hutch HIV maps/training_studies/rv144/seq/env.aa.mindist.all.fasta")
print(usmhrp_rv144_study_seq_map)

                                          >AA036|a|01R
0    MRVRGTRMNWPNLW----KWGTLILGLVIICSASNNLWVTVYYGVP...
1                                         >AA037|a|WG9
2    MKVKGTRMIWPDLW----KWGTLILGLVIICNASNDSWVTVYYGVP...
3                                         >AA034|a|wg2
4    -RVMGTQMNWPNLW----KWGTLILGLVIICSASNDLWVTVYYGVP...
..                                                 ...
212  MRVKETQRNWPNLW----KWGTLILGLVIICSAADNLWVTVYYGVP...
213                                       >AA058|a|04R
214  MRVKGTQMNWPNLW----RWGTLILGLVIICSASNNLWVTVYYGVP...
215                                       >AA059|a|WG9
216  MRVKETQMNWPNLW----KWGTLILGLVIICSASDNLWVTVYYGVP...

[217 rows x 1 columns]


In [13]:
#This code calculates the Shannon entropy of the USMHRP RV144 study sequence data

usmhrp_rv144_study_env_seq = USMHRP_RV144_query["Sequence Data"]

USMHRP_RV144_study_seq_entropy = {} # A list of positions to help the RV144 env map iterable


for pos in usmhrp_rv144_env_map['posNum']:
    aaChars = [] #initialize a list for storing our characters as position "pos"
    for seq in usmhrp_rv144_study_env_seq: 
       if pos < len(seq): #Ensures the position is within the sequence length
         aaChars.append(seq[pos])
    USMHRP_RV144_study_seq_entropy[pos] = shannon_entropy (aaChars)

#This converts dictionary values to a list and takes the first 10 entropy values
USMHRP_RV144_study_seq_entropy_10 = list(USMHRP_RV144_study_seq_entropy.values())[:10]
print(USMHRP_RV144_study_seq_entropy_10)
print(len(USMHRP_RV144_study_seq_entropy)) #807 is the length


[0.4822958260288636, 0.21084230031853213, 1.241094823127904, -0.0, -0.0, -0.0, -0.0, 0.7238386951152931, -0.0, -0.0]
807


In [15]:
#Creates a dataframe with filtered results for the study sequences
#filler = ['Filler']
#filler.fillna("Filler", inplace=True)

filtered_rv144_study = pd.DataFrame({'Entropy': USMHRP_RV144_study_seq_entropy})
#filtered_rv144_study['Filler'] = filler

print(filtered_rv144_study)


       Entropy
1     0.482296
2     0.210842
3     1.241095
4    -0.000000
5    -0.000000
...        ...
1007 -0.000000
1008 -0.000000
1009 -0.000000
1010 -0.000000
1011 -0.000000

[807 rows x 1 columns]


# Subtraction of Position-Wise Entropy
Between RV144 study sequences and RV144 reference sequences

In [17]:


difference_entropy = filtered_rv144_study["Entropy"].values - filtered_rv144_reference['Entropy'].values

join_dif_entropy = pd.DataFrame({"Difference in Entropy": difference_entropy})

print(join_dif_entropy)

     Difference in Entropy
0                 0.412766
1                -0.271454
2                 1.030253
3                -1.241095
4                 0.000000
..                     ...
802               0.000000
803              -0.398978
804               0.000000
805               0.000000
806               0.000000

[807 rows x 1 columns]
