In [None]:
import sys
print("Python Executable Path:", sys.executable) #String that contains absolute path to Python interpreter
import os #imports operating system (desktop)
print("Operating System Desktop Path:", os.getcwd())


Python Executable Path: /Users/mmandig/miniconda3/bin/python
Operating System Desktop Path: /Users/mmandig/Desktop


In [1]:
from Bio import SeqIO # Import Biopython's Sequence Input/Output interface
from Bio.Seq import Seq   # Import Biopython's Seq class
from Bio.SeqIO.FastaIO import SimpleFastaParser #Imports Parser for Large Files   
import pandas as pd       # Import Pandas for data manipulation
import numpy as np 
import math as math       # Import scientific and mathematic calculations
import matplotlib.pyplot as plt #Import matplotlib for visualizations
from itertools import compress

In [2]:
file_path = "HIV1_ALL_2022_env_PRO.fasta"

#The SeqIO allows to parse the data frame

sequence_records = []
with open(file_path) as fasta_file:
        for record in SeqIO.parse(fasta_file , "fasta"):
            sequence_records.append({
                "Header": str(record.id),
                "Sequence": (record.seq)  
            })

seq_df = pd.DataFrame.from_records(sequence_records)


In [3]:
#This line creates a seperate table for the sequence data and converts it into a string 
seq_data = seq_df["Sequence"].astype(str)

#This line splits the string into five categories and creates 5 columns
id = seq_df['Header'].str.split(".", n = 4 ,expand=True)
id.rename(columns={0: "Subtype", 1: "Country", 2: "Year", 3: "Sequence ID", 4: "Sequence Accession Number"}, inplace= 'TRUE')
id.insert(5, "Sequence Data", seq_data) 
id
id = id[~id["Year"].isin(["x"]) & ~id["Country"].isin(["x"])]
id["Year"] = pd.to_numeric(id["Year"], errors="coerce")

#Drop rows where Year could not be converted
id = id.dropna(subset=["Year"])

#Convert the values to integer
id["Year"] = id["Year"].astype(int)

unique_num = id['Year'].drop_duplicates()
unique_num_sort = unique_num.sort_values()
if id["Year"].max() < 3000:  
    id.loc[id["Year"] < 75, "Year"] += 2000
    id.loc[(id["Year"] > 75) & (id["Year"] < 2000), "Year"] += 1900


In [4]:

USMHRP_RV144_query = id.query('Subtype in ("01_AE") & Country in ("TH") & Year >= 2003 & Year <= 2006')

print(USMHRP_RV144_query)

     Subtype Country  Year    Sequence ID Sequence Accession Number  \
7273   01_AE      TH  2003         TH7229                  KU168309   
7274   01_AE      TH  2004     04TH107542                  JN248318   
7275   01_AE      TH  2004     04TH328531                  JN248324   
7276   01_AE      TH  2004     04TH427990                  JN248327   
7277   01_AE      TH  2004     04TH505841                  JN248328   
...      ...     ...   ...            ...                       ...   
7388   01_AE      TH  2006      AA127a02R                  JX448279   
7389   01_AE      TH  2006      AA129a02R                  JX448289   
7390   01_AE      TH  2006      AA130a07R                  JX448301   
7391   01_AE      TH  2006  T501602_sga01                  JF297225   
7392   01_AE      TH  2006  T614109_sga02                  HQ691082   

                                          Sequence Data  
7273  MRVR--ETQ--M----N-W-------P-NL---W------------...  
7274  MRVK--ETQ--M----N-W------

In [5]:
def shannon_entropy (invar):

    # if the input variable is a string, then split the string into a list
    symbol_set = invar
    if isinstance (symbol_set, str):
        symbol_set = list (symbol_set)

    # create a dictionary of symbol frequencies
    setLen = len (symbol_set)
    symbolDict = {}
    for symbol in symbol_set:
        if symbol in symbolDict:
            symbolDict[symbol] += 1
        else:
            symbolDict[symbol] = 1

    # and finally, sum the entropy contributions for each symbol and return
    # the result
    entropy = 0.0
    for symbol in symbolDict:
        fraction = float (symbolDict[symbol]) / setLen
        entropy += fraction * math.log (fraction, 2)
    return (entropy * -1)

In [6]:
n_positions = len(seq_data[0])
entropyResults = [0] * n_positions #This empty list stores the amino acids characters found at that position across all sequences 

for pos in range(n_positions):
    aaChars = [] #initialize a list for storing our characters as position "pos"
    for seq in seq_data:
       aaChars.append(seq[pos]) #Append the character at position "pos" in sequence "seq" to the "aaChars" list
    entropyResults[pos] = shannon_entropy (aaChars)

# Reference Sequences RV 144

In [16]:
#This code calculates the Shannon entropy of the USMHRP RV144 reference sequence data

USMHRP_RV144_seq_data = USMHRP_RV144_query["Sequence Data"]

#print(USMHRP_RV144_seq_data.count()) - 120 sequences

USMHRP_RV144_entropy = [0] * n_positions
for pos in range(n_positions):
    aaChars = [] #initialize a list for storing our characters as position "pos"
    for USMHRP_RV144 in USMHRP_RV144_seq_data: 
       aaChars.append(USMHRP_RV144[pos]) #Append the character at position "pos" in sequence "seq" to the "aaChars" list
    #USMHRP_RV144_entropy[pos] = shannon_entropy (aaChars)

    entropy_value = shannon_entropy(aaChars)  
    USMHRP_RV144_entropy[pos] = 0.0 if entropy_value == -0.0 else entropy_value  # Convert -0.0 to 0.0


USMHRP_RV144_entropy_10 = USMHRP_RV144_entropy[:10]
print(USMHRP_RV144_entropy_10)

print(USMHRP_RV144_entropy)
print(n_positions)


[0.06952964699480783, 0.4822958260288636, 0.21084230031853213, 1.241094823127904, 0.0, 0.0, 0.9796969663540716, 0.16866093149667025, 0.34897841548534736, 0.0]
[0.06952964699480783, 0.4822958260288636, 0.21084230031853213, 1.241094823127904, 0.0, 0.0, 0.9796969663540716, 0.16866093149667025, 0.34897841548534736, 0.0, 0.0, 0.7238386951152931, 0.0, 0.0, 0.0, 0.0, 0.9102255026120049, 0.0, 0.1222915970693747, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.23788490446716992, 0.0, 0.4399456644011146, 0.6593378079029247, 0.0, 0.0, 0.0, 0.13895826373604137, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0665408054991348, 0.1916183273480325, 0.0, 0.0, 0.0, 0.1222915970693747, 0.0, 0.28639695711595625, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.06952964699480783, 0.0, 0.0, 0.0, 0.4822958260288636, 0.0, 0.0, 0.0, 0.06952964699480783, 0.0, 0.41698826854975235, 0.0, 0.0, 0.0, 0.6394584702161336, 0

In [8]:
#This code adds the unfiltered columns into a dataframe

filtered_lanl_map = pd.read_csv("/Users/mmandig/Downloads/Fred Hutch HIV maps/lanl_env_aa_2022.map", sep="|")

filtered_lanl_map.insert(3, "Filtered Reference Entropy", USMHRP_RV144_entropy)
print(filtered_lanl_map)

filtered_lanl_map['hxb2Pos'] = pd.to_numeric(filtered_lanl_map['hxb2Pos'], errors='coerce')
filtered_lanl_map= filtered_lanl_map.dropna()
filtered_lanl_map['hxb2Pos'] = filtered_lanl_map['hxb2Pos'].astype(int) #Store conversion of type back into Dataframe
filtered_lanl_map = filtered_lanl_map.drop(filtered_lanl_map[
    filtered_lanl_map['hxb2Pos'].between(132, 152) |
    filtered_lanl_map['hxb2Pos'].between(185, 190) |
    filtered_lanl_map['hxb2Pos'].between(396, 410) |
    filtered_lanl_map['hxb2Pos'].between(460, 467)
    
    ].index)

print(filtered_lanl_map)


      posNum hxb2Pos hxb2aa  Filtered Reference Entropy
0          1       1      M                    0.069530
1          2       2      R                    0.482296
2          3       3      V                    0.210842
3          4       4      K                    1.241095
4          5      4a      -                    0.000000
...      ...     ...    ...                         ...
2164    2165     855      L                    0.000000
2165    2166     856      L                    1.157070
2166    2167    856a      -                    0.000000
2167    2168    856b      -                    0.000000
2168    2169     857      *                    0.000000

[2169 rows x 4 columns]
      posNum  hxb2Pos hxb2aa  Filtered Reference Entropy
0          1        1      M                    0.069530
1          2        2      R                    0.482296
2          3        3      V                    0.210842
3          4        4      K                    1.241095
6          7      

In [9]:
#This code checks then filtered LANL map 
print(filtered_lanl_map[:10])
print(filtered_lanl_map)

    posNum  hxb2Pos hxb2aa  Filtered Reference Entropy
0        1        1      M                    0.069530
1        2        2      R                    0.482296
2        3        3      V                    0.210842
3        4        4      K                    1.241095
6        7        5      E                    0.979697
7        8        6      K                    0.168661
8        9        7      Y                    0.348978
11      12        8      Q                    0.723839
15      16        9      H                    0.000000
16      17       10      L                    0.910226
      posNum  hxb2Pos hxb2aa  Filtered Reference Entropy
0          1        1      M                    0.069530
1          2        2      R                    0.482296
2          3        3      V                    0.210842
3          4        4      K                    1.241095
6          7        5      E                    0.979697
...      ...      ...    ...                         

# Not correct filtering code for reference sequences

In [10]:
lanl_map = pd.read_csv("/Users/mmandig/Downloads/Fred Hutch HIV maps/lanl_env_aa_2022.map", sep="|")



lanl_map['hxb2Pos'] = pd.to_numeric(lanl_map['hxb2Pos'], errors='coerce')
lanl_map = lanl_map.dropna()
lanl_map['hxb2Pos'] = lanl_map['hxb2Pos'].astype(int) #Store conversion of type back into Dataframe
lanl_map = lanl_map.drop(lanl_map[
    lanl_map['hxb2Pos'].between(132, 152) |
    lanl_map['hxb2Pos'].between(185, 190) |
    lanl_map['hxb2Pos'].between(396, 410) |
    lanl_map['hxb2Pos'].between(460, 467)
    
    ].index)

print(lanl_map)

      posNum  hxb2Pos hxb2aa
0          1        1      M
1          2        2      R
2          3        3      V
3          4        4      K
6          7        5      E
...      ...      ...    ...
2162    2163      853      R
2163    2164      854      I
2164    2165      855      L
2165    2166      856      L
2168    2169      857      *

[807 rows x 3 columns]


In [11]:
#Create a data frame that filters out the Lanl_map based on hxb2pos to link it with the RV144 reference sequences
exclude_positions = (
    lanl_map['hxb2Pos'].between(132, 152) |
    lanl_map['hxb2Pos'].between(185, 190) |
    lanl_map['hxb2Pos'].between(396, 410) |
    lanl_map['hxb2Pos'].between(460, 467)
)

#Replaces the values of the rows where condition evaluates to true
mask = ~exclude_positions

#Applies mask to hxb2Pos
filtered_positions = list(compress(lanl_map['hxb2Pos'], mask))

#Applies mask to entropy values
filtered_entropy = list(compress(USMHRP_RV144_entropy, mask))

#Creates a dataframe with filtered results with reference sequences
filtered_rv144_reference = pd.DataFrame({'Reference Sequences Entropy': filtered_entropy})

print(filtered_rv144_reference)


     Reference Sequences Entropy
0                       0.069530
1                       0.482296
2                       0.210842
3                       1.241095
4                       0.000000
..                           ...
802                     0.000000
803                     0.398978
804                     0.000000
805                     0.000000
806                     0.000000

[807 rows x 1 columns]


In [12]:
#This box is a dump for the reference sequence code:

filtered_RV144_entropy_10 = filtered_rv144_reference[:10]
print(filtered_RV144_entropy_10)


   Reference Sequences Entropy
0                     0.069530
1                     0.482296
2                     0.210842
3                     1.241095
4                     0.000000
5                     0.000000
6                     0.979697
7                     0.168661
8                     0.348978
9                     0.000000


# Study Sequences RV144

In [13]:
#This code creates a dataframe of the unfiltered data and then filters after
usmhrp_rv144_env_map = pd.read_csv("/Users/mmandig/Downloads/Fred Hutch HIV maps/training_studies/rv144/map/env.map", sep="|")
usmhrp_rv144_env_map

Unnamed: 0,posNum,hxb2Pos,hxb2aa
0,1,1,M
1,2,2,R
2,3,3,V
3,4,4,K
4,5,5,E
...,...,...,...
1006,1007,853,R
1007,1008,854,I
1008,1009,855,L
1009,1010,856,L


In [14]:
#This code opens up the RV144 study sequences
file_path = "/Users/mmandig/Downloads/Fred Hutch HIV maps/training_studies/rv144/seq/env.aa.mindist.all.fasta"

#The SeqIO allows to parse the data frame

sequence_records = []
with open(file_path) as fasta_file:
        for record in SeqIO.parse(fasta_file , "fasta"):
            sequence_records.append({
                "Header": str(record.id),
                "Sequence": (record.seq)  
            })

seq_study_df = pd.DataFrame.from_records(sequence_records)

print(seq_study_df)

          Header                                           Sequence
0    AA036|a|01R  (M, R, V, R, G, T, R, M, N, W, P, N, L, W, -, ...
1    AA037|a|WG9  (M, K, V, K, G, T, R, M, I, W, P, D, L, W, -, ...
2    AA034|a|wg2  (-, R, V, M, G, T, Q, M, N, W, P, N, L, W, -, ...
3    AA035|a|02R  (M, R, V, K, G, T, Q, R, N, W, P, N, W, W, -, ...
4     AA032|a|02  (M, R, V, R, E, T, Q, M, N, W, P, N, L, W, -, ...
..           ...                                                ...
104  AA055|a|WG4  (M, R, V, K, G, T, Q, M, T, W, P, N, W, W, -, ...
105  AA056|a|WG6  (M, R, V, K, E, T, Q, M, N, W, P, N, L, W, -, ...
106  AA057|a|08R  (M, R, V, K, E, T, Q, R, N, W, P, N, L, W, -, ...
107  AA058|a|04R  (M, R, V, K, G, T, Q, M, N, W, P, N, L, W, -, ...
108  AA059|a|WG9  (M, R, V, K, E, T, Q, M, N, W, P, N, L, W, -, ...

[109 rows x 2 columns]


In [23]:
#This code calculates the entropy of the sequence data for the study sites

USMHRP_RV144_study_seq_data = seq_study_df["Sequence"].astype[int]

#Define n_positions based on the first sequence's length
n_positions = len(USMHRP_RV144_study_seq_data.iloc[0])

USMHRP_RV144_study_entropy = [0] * n_positions
for pos in range(n_positions):
    aaChars = [] #initialize a list for storing our characters as position "pos"
    
    for seq in USMHRP_RV144_study_seq_data: 
       aaChars.append(seq[pos]) #Append the character at position "pos" in sequence "seq" to the "aaChars" list
    #USMHRP_RV144_entropy[pos] = shannon_entropy (aaChars)

    study_entropy_value = shannon_entropy(aaChars)  
    USMHRP_RV144_study_entropy[pos] = 0.0 if study_entropy_value == -0.0 else study_entropy_value  # Convert -0.0 to 0.0

TypeError: 'method' object is not subscriptable

In [None]:


usmhrp_rv144_env_map.insert(3, "Filtered Reference Entropy", USMHRP_RV144_entropy)
print(usmhrp_rv144_env_map)

ValueError: Length of values (2169) does not match length of index (1011)

# Incorrect Code

In [None]:
#This code loads up the hxb2Pos for the rV144 study sequences

usmhrp_rv144_env_map = pd.read_csv("/Users/mmandig/Downloads/Fred Hutch HIV maps/training_studies/rv144/map/env.map", sep="|")

usmhrp_rv144_env_map['hxb2Pos'] = pd.to_numeric(usmhrp_rv144_env_map['hxb2Pos'], errors='coerce')
usmhrp_rv144_env_map = usmhrp_rv144_env_map.dropna()
usmhrp_rv144_env_map['hxb2Pos'] = usmhrp_rv144_env_map['hxb2Pos'].astype(int) #Store conversion of type back into Dataframe
usmhrp_rv144_env_map = usmhrp_rv144_env_map.drop(usmhrp_rv144_env_map[
    usmhrp_rv144_env_map['hxb2Pos'].between(132, 152) |
    usmhrp_rv144_env_map['hxb2Pos'].between(185, 190) |
    usmhrp_rv144_env_map['hxb2Pos'].between(396, 410) |
    usmhrp_rv144_env_map['hxb2Pos'].between(460, 467)
    
    ].index)

print(len(usmhrp_rv144_env_map))

usmhrp_rv144_env_map

807


Unnamed: 0,posNum,hxb2Pos,hxb2aa
0,1,1,M
1,2,2,R
2,3,3,V
3,4,4,K
4,5,5,E
...,...,...,...
1006,1007,853,R
1007,1008,854,I
1008,1009,855,L
1009,1010,856,L


In [None]:
#This code attempts to filter out the hxb2 pos and the other positions along with it


#usmhrp_rv144_env_map = usmhrp_rv144_env_map.merge(
#lanl_map[['hxb2aa', 'posNum']],
#left_on='hxb2Pos',
#right_on='posNum',
#how='left'
#)




#print(usmhrp_rv144_env_map)

usmhrp_rv144_env_map_cleaned = usmhrp_rv144_env_map.dropna()
print(usmhrp_rv144_env_map_cleaned)
usmhrp_rv144_env_map_cleaned.reset_index

      posNum  hxb2Pos hxb2aa
0          1        1      M
1          2        2      R
2          3        3      V
3          4        4      K
4          5        5      E
...      ...      ...    ...
1006    1007      853      R
1007    1008      854      I
1008    1009      855      L
1009    1010      856      L
1010    1011      857      *

[807 rows x 3 columns]


<bound method DataFrame.reset_index of       posNum  hxb2Pos hxb2aa
0          1        1      M
1          2        2      R
2          3        3      V
3          4        4      K
4          5        5      E
...      ...      ...    ...
1006    1007      853      R
1007    1008      854      I
1008    1009      855      L
1009    1010      856      L
1010    1011      857      *

[807 rows x 3 columns]>

In [None]:
#This code opens up the RV144 study sequences
file_path = "/Users/mmandig/Downloads/Fred Hutch HIV maps/training_studies/rv144/seq/env.aa.mindist.all.fasta"

#The SeqIO allows to parse the data frame

sequence_records = []
with open(file_path) as fasta_file:
        for record in SeqIO.parse(fasta_file , "fasta"):
            sequence_records.append({
                "Header": str(record.id),
                "Sequence": (record.seq)  
            })

seq_study_df = pd.DataFrame.from_records(sequence_records)

print(seq_study_df)

          Header                                           Sequence
0    AA036|a|01R  (M, R, V, R, G, T, R, M, N, W, P, N, L, W, -, ...
1    AA037|a|WG9  (M, K, V, K, G, T, R, M, I, W, P, D, L, W, -, ...
2    AA034|a|wg2  (-, R, V, M, G, T, Q, M, N, W, P, N, L, W, -, ...
3    AA035|a|02R  (M, R, V, K, G, T, Q, R, N, W, P, N, W, W, -, ...
4     AA032|a|02  (M, R, V, R, E, T, Q, M, N, W, P, N, L, W, -, ...
..           ...                                                ...
104  AA055|a|WG4  (M, R, V, K, G, T, Q, M, T, W, P, N, W, W, -, ...
105  AA056|a|WG6  (M, R, V, K, E, T, Q, M, N, W, P, N, L, W, -, ...
106  AA057|a|08R  (M, R, V, K, E, T, Q, R, N, W, P, N, L, W, -, ...
107  AA058|a|04R  (M, R, V, K, G, T, Q, M, N, W, P, N, L, W, -, ...
108  AA059|a|WG9  (M, R, V, K, E, T, Q, M, N, W, P, N, L, W, -, ...

[109 rows x 2 columns]


In [None]:
usmhrp_rv144_study_env_seq = seq_study_df["Sequence"].astype(str).fillna("")  # Ensure sequences are clean

USMHRP_RV144_study_seq_entropy = {}  # Dictionary for storing entropy values

for pos in usmhrp_rv144_env_map['hxb2Pos']:
    aaChars = []  # Initialize a list for storing amino acids at position "pos"
    
    for seq in usmhrp_rv144_study_env_seq:
        # Adjust for 1-based indexing if necessary
        if pos - 1 < len(seq):  
            aaChars.append(seq[pos - 1])  

    # Compute entropy and handle -0.0 cases
    entropy_value = shannon_entropy(aaChars)  
    USMHRP_RV144_study_seq_entropy[pos] = 0.0 if entropy_value == -0.0 else entropy_value  

# Convert dictionary values to a list and extract first 10 entropy values
USMHRP_RV144_study_seq_entropy_10 = list(USMHRP_RV144_study_seq_entropy.values())[:10]
print(USMHRP_RV144_study_seq_entropy_10)
print(len(USMHRP_RV144_study_seq_entropy)) 

[0.07526826758743452, 0.4111664900021256, 0.07526826758743452, 1.015523908491386, 1.0144099964540727, 0.26859376366582177, 0.5306119029255556, 0.9054605133084987, 1.0952981562962443, 0.07526826758743452]
807


In [None]:
#This code calculates the Shannon entropy of the USMHRP RV144 study sequence data

#usmhrp_rv144_study_env_seq = seq_study_df["Sequence"]

#USMHRP_RV144_study_seq_entropy = {} # A list of positions to help the RV144 env map iterable


#for pos in usmhrp_rv144_env_map['hxb2Pos']:
#    aaChars = [] #initialize a list for storing our characters as position "pos"
  #  for seq in usmhrp_rv144_study_env_seq: 
  #     if pos < len(seq): #Ensures the position is within the sequence length
  #       aaChars.append(seq[pos])
    #USMHRP_RV144_study_seq_entropy[pos] = shannon_entropy (aaChars)


#    entropy_value = shannon_entropy(aaChars)  
#    USMHRP_RV144_study_seq_entropy[pos] = 0.0 if entropy_value == -0.0 else entropy_value  # Convert -0.0 to 0.0


#This converts dictionary values to a list and takes the first 10 entropy values
#USMHRP_RV144_study_seq_entropy_10 = list(USMHRP_RV144_study_seq_entropy.values())[:10]
#print(USMHRP_RV144_study_seq_entropy_10)
#print(len(USMHRP_RV144_study_seq_entropy)) #807 is the length


In [None]:
#Creates a dataframe with filtered results for the study sequences
#filler = ['Filler']
#filler.fillna("Filler", inplace=True)


filtered_rv144_study = pd.DataFrame({'Study Sequences Entropy': USMHRP_RV144_study_seq_entropy})
#filtered_rv144_study['Filler'] = filler

print(filtered_rv144_study)
print(filtered_rv144_study[:10])


     Study Sequences Entropy
1                   0.075268
2                   0.411166
3                   0.075268
4                   1.015524
5                   1.014410
..                       ...
853                 0.000000
854                 0.000000
855                 0.000000
856                 0.000000
857                 0.000000

[807 rows x 1 columns]
    Study Sequences Entropy
1                  0.075268
2                  0.411166
3                  0.075268
4                  1.015524
5                  1.014410
6                  0.268594
7                  0.530612
8                  0.905461
9                  1.095298
10                 0.075268


# Subtraction of Position-Wise Entropy
Between RV144 study sequences and RV144 reference sequences

In [None]:


difference_entropy = filtered_rv144_study["Study Sequences Entropy"].values - filtered_rv144_reference['Reference Sequences Entropy'].values

join_dif_entropy = pd.DataFrame({"Difference in Entropy": difference_entropy})


join_dif_entropy["hxb2Pos"] = usmhrp_rv144_env_map["hxb2Pos"].values
print(join_dif_entropy)

     Difference in Entropy  hxb2Pos
0                 0.005739        1
1                -0.071129        2
2                -0.135574        3
3                -0.225571        4
4                 1.014410        5
..                     ...      ...
802               0.000000      853
803              -0.398978      854
804               0.000000      855
805               0.000000      856
806               0.000000      857

[807 rows x 2 columns]


# Dataframe for Check

This dataframe has the hxb2Pos, reference sequence entropies, study sequence entropies, and the entropy difference

In [None]:

#filtered_rv144_study["hxb2Pos"] = usmhrp_rv144_env_map["hxb2Pos"].values
filtered_rv144_study["hxb2Pos"] = usmhrp_rv144_env_map["hxb2Pos"].astype(int).values


print(filtered_rv144_study)




     Study Sequences Entropy  hxb2Pos
1                   0.075268        1
2                   0.411166        2
3                   0.075268        3
4                   1.015524        4
5                   1.014410        5
..                       ...      ...
853                 0.000000      853
854                 0.000000      854
855                 0.000000      855
856                 0.000000      856
857                 0.000000      857

[807 rows x 2 columns]


In [None]:
filtered_rv144_reference["hxb2Pos"] = usmhrp_rv144_env_map_cleaned["hxb2Pos"].values
print(filtered_rv144_reference)

     Reference Sequences Entropy  hxb2Pos
0                       0.069530        1
1                       0.482296        2
2                       0.210842        3
3                       1.241095        4
4                       0.000000        5
..                           ...      ...
802                     0.000000      853
803                     0.398978      854
804                     0.000000      855
805                     0.000000      856
806                     0.000000      857

[807 rows x 2 columns]


# Converts file to a csv file

In [None]:


hxb_dif_entropy = filtered_rv144_reference.join(filtered_rv144_study.set_index('hxb2Pos'), on='hxb2Pos').join(join_dif_entropy.set_index('hxb2Pos'), on='hxb2Pos')
hxb_dif_entropy

#hxb_dif_entropy.to_csv("hxb2Pos_Entropy_Differences.csv", index=False)


Unnamed: 0,Reference Sequences Entropy,hxb2Pos,Study Sequences Entropy,Difference in Entropy
0,0.069530,1,0.075268,0.005739
1,0.482296,2,0.411166,-0.071129
2,0.210842,3,0.075268,-0.135574
3,1.241095,4,1.015524,-0.225571
4,0.000000,5,1.014410,1.014410
...,...,...,...,...
802,0.000000,853,0.000000,0.000000
803,0.398978,854,0.000000,-0.398978
804,0.000000,855,0.000000,0.000000
805,0.000000,856,0.000000,0.000000
