## Loading the full sequence data, transforming it to wide format, and one-hot encoding it

In [1]:
import pandas as pd
import numpy as np
import os

##### Functions for one-hot encoding sequence files and filtering away uninformative sites

In [2]:
def one_hot_encode(df):
    """One-hot encode the amino-acid columns of ``df``; pass ``*_mut`` columns through.

    Columns whose name ends in ``_mut`` are copied unchanged. Every other
    column is expanded into 20 indicator columns named ``<column>_<aa>``, one
    per amino-acid letter listed below, always in that fixed order. A value
    that is not one of those 20 letters gets no indicator column, so the row
    is all zeros for that source column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string column names; non-``_mut`` columns hold
        single-letter residues.

    Returns
    -------
    pandas.DataFrame
        Encoded frame on the same index, with column groups in the order of
        ``df.columns``. Indicator columns use an integer dtype. A frame with
        no columns yields an empty frame on ``df``'s index.
    """
    amino_acids = ("A", "R", "N", "D", "C", "Q", "E", "G", "H", "I",
                   "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")

    # Pieces to concatenate side by side, in input column order
    encoded_dfs = []

    for col_name in df.columns:
        if col_name.endswith("_mut"):
            # Mutation flag columns are already numeric: copy them as-is
            encoded_dfs.append(df[col_name])
            continue

        new_col_names = [f"{col_name}_{aa}" for aa in amino_acids]

        # dtype=int keeps the indicators numeric regardless of the pandas
        # version (newer versions default to bool), so the observed columns and
        # the zero-filled columns added by reindex share one dtype.
        encoded_cols = pd.get_dummies(df[col_name], prefix=col_name, dtype=int)
        encoded_dfs.append(encoded_cols.reindex(columns=new_col_names, fill_value=0))

    # pd.concat raises on an empty list, so handle a column-less input here
    if not encoded_dfs:
        return pd.DataFrame(index=df.index)

    return pd.concat(encoded_dfs, axis=1)

In [3]:
def filter_uninformative_sites(encoded_df):
    """Return only the columns of ``encoded_df`` that contain both a 0 and a 1.

    A column is kept when at least one of its entries equals 0 and at least
    one equals 1; every other column is left out. Rows are not touched.
    """
    contains_zero = encoded_df.eq(0).any()
    contains_one = encoded_df.eq(1).any()

    # Per-column flag: True where both values were observed
    varying = contains_zero & contains_one

    return encoded_df[encoded_df.columns[varying]]

### PB2

In [32]:
# Load the PB2 branch table (tab-separated; the column names are supplied here)
pb2_fullseqs = pd.read_csv("/Users/kman/Desktop/ancestral_reconstruction_project_final/results/anclib/pb2/pb2_branchinfo_aa.txt", sep="\t",
                                    names=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from", "seq_to", "branch_type"])

# Convert 'branch_len' column to numeric; entries that cannot be parsed become NaN
pb2_fullseqs['branch_len'] = pd.to_numeric(pb2_fullseqs['branch_len'], errors='coerce')
# Keep rows where 'branch_len' <= 15 (NaN fails the comparison, so those rows are dropped)
pb2_fullseqs_filtered = pb2_fullseqs[pb2_fullseqs['branch_len'] <= 15]


## Make wide tables of positions and amino acids in sequences from and to
# Column layout shared by both wide tables: Trait, then pos1..pos760
columns = ["Trait"] + ["pos"+str(i) for i in range(1, 761)]

# "to" side: add Trait ("<trait_from>-<trait_to>") and keep only Trait, seq_to and branch_type
# NOTE(review): .iloc[1:] discards the first row that passed the filter - confirm this is intended
pb2_fullseqs_trait_to = pb2_fullseqs_filtered.assign(Trait=pb2_fullseqs_filtered.trait_from + "-" + pb2_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from"])\
                                                  .iloc[1:]
# One row per branch: the Trait label followed by one character of seq_to per position
data = [[trait, *seq] for trait, seq in zip(pb2_fullseqs_trait_to.Trait, pb2_fullseqs_trait_to.seq_to)]
pb2_fullseqs_trait_to_wide = pd.DataFrame(data=data, columns=columns)

pb2_aa_columns_to = pb2_fullseqs_trait_to_wide.iloc[:, 1:]

# "from" side: same construction, using seq_from instead of seq_to
pb2_fullseqs_trait_from = pb2_fullseqs_filtered.assign(Trait=pb2_fullseqs_filtered.trait_from + "-" + pb2_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_to"])\
                                                  .iloc[1:]
data = [[trait, *seq] for trait, seq in zip(pb2_fullseqs_trait_from.Trait, pb2_fullseqs_trait_from.seq_from)]
pb2_fullseqs_trait_from_wide = pd.DataFrame(data=data, columns=columns)

pb2_aa_columns_from = pb2_fullseqs_trait_from_wide.iloc[:, 1:]


# Mutation flags: 1 where the "from" and "to" residues differ at a position, else 0
mutation_columns_pb2 = [name + "_mut" for name in columns[1:]]
pb2_fullseq_wide_mut = pd.concat([pb2_fullseqs_trait_to_wide, (pb2_aa_columns_from != pb2_aa_columns_to).astype(int)], axis=1)

# Final layout: Trait, pos1..pos760, pos1_mut..pos760_mut
pb2_fullseq_wide_mut.columns = columns + mutation_columns_pb2
pb2_aa_columns_mut = pb2_fullseq_wide_mut.iloc[:, 1:]

##### one hot encode

In [26]:
# One-hot encode the residue columns; one_hot_encode copies the *_mut columns through as-is
pb2_aa_onehot = one_hot_encode(pb2_aa_columns_mut)

# Re-attach the textual Trait label in front of the encoded columns
pb2_aa_onehot_traits = pd.concat([pb2_fullseq_wide_mut["Trait"], pb2_aa_onehot], axis="columns")

# Append Trait_num (integer code for the four listed Trait values), then drop the text label
pb2_aa_onehot_traits2 = pb2_aa_onehot_traits.assign(
    Trait_num=pb2_aa_onehot_traits["Trait"].replace(
        {"human-human": 0, "swine-swine": 1, "human-swine": 2, "swine-human": 3}
    )
).drop("Trait", axis=1)

# Reorder so Trait_num (currently the last column) comes first
pb2_onehot = pb2_aa_onehot_traits2[["Trait_num", *pb2_aa_onehot_traits2.columns[:-1]]]

##### Filter away uninformative sites

In [30]:
# filter_uninformative_sites keeps only columns containing both a 0 and a 1.
# Trait_num is the label, not a site, so it is kept unconditionally and only
# the remaining columns are filtered; otherwise the label would be dropped
# whenever code 0 or code 1 happens to be absent from the data.
pb2_onehot_filtered = pd.concat(
    [pb2_onehot[["Trait_num"]],
     filter_uninformative_sites(pb2_onehot.drop(columns=["Trait_num"]))],
    axis=1,
)

##### Save as csv

In [31]:
# Write the filtered PB2 table to pb2/pb2_onehot_filtered.csv (relative to the working directory)
folder_name = "pb2"
# exist_ok=True creates the folder when missing and does nothing when it already exists
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, "pb2_onehot_filtered.csv")
pb2_onehot_filtered.to_csv(file_path, index=False)

### PB1

In [34]:
# Load the PB1 branch table (tab-separated; the column names are supplied here)
pb1_fullseqs = pd.read_csv("/Users/kman/Desktop/ancestral_reconstruction_project_final/results/anclib/pb1/pb1_branchinfo_aa.txt", sep="\t",
                                    names=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from", "seq_to", "branch_type"])

# Convert 'branch_len' column to numeric; entries that cannot be parsed become NaN
pb1_fullseqs['branch_len'] = pd.to_numeric(pb1_fullseqs['branch_len'], errors='coerce')
# Keep rows where 'branch_len' <= 15 (NaN fails the comparison, so those rows are dropped)
pb1_fullseqs_filtered = pb1_fullseqs[pb1_fullseqs['branch_len'] <= 15]


## Make wide tables of positions and amino acids in sequences from and to
# Column layout shared by both wide tables: Trait, then pos1..pos758
columns = ["Trait"] + ["pos"+str(i) for i in range(1, 759)]

# "to" side: add Trait ("<trait_from>-<trait_to>") and keep only Trait, seq_to and branch_type
# NOTE(review): .iloc[1:] discards the first row that passed the filter - confirm this is intended
pb1_fullseqs_trait_to = pb1_fullseqs_filtered.assign(Trait=pb1_fullseqs_filtered.trait_from + "-" + pb1_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from"])\
                                                  .iloc[1:]
# One row per branch: the Trait label followed by one character of seq_to per position
data = [[trait, *seq] for trait, seq in zip(pb1_fullseqs_trait_to.Trait, pb1_fullseqs_trait_to.seq_to)]
pb1_fullseqs_trait_to_wide = pd.DataFrame(data=data, columns=columns)

pb1_aa_columns_to = pb1_fullseqs_trait_to_wide.iloc[:, 1:]

# "from" side: same construction, using seq_from instead of seq_to
pb1_fullseqs_trait_from = pb1_fullseqs_filtered.assign(Trait=pb1_fullseqs_filtered.trait_from + "-" + pb1_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_to"])\
                                                  .iloc[1:]
data = [[trait, *seq] for trait, seq in zip(pb1_fullseqs_trait_from.Trait, pb1_fullseqs_trait_from.seq_from)]
pb1_fullseqs_trait_from_wide = pd.DataFrame(data=data, columns=columns)

pb1_aa_columns_from = pb1_fullseqs_trait_from_wide.iloc[:, 1:]


# Mutation flags: 1 where the "from" and "to" residues differ at a position, else 0
mutation_columns_pb1 = [name + "_mut" for name in columns[1:]]
pb1_fullseq_wide_mut = pd.concat([pb1_fullseqs_trait_to_wide, (pb1_aa_columns_from != pb1_aa_columns_to).astype(int)], axis=1)

# Final layout: Trait, pos1..pos758, pos1_mut..pos758_mut
pb1_fullseq_wide_mut.columns = columns + mutation_columns_pb1
pb1_aa_columns_mut = pb1_fullseq_wide_mut.iloc[:, 1:]

##### one hot encode

In [35]:
# One-hot encode the residue columns; one_hot_encode copies the *_mut columns through as-is
pb1_aa_onehot = one_hot_encode(pb1_aa_columns_mut)

# Re-attach the textual Trait label in front of the encoded columns
pb1_aa_onehot_traits = pd.concat([pb1_fullseq_wide_mut["Trait"], pb1_aa_onehot], axis="columns")

# Append Trait_num (integer code for the four listed Trait values), then drop the text label
pb1_aa_onehot_traits2 = pb1_aa_onehot_traits.assign(
    Trait_num=pb1_aa_onehot_traits["Trait"].replace(
        {"human-human": 0, "swine-swine": 1, "human-swine": 2, "swine-human": 3}
    )
).drop("Trait", axis=1)

# Reorder so Trait_num (currently the last column) comes first
pb1_onehot = pb1_aa_onehot_traits2[["Trait_num", *pb1_aa_onehot_traits2.columns[:-1]]]

##### Filter away uninformative sites

In [36]:
# filter_uninformative_sites keeps only columns containing both a 0 and a 1.
# Trait_num is the label, not a site, so it is kept unconditionally and only
# the remaining columns are filtered; otherwise the label would be dropped
# whenever code 0 or code 1 happens to be absent from the data.
pb1_onehot_filtered = pd.concat(
    [pb1_onehot[["Trait_num"]],
     filter_uninformative_sites(pb1_onehot.drop(columns=["Trait_num"]))],
    axis=1,
)

##### Save as csv

In [37]:
# Write the filtered PB1 table to pb1/pb1_onehot_filtered.csv (relative to the working directory)
folder_name = "pb1"
# exist_ok=True creates the folder when missing and does nothing when it already exists
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, "pb1_onehot_filtered.csv")
pb1_onehot_filtered.to_csv(file_path, index=False)

### PB1-F2

In [38]:
# Load the PB1-F2 branch table (tab-separated; the column names are supplied here)
pb1f2_fullseqs = pd.read_csv("/Users/kman/Desktop/ancestral_reconstruction_project_final/results/anclib/pb1/pb1-f2_branchinfo_aa.txt", sep="\t",
                                    names=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from", "seq_to", "branch_type"])

# Convert 'branch_len' column to numeric; entries that cannot be parsed become NaN
pb1f2_fullseqs['branch_len'] = pd.to_numeric(pb1f2_fullseqs['branch_len'], errors='coerce')
# Keep rows where 'branch_len' <= 15 (NaN fails the comparison, so those rows are dropped)
pb1f2_fullseqs_filtered = pb1f2_fullseqs[pb1f2_fullseqs['branch_len'] <= 15]


## Make wide tables of positions and amino acids in sequences from and to
# Column layout shared by both wide tables: Trait, then pos1..pos101
columns = ["Trait"] + ["pos"+str(i) for i in range(1, 102)]

# "to" side: add Trait ("<trait_from>-<trait_to>") and keep only Trait, seq_to and branch_type
# NOTE(review): .iloc[1:] discards the first row that passed the filter - confirm this is intended
pb1f2_fullseqs_trait_to = pb1f2_fullseqs_filtered.assign(Trait=pb1f2_fullseqs_filtered.trait_from + "-" + pb1f2_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from"])\
                                                  .iloc[1:]
# One row per branch: the Trait label followed by one character of seq_to per position
data = [[trait, *seq] for trait, seq in zip(pb1f2_fullseqs_trait_to.Trait, pb1f2_fullseqs_trait_to.seq_to)]
pb1f2_fullseqs_trait_to_wide = pd.DataFrame(data=data, columns=columns)

pb1f2_aa_columns_to = pb1f2_fullseqs_trait_to_wide.iloc[:, 1:]

# "from" side: same construction, using seq_from instead of seq_to
pb1f2_fullseqs_trait_from = pb1f2_fullseqs_filtered.assign(Trait=pb1f2_fullseqs_filtered.trait_from + "-" + pb1f2_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_to"])\
                                                  .iloc[1:]
data = [[trait, *seq] for trait, seq in zip(pb1f2_fullseqs_trait_from.Trait, pb1f2_fullseqs_trait_from.seq_from)]
pb1f2_fullseqs_trait_from_wide = pd.DataFrame(data=data, columns=columns)

pb1f2_aa_columns_from = pb1f2_fullseqs_trait_from_wide.iloc[:, 1:]


# Mutation flags: 1 where the "from" and "to" residues differ at a position, else 0
mutation_columns_pb1f2 = [name + "_mut" for name in columns[1:]]
pb1f2_fullseq_wide_mut = pd.concat([pb1f2_fullseqs_trait_to_wide, (pb1f2_aa_columns_from != pb1f2_aa_columns_to).astype(int)], axis=1)

# Final layout: Trait, pos1..pos101, pos1_mut..pos101_mut
pb1f2_fullseq_wide_mut.columns = columns + mutation_columns_pb1f2
pb1f2_aa_columns_mut = pb1f2_fullseq_wide_mut.iloc[:, 1:]

##### one hot encode

In [39]:
# One-hot encode the residue columns; one_hot_encode copies the *_mut columns through as-is
pb1f2_aa_onehot = one_hot_encode(pb1f2_aa_columns_mut)

# Re-attach the textual Trait label in front of the encoded columns
pb1f2_aa_onehot_traits = pd.concat([pb1f2_fullseq_wide_mut["Trait"], pb1f2_aa_onehot], axis="columns")

# Append Trait_num (integer code for the four listed Trait values), then drop the text label
pb1f2_aa_onehot_traits2 = pb1f2_aa_onehot_traits.assign(
    Trait_num=pb1f2_aa_onehot_traits["Trait"].replace(
        {"human-human": 0, "swine-swine": 1, "human-swine": 2, "swine-human": 3}
    )
).drop("Trait", axis=1)

# Reorder so Trait_num (currently the last column) comes first
pb1f2_onehot = pb1f2_aa_onehot_traits2[["Trait_num", *pb1f2_aa_onehot_traits2.columns[:-1]]]

##### Filter away uninformative sites

In [40]:
# filter_uninformative_sites keeps only columns containing both a 0 and a 1.
# Trait_num is the label, not a site, so it is kept unconditionally and only
# the remaining columns are filtered; otherwise the label would be dropped
# whenever code 0 or code 1 happens to be absent from the data.
pb1f2_onehot_filtered = pd.concat(
    [pb1f2_onehot[["Trait_num"]],
     filter_uninformative_sites(pb1f2_onehot.drop(columns=["Trait_num"]))],
    axis=1,
)

##### Save as csv

In [41]:
# Write the filtered PB1-F2 table to pb1/pb1-f2_onehot_filtered.csv (relative to the working directory)
folder_name = "pb1"
# exist_ok=True creates the folder when missing and does nothing when it already exists
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, "pb1-f2_onehot_filtered.csv")
pb1f2_onehot_filtered.to_csv(file_path, index=False)

### PA

In [42]:
# Load the PA branch table (tab-separated; the column names are supplied here)
pa_fullseqs = pd.read_csv("/Users/kman/Desktop/ancestral_reconstruction_project_final/results/anclib/pa/pa_branchinfo_aa.txt", sep="\t",
                                    names=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from", "seq_to", "branch_type"])

# Convert 'branch_len' column to numeric; entries that cannot be parsed become NaN
pa_fullseqs['branch_len'] = pd.to_numeric(pa_fullseqs['branch_len'], errors='coerce')
# Keep rows where 'branch_len' <= 15 (NaN fails the comparison, so those rows are dropped)
pa_fullseqs_filtered = pa_fullseqs[pa_fullseqs['branch_len'] <= 15]


## Make wide tables of positions and amino acids in sequences from and to
# Column layout shared by both wide tables: Trait, then pos1..pos717
columns = ["Trait"] + ["pos"+str(i) for i in range(1, 718)]

# "to" side: add Trait ("<trait_from>-<trait_to>") and keep only Trait, seq_to and branch_type
# NOTE(review): .iloc[1:] discards the first row that passed the filter - confirm this is intended
pa_fullseqs_trait_to = pa_fullseqs_filtered.assign(Trait=pa_fullseqs_filtered.trait_from + "-" + pa_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from"])\
                                                  .iloc[1:]
# One row per branch: the Trait label followed by one character of seq_to per position
data = [[trait, *seq] for trait, seq in zip(pa_fullseqs_trait_to.Trait, pa_fullseqs_trait_to.seq_to)]
pa_fullseqs_trait_to_wide = pd.DataFrame(data=data, columns=columns)

pa_aa_columns_to = pa_fullseqs_trait_to_wide.iloc[:, 1:]

# "from" side: same construction, using seq_from instead of seq_to
pa_fullseqs_trait_from = pa_fullseqs_filtered.assign(Trait=pa_fullseqs_filtered.trait_from + "-" + pa_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_to"])\
                                                  .iloc[1:]
data = [[trait, *seq] for trait, seq in zip(pa_fullseqs_trait_from.Trait, pa_fullseqs_trait_from.seq_from)]
pa_fullseqs_trait_from_wide = pd.DataFrame(data=data, columns=columns)

pa_aa_columns_from = pa_fullseqs_trait_from_wide.iloc[:, 1:]


# Mutation flags: 1 where the "from" and "to" residues differ at a position, else 0
mutation_columns_pa = [name + "_mut" for name in columns[1:]]
pa_fullseq_wide_mut = pd.concat([pa_fullseqs_trait_to_wide, (pa_aa_columns_from != pa_aa_columns_to).astype(int)], axis=1)

# Final layout: Trait, pos1..pos717, pos1_mut..pos717_mut
pa_fullseq_wide_mut.columns = columns + mutation_columns_pa
pa_aa_columns_mut = pa_fullseq_wide_mut.iloc[:, 1:]

##### one hot encode

In [43]:
# One-hot encode the residue columns; one_hot_encode copies the *_mut columns through as-is
pa_aa_onehot = one_hot_encode(pa_aa_columns_mut)

# Re-attach the textual Trait label in front of the encoded columns
pa_aa_onehot_traits = pd.concat([pa_fullseq_wide_mut["Trait"], pa_aa_onehot], axis="columns")

# Append Trait_num (integer code for the four listed Trait values), then drop the text label
pa_aa_onehot_traits2 = pa_aa_onehot_traits.assign(
    Trait_num=pa_aa_onehot_traits["Trait"].replace(
        {"human-human": 0, "swine-swine": 1, "human-swine": 2, "swine-human": 3}
    )
).drop("Trait", axis=1)

# Reorder so Trait_num (currently the last column) comes first
pa_onehot = pa_aa_onehot_traits2[["Trait_num", *pa_aa_onehot_traits2.columns[:-1]]]

##### Filter away uninformative sites

In [44]:
# filter_uninformative_sites keeps only columns containing both a 0 and a 1.
# Trait_num is the label, not a site, so it is kept unconditionally and only
# the remaining columns are filtered; otherwise the label would be dropped
# whenever code 0 or code 1 happens to be absent from the data.
pa_onehot_filtered = pd.concat(
    [pa_onehot[["Trait_num"]],
     filter_uninformative_sites(pa_onehot.drop(columns=["Trait_num"]))],
    axis=1,
)

##### Save as csv

In [45]:
# Write the filtered PA table to pa/pa_onehot_filtered.csv (relative to the working directory)
folder_name = "pa"
# exist_ok=True creates the folder when missing and does nothing when it already exists
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, "pa_onehot_filtered.csv")
pa_onehot_filtered.to_csv(file_path, index=False)

### PA-X

In [46]:
# Load the PA-X branch table (tab-separated; the column names are supplied here)
pax_fullseqs = pd.read_csv("/Users/kman/Desktop/ancestral_reconstruction_project_final/results/anclib/pa/pa-x_branchinfo_aa.txt", sep="\t",
                                    names=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from", "seq_to", "branch_type"])

# Convert 'branch_len' column to numeric; entries that cannot be parsed become NaN
pax_fullseqs['branch_len'] = pd.to_numeric(pax_fullseqs['branch_len'], errors='coerce')
# Keep rows where 'branch_len' <= 15 (NaN fails the comparison, so those rows are dropped)
pax_fullseqs_filtered = pax_fullseqs[pax_fullseqs['branch_len'] <= 15]


## Make wide tables of positions and amino acids in sequences from and to
# Column layout shared by both wide tables: Trait, then pos1..pos261
columns = ["Trait"] + ["pos"+str(i) for i in range(1, 262)]

# "to" side: add Trait ("<trait_from>-<trait_to>") and keep only Trait, seq_to and branch_type
# NOTE(review): .iloc[1:] discards the first row that passed the filter - confirm this is intended
pax_fullseqs_trait_to = pax_fullseqs_filtered.assign(Trait=pax_fullseqs_filtered.trait_from + "-" + pax_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from"])\
                                                  .iloc[1:]
# One row per branch: the Trait label followed by one character of seq_to per position
data = [[trait, *seq] for trait, seq in zip(pax_fullseqs_trait_to.Trait, pax_fullseqs_trait_to.seq_to)]
pax_fullseqs_trait_to_wide = pd.DataFrame(data=data, columns=columns)

pax_aa_columns_to = pax_fullseqs_trait_to_wide.iloc[:, 1:]

# "from" side: same construction, using seq_from instead of seq_to
pax_fullseqs_trait_from = pax_fullseqs_filtered.assign(Trait=pax_fullseqs_filtered.trait_from + "-" + pax_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_to"])\
                                                  .iloc[1:]
data = [[trait, *seq] for trait, seq in zip(pax_fullseqs_trait_from.Trait, pax_fullseqs_trait_from.seq_from)]
pax_fullseqs_trait_from_wide = pd.DataFrame(data=data, columns=columns)

pax_aa_columns_from = pax_fullseqs_trait_from_wide.iloc[:, 1:]


# Mutation flags: 1 where the "from" and "to" residues differ at a position, else 0
mutation_columns_pax = [name + "_mut" for name in columns[1:]]
pax_fullseq_wide_mut = pd.concat([pax_fullseqs_trait_to_wide, (pax_aa_columns_from != pax_aa_columns_to).astype(int)], axis=1)

# Final layout: Trait, pos1..pos261, pos1_mut..pos261_mut
pax_fullseq_wide_mut.columns = columns + mutation_columns_pax
pax_aa_columns_mut = pax_fullseq_wide_mut.iloc[:, 1:]

##### one hot encode

In [47]:
# One-hot encode the residue columns; one_hot_encode copies the *_mut columns through as-is
pax_aa_onehot = one_hot_encode(pax_aa_columns_mut)

# Re-attach the textual Trait label in front of the encoded columns
pax_aa_onehot_traits = pd.concat([pax_fullseq_wide_mut["Trait"], pax_aa_onehot], axis="columns")

# Append Trait_num (integer code for the four listed Trait values), then drop the text label
pax_aa_onehot_traits2 = pax_aa_onehot_traits.assign(
    Trait_num=pax_aa_onehot_traits["Trait"].replace(
        {"human-human": 0, "swine-swine": 1, "human-swine": 2, "swine-human": 3}
    )
).drop("Trait", axis=1)

# Reorder so Trait_num (currently the last column) comes first
pax_onehot = pax_aa_onehot_traits2[["Trait_num", *pax_aa_onehot_traits2.columns[:-1]]]

##### Filter away uninformative sites

In [48]:
# filter_uninformative_sites keeps only columns containing both a 0 and a 1.
# Trait_num is the label, not a site, so it is kept unconditionally and only
# the remaining columns are filtered; otherwise the label would be dropped
# whenever code 0 or code 1 happens to be absent from the data.
pax_onehot_filtered = pd.concat(
    [pax_onehot[["Trait_num"]],
     filter_uninformative_sites(pax_onehot.drop(columns=["Trait_num"]))],
    axis=1,
)

##### Save as csv

In [49]:
# Write the filtered PA-X table to pa/pa-x_onehot_filtered.csv (relative to the working directory)
folder_name = "pa"
# exist_ok=True creates the folder when missing and does nothing when it already exists
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, "pa-x_onehot_filtered.csv")
pax_onehot_filtered.to_csv(file_path, index=False)

### H1

In [51]:
# Load the H1 branch table (tab-separated; the column names are supplied here)
h1_fullseqs = pd.read_csv("/Users/kman/Desktop/ancestral_reconstruction_project_final/results/anclib/ha/h1_branchinfo_aa.txt", sep="\t",
                                    names=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from", "seq_to", "branch_type"])

# Convert 'branch_len' column to numeric; entries that cannot be parsed become NaN
h1_fullseqs['branch_len'] = pd.to_numeric(h1_fullseqs['branch_len'], errors='coerce')
# Keep rows where 'branch_len' <= 15 (NaN fails the comparison, so those rows are dropped)
h1_fullseqs_filtered = h1_fullseqs[h1_fullseqs['branch_len'] <= 15]


## Make wide tables of positions and amino acids in sequences from and to
# Column layout shared by both wide tables: Trait, then pos1..pos566
columns = ["Trait"] + ["pos"+str(i) for i in range(1, 567)]

# "to" side: add Trait ("<trait_from>-<trait_to>") and keep only Trait, seq_to and branch_type
# NOTE(review): .iloc[1:] discards the first row that passed the filter - confirm this is intended
h1_fullseqs_trait_to = h1_fullseqs_filtered.assign(Trait=h1_fullseqs_filtered.trait_from + "-" + h1_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from"])\
                                                  .iloc[1:]
# One row per branch: the Trait label followed by one character of seq_to per position
data = [[trait, *seq] for trait, seq in zip(h1_fullseqs_trait_to.Trait, h1_fullseqs_trait_to.seq_to)]
h1_fullseqs_trait_to_wide = pd.DataFrame(data=data, columns=columns)

h1_aa_columns_to = h1_fullseqs_trait_to_wide.iloc[:, 1:]

# "from" side: same construction, using seq_from instead of seq_to
h1_fullseqs_trait_from = h1_fullseqs_filtered.assign(Trait=h1_fullseqs_filtered.trait_from + "-" + h1_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_to"])\
                                                  .iloc[1:]
data = [[trait, *seq] for trait, seq in zip(h1_fullseqs_trait_from.Trait, h1_fullseqs_trait_from.seq_from)]
h1_fullseqs_trait_from_wide = pd.DataFrame(data=data, columns=columns)

h1_aa_columns_from = h1_fullseqs_trait_from_wide.iloc[:, 1:]


# Mutation flags: 1 where the "from" and "to" residues differ at a position, else 0
mutation_columns_h1 = [name + "_mut" for name in columns[1:]]
h1_fullseq_wide_mut = pd.concat([h1_fullseqs_trait_to_wide, (h1_aa_columns_from != h1_aa_columns_to).astype(int)], axis=1)

# Final layout: Trait, pos1..pos566, pos1_mut..pos566_mut
h1_fullseq_wide_mut.columns = columns + mutation_columns_h1
h1_aa_columns_mut = h1_fullseq_wide_mut.iloc[:, 1:]

##### one hot encode

In [52]:
# One-hot encode the residue columns; one_hot_encode copies the *_mut columns through as-is
h1_aa_onehot = one_hot_encode(h1_aa_columns_mut)

# Re-attach the textual Trait label in front of the encoded columns
h1_aa_onehot_traits = pd.concat([h1_fullseq_wide_mut["Trait"], h1_aa_onehot], axis="columns")

# Append Trait_num (integer code for the four listed Trait values), then drop the text label
h1_aa_onehot_traits2 = h1_aa_onehot_traits.assign(
    Trait_num=h1_aa_onehot_traits["Trait"].replace(
        {"human-human": 0, "swine-swine": 1, "human-swine": 2, "swine-human": 3}
    )
).drop("Trait", axis=1)

# Reorder so Trait_num (currently the last column) comes first
h1_onehot = h1_aa_onehot_traits2[["Trait_num", *h1_aa_onehot_traits2.columns[:-1]]]

##### Filter away uninformative sites

In [53]:
# filter_uninformative_sites keeps only columns containing both a 0 and a 1.
# Trait_num is the label, not a site, so it is kept unconditionally and only
# the remaining columns are filtered; otherwise the label would be dropped
# whenever code 0 or code 1 happens to be absent from the data.
h1_onehot_filtered = pd.concat(
    [h1_onehot[["Trait_num"]],
     filter_uninformative_sites(h1_onehot.drop(columns=["Trait_num"]))],
    axis=1,
)

##### Save as csv

In [54]:
# Write the filtered H1 table to ha/h1_onehot_filtered.csv (relative to the working directory)
folder_name = "ha"
# exist_ok=True creates the folder when missing and does nothing when it already exists
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, "h1_onehot_filtered.csv")
h1_onehot_filtered.to_csv(file_path, index=False)

### H3

In [55]:
# Load the H3 branch table (tab-separated; the column names are supplied here)
h3_fullseqs = pd.read_csv("/Users/kman/Desktop/ancestral_reconstruction_project_final/results/anclib/ha/h3_branchinfo_aa.txt", sep="\t",
                                    names=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from", "seq_to", "branch_type"])

# Convert 'branch_len' column to numeric; entries that cannot be parsed become NaN
h3_fullseqs['branch_len'] = pd.to_numeric(h3_fullseqs['branch_len'], errors='coerce')
# Keep rows where 'branch_len' <= 15 (NaN fails the comparison, so those rows are dropped)
h3_fullseqs_filtered = h3_fullseqs[h3_fullseqs['branch_len'] <= 15]


## Make wide tables of positions and amino acids in sequences from and to
# Column layout shared by both wide tables: Trait, then pos1..pos567
columns = ["Trait"] + ["pos"+str(i) for i in range(1, 568)]

# "to" side: add Trait ("<trait_from>-<trait_to>") and keep only Trait, seq_to and branch_type
# NOTE(review): .iloc[1:] discards the first row that passed the filter - confirm this is intended
h3_fullseqs_trait_to = h3_fullseqs_filtered.assign(Trait=h3_fullseqs_filtered.trait_from + "-" + h3_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from"])\
                                                  .iloc[1:]
# One row per branch: the Trait label followed by one character of seq_to per position
data = [[trait, *seq] for trait, seq in zip(h3_fullseqs_trait_to.Trait, h3_fullseqs_trait_to.seq_to)]
h3_fullseqs_trait_to_wide = pd.DataFrame(data=data, columns=columns)

h3_aa_columns_to = h3_fullseqs_trait_to_wide.iloc[:, 1:]

# "from" side: same construction, using seq_from instead of seq_to
h3_fullseqs_trait_from = h3_fullseqs_filtered.assign(Trait=h3_fullseqs_filtered.trait_from + "-" + h3_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_to"])\
                                                  .iloc[1:]
data = [[trait, *seq] for trait, seq in zip(h3_fullseqs_trait_from.Trait, h3_fullseqs_trait_from.seq_from)]
h3_fullseqs_trait_from_wide = pd.DataFrame(data=data, columns=columns)

h3_aa_columns_from = h3_fullseqs_trait_from_wide.iloc[:, 1:]


# Mutation flags: 1 where the "from" and "to" residues differ at a position, else 0
mutation_columns_h3 = [name + "_mut" for name in columns[1:]]
h3_fullseq_wide_mut = pd.concat([h3_fullseqs_trait_to_wide, (h3_aa_columns_from != h3_aa_columns_to).astype(int)], axis=1)

# Final layout: Trait, pos1..pos567, pos1_mut..pos567_mut
h3_fullseq_wide_mut.columns = columns + mutation_columns_h3
h3_aa_columns_mut = h3_fullseq_wide_mut.iloc[:, 1:]

##### one hot encode

In [56]:
# One-hot encode the residue columns; one_hot_encode copies the *_mut columns through as-is
h3_aa_onehot = one_hot_encode(h3_aa_columns_mut)

# Re-attach the textual Trait label in front of the encoded columns
h3_aa_onehot_traits = pd.concat([h3_fullseq_wide_mut["Trait"], h3_aa_onehot], axis="columns")

# Append Trait_num (integer code for the four listed Trait values), then drop the text label
h3_aa_onehot_traits2 = h3_aa_onehot_traits.assign(
    Trait_num=h3_aa_onehot_traits["Trait"].replace(
        {"human-human": 0, "swine-swine": 1, "human-swine": 2, "swine-human": 3}
    )
).drop("Trait", axis=1)

# Reorder so Trait_num (currently the last column) comes first
h3_onehot = h3_aa_onehot_traits2[["Trait_num", *h3_aa_onehot_traits2.columns[:-1]]]

##### Filter away uninformative sites

In [57]:
# filter_uninformative_sites keeps only columns containing both a 0 and a 1.
# Trait_num is the label, not a site, so it is kept unconditionally and only
# the remaining columns are filtered; otherwise the label would be dropped
# whenever code 0 or code 1 happens to be absent from the data.
h3_onehot_filtered = pd.concat(
    [h3_onehot[["Trait_num"]],
     filter_uninformative_sites(h3_onehot.drop(columns=["Trait_num"]))],
    axis=1,
)

##### Save as csv

In [58]:
# Write the filtered H3 table to ha/h3_onehot_filtered.csv (relative to the working directory)
folder_name = "ha"
# exist_ok=True creates the folder when missing and does nothing when it already exists
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, "h3_onehot_filtered.csv")
h3_onehot_filtered.to_csv(file_path, index=False)

### NP

In [59]:
# Load the NP branch table (tab-separated; the column names are supplied here)
np_fullseqs = pd.read_csv("/Users/kman/Desktop/ancestral_reconstruction_project_final/results/anclib/np/np_branchinfo_aa.txt", sep="\t",
                                    names=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from", "seq_to", "branch_type"])

# Convert 'branch_len' column to numeric; entries that cannot be parsed become NaN
np_fullseqs['branch_len'] = pd.to_numeric(np_fullseqs['branch_len'], errors='coerce')
# Keep rows where 'branch_len' <= 15 (NaN fails the comparison, so those rows are dropped)
np_fullseqs_filtered = np_fullseqs[np_fullseqs['branch_len'] <= 15]


## Make wide tables of positions and amino acids in sequences from and to
# Column layout shared by both wide tables: Trait, then pos1..pos499
columns = ["Trait"] + ["pos"+str(i) for i in range(1, 500)]

# "to" side: add Trait ("<trait_from>-<trait_to>") and keep only Trait, seq_to and branch_type
# NOTE(review): .iloc[1:] discards the first row that passed the filter - confirm this is intended
np_fullseqs_trait_to = np_fullseqs_filtered.assign(Trait=np_fullseqs_filtered.trait_from + "-" + np_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from"])\
                                                  .iloc[1:]
# One row per branch: the Trait label followed by one character of seq_to per position
data = [[trait, *seq] for trait, seq in zip(np_fullseqs_trait_to.Trait, np_fullseqs_trait_to.seq_to)]
np_fullseqs_trait_to_wide = pd.DataFrame(data=data, columns=columns)

np_aa_columns_to = np_fullseqs_trait_to_wide.iloc[:, 1:]

# "from" side: same construction, using seq_from instead of seq_to
np_fullseqs_trait_from = np_fullseqs_filtered.assign(Trait=np_fullseqs_filtered.trait_from + "-" + np_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_to"])\
                                                  .iloc[1:]
data = [[trait, *seq] for trait, seq in zip(np_fullseqs_trait_from.Trait, np_fullseqs_trait_from.seq_from)]
np_fullseqs_trait_from_wide = pd.DataFrame(data=data, columns=columns)

np_aa_columns_from = np_fullseqs_trait_from_wide.iloc[:, 1:]


# Mutation flags: 1 where the "from" and "to" residues differ at a position, else 0
mutation_columns_np = [name + "_mut" for name in columns[1:]]
np_fullseq_wide_mut = pd.concat([np_fullseqs_trait_to_wide, (np_aa_columns_from != np_aa_columns_to).astype(int)], axis=1)

# Final layout: Trait, pos1..pos499, pos1_mut..pos499_mut
np_fullseq_wide_mut.columns = columns + mutation_columns_np
np_aa_columns_mut = np_fullseq_wide_mut.iloc[:, 1:]

##### one hot encode

In [60]:
# One-hot encode the residue columns; one_hot_encode copies the *_mut columns through as-is
np_aa_onehot = one_hot_encode(np_aa_columns_mut)

# Re-attach the textual Trait label in front of the encoded columns
np_aa_onehot_traits = pd.concat([np_fullseq_wide_mut["Trait"], np_aa_onehot], axis="columns")

# Append Trait_num (integer code for the four listed Trait values), then drop the text label
np_aa_onehot_traits2 = np_aa_onehot_traits.assign(
    Trait_num=np_aa_onehot_traits["Trait"].replace(
        {"human-human": 0, "swine-swine": 1, "human-swine": 2, "swine-human": 3}
    )
).drop("Trait", axis=1)

# Reorder so Trait_num (currently the last column) comes first
np_onehot = np_aa_onehot_traits2[["Trait_num", *np_aa_onehot_traits2.columns[:-1]]]

##### Filter away uninformative sites

In [61]:
# filter_uninformative_sites keeps only columns containing both a 0 and a 1.
# Trait_num is the label, not a site, so it is kept unconditionally and only
# the remaining columns are filtered; otherwise the label would be dropped
# whenever code 0 or code 1 happens to be absent from the data.
np_onehot_filtered = pd.concat(
    [np_onehot[["Trait_num"]],
     filter_uninformative_sites(np_onehot.drop(columns=["Trait_num"]))],
    axis=1,
)

##### Save as csv

In [62]:
# Write the filtered NP table to np/np_onehot_filtered.csv (relative to the working directory)
folder_name = "np"
# exist_ok=True creates the folder when missing and does nothing when it already exists
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, "np_onehot_filtered.csv")
np_onehot_filtered.to_csv(file_path, index=False)

### N1

In [63]:
# loading data
n1_fullseqs = pd.read_csv("/Users/kman/Desktop/ancestral_reconstruction_project_final/results/anclib/na/n1_branchinfo_aa.txt", sep="\t",
                                    names=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from", "seq_to", "branch_type"])

# Convert 'branch_len' column to numeric
n1_fullseqs['branch_len'] = pd.to_numeric(n1_fullseqs['branch_len'], errors='coerce')
# Keep rows where 'branch_len' <= 15
n1_fullseqs_filtered = n1_fullseqs[n1_fullseqs['branch_len'] <= 15]


## Make wide tables of positions and amino acids in sequences from and to 
#add Trait column to seq_to file
n1_fullseqs_trait_to = n1_fullseqs_filtered.assign(Trait=n1_fullseqs_filtered.trait_from + "-" + n1_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from"])\
                                                  .iloc[1:]
# Convert to wide format
n1_fullseqs_trait_to_wide = pd.DataFrame(columns=["Trait"]+["pos"+str(i) for i in range(1, 471)])

data = []
for i in range(len(n1_fullseqs_trait_to)):
    seq_str = n1_fullseqs_trait_to.iloc[i].seq_to
    seq_vec = list(seq_str)
    row_data = [n1_fullseqs_trait_to.iloc[i].Trait] + seq_vec
    data.append(row_data)

columns = ["Trait"] + ["pos"+str(i) for i in range(1, 471)]
n1_fullseqs_trait_to_wide = pd.DataFrame(data=data, columns=columns)

n1_aa_columns_to = n1_fullseqs_trait_to_wide.iloc[:, 1:]

#add Trait column to seq_from file
n1_fullseqs_trait_from = n1_fullseqs_filtered.assign(Trait=n1_fullseqs_filtered.trait_from + "-" + n1_fullseqs_filtered.trait_to)\
                                                  .drop(columns=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_to"])\
                                                  .iloc[1:]

# Convert to wide format
n1_fullseqs_trait_from_wide = pd.DataFrame(columns=["Trait"]+["pos"+str(i) for i in range(1, 471)])

data = []
for i in range(len(n1_fullseqs_trait_from)):
    seq_str = n1_fullseqs_trait_from.iloc[i].seq_from
    seq_vec = list(seq_str)
    row_data = [n1_fullseqs_trait_from.iloc[i].Trait] + seq_vec
    data.append(row_data)

columns = ["Trait"] + ["pos"+str(i) for i in range(1, 471)]
n1_fullseqs_trait_from_wide = pd.DataFrame(data=data, columns=columns)

n1_aa_columns_from = n1_fullseqs_trait_from_wide.iloc[:, 1:]


# Create a new DataFrame for mutation information
n1_fullseq_wide_mut = pd.concat([n1_fullseqs_trait_to_wide, (n1_fullseqs_trait_from_wide.iloc[:, 1:] != n1_fullseqs_trait_to_wide.iloc[:, 1:]).astype(int)], axis=1)

# Rename the mutation columns
mutation_columns_n1 = [f'pos{i}_mut' for i in range(1, 471)]
n1_fullseq_wide_mut.columns = list(n1_fullseqs_trait_to_wide.columns) + mutation_columns_n1
n1_aa_columns_mut = n1_fullseq_wide_mut.iloc[:, 1:]

##### one hot encode

In [64]:
# One-hot encode the residue columns; one_hot_encode copies the *_mut flag columns through as they are.
n1_aa_onehot = one_hot_encode(n1_aa_columns_mut)

# Put the textual branch label next to the encoded features.
n1_aa_onehot_traits = pd.concat([n1_fullseq_wide_mut.Trait, n1_aa_onehot], axis=1)

# Swap the textual label for its integer code.
trait_codes = {"human-human": 0, "swine-swine": 1, "human-swine": 2, "swine-human": 3}
n1_aa_onehot_traits2 = n1_aa_onehot_traits.assign(
    Trait_num=n1_aa_onehot_traits.Trait.replace(trait_codes)
).drop(columns=["Trait"])

# assign() appended Trait_num as the last column; move it to the front.
feature_columns = list(n1_aa_onehot_traits2.columns[:-1])
n1_onehot = n1_aa_onehot_traits2.loc[:, ['Trait_num'] + feature_columns]

##### Filter away uninformative sites

In [65]:
# Keep only the columns that contain both a 0 and a 1 somewhere in the table.
# The Trait_num label column goes through the same test as the feature columns, so it is
# kept only when both label 0 and label 1 occur in the data.
n1_onehot_filtered = filter_uninformative_sites(n1_onehot)

##### Save as csv

In [66]:
# Write the filtered N1 table to na/n1_onehot_filtered.csv (path is relative to the working directory).
folder_name = "na"
# exist_ok=True creates the folder when missing and is a no-op when it is already there,
# which avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, "n1_onehot_filtered.csv")
# index=False: the row index is not written to the file.
n1_onehot_filtered.to_csv(file_path, index=False)

### N2

In [67]:
# loading data
# Column names are passed via `names`, so pandas reads every line of the file as data
# (a header line, if the file has one, ends up as the first row).
n2_fullseqs = pd.read_csv("/Users/kman/Desktop/ancestral_reconstruction_project_final/results/anclib/na/n2_branchinfo_aa.txt", sep="\t",
                                    names=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from", "seq_to", "branch_type"])

# Convert 'branch_len' column to numeric; values that cannot be parsed become NaN
n2_fullseqs['branch_len'] = pd.to_numeric(n2_fullseqs['branch_len'], errors='coerce')
# Keep rows where 'branch_len' <= 15 (NaN never satisfies the comparison, so those rows are dropped as well)
n2_fullseqs_filtered = n2_fullseqs[n2_fullseqs['branch_len'] <= 15]


## Make wide tables of positions and amino acids in sequences from and to
# Number of position columns declared for the wide tables and the mutation flags below
n2_seq_len = 470
columns = ["Trait"] + ["pos"+str(i) for i in range(1, n2_seq_len + 1)]

# Branch label of the form "<trait_from>-<trait_to>"
n2_labelled = n2_fullseqs_filtered.assign(Trait=n2_fullseqs_filtered.trait_from + "-" + n2_fullseqs_filtered.trait_to)
n2_drop_cols = ["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to"]

# NOTE(review): .iloc[1:] discards the first remaining row. If it was meant to remove a header
# line, the branch_len filter above has already dropped that line (its branch_len would be NaN),
# so this would be discarding a real branch -- TODO confirm against the input file.
n2_fullseqs_trait_to = n2_labelled.drop(columns=n2_drop_cols + ["seq_from"]).iloc[1:]
n2_fullseqs_trait_from = n2_labelled.drop(columns=n2_drop_cols + ["seq_to"]).iloc[1:]

# Convert to wide format: one row per branch, the label followed by one character per column
data = [[trait] + list(seq) for trait, seq in zip(n2_fullseqs_trait_to.Trait, n2_fullseqs_trait_to.seq_to)]
n2_fullseqs_trait_to_wide = pd.DataFrame(data=data, columns=columns)
n2_aa_columns_to = n2_fullseqs_trait_to_wide.iloc[:, 1:]

data = [[trait] + list(seq) for trait, seq in zip(n2_fullseqs_trait_from.Trait, n2_fullseqs_trait_from.seq_from)]
n2_fullseqs_trait_from_wide = pd.DataFrame(data=data, columns=columns)
n2_aa_columns_from = n2_fullseqs_trait_from_wide.iloc[:, 1:]


# Create a new DataFrame for mutation information:
# the seq_to table followed by one 0/1 flag per position (1 = residue differs between from and to)
n2_fullseq_wide_mut = pd.concat([n2_fullseqs_trait_to_wide, (n2_aa_columns_from != n2_aa_columns_to).astype(int)], axis=1)

# Rename the mutation columns
mutation_columns_n2 = [f'pos{i}_mut' for i in range(1, n2_seq_len + 1)]
n2_fullseq_wide_mut.columns = list(n2_fullseqs_trait_to_wide.columns) + mutation_columns_n2
n2_aa_columns_mut = n2_fullseq_wide_mut.iloc[:, 1:]

##### one hot encode

In [68]:
# One-hot encode the residue columns; one_hot_encode copies the *_mut flag columns through as they are.
n2_aa_onehot = one_hot_encode(n2_aa_columns_mut)

# Put the textual branch label next to the encoded features.
n2_aa_onehot_traits = pd.concat([n2_fullseq_wide_mut.Trait, n2_aa_onehot], axis=1)

# Swap the textual label for its integer code.
trait_codes = {"human-human": 0, "swine-swine": 1, "human-swine": 2, "swine-human": 3}
n2_aa_onehot_traits2 = n2_aa_onehot_traits.assign(
    Trait_num=n2_aa_onehot_traits.Trait.replace(trait_codes)
).drop(columns=["Trait"])

# assign() appended Trait_num as the last column; move it to the front.
feature_columns = list(n2_aa_onehot_traits2.columns[:-1])
n2_onehot = n2_aa_onehot_traits2.loc[:, ['Trait_num'] + feature_columns]

##### Filter away uninformative sites

In [69]:
# Keep only the columns that contain both a 0 and a 1 somewhere in the table.
# The Trait_num label column goes through the same test as the feature columns, so it is
# kept only when both label 0 and label 1 occur in the data.
n2_onehot_filtered = filter_uninformative_sites(n2_onehot)

##### Save as csv

In [70]:
# Write the filtered N2 table to na/n2_onehot_filtered.csv (path is relative to the working directory).
folder_name = "na"
# exist_ok=True creates the folder when missing and is a no-op when it is already there,
# which avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, "n2_onehot_filtered.csv")
# index=False: the row index is not written to the file.
n2_onehot_filtered.to_csv(file_path, index=False)

### M1

In [71]:
# loading data
# Column names are passed via `names`, so pandas reads every line of the file as data
# (a header line, if the file has one, ends up as the first row).
m1_fullseqs = pd.read_csv("/Users/kman/Desktop/ancestral_reconstruction_project_final/results/anclib/mp/m1_branchinfo_aa.txt", sep="\t",
                                    names=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from", "seq_to", "branch_type"])

# Convert 'branch_len' column to numeric; values that cannot be parsed become NaN
m1_fullseqs['branch_len'] = pd.to_numeric(m1_fullseqs['branch_len'], errors='coerce')
# Keep rows where 'branch_len' <= 15 (NaN never satisfies the comparison, so those rows are dropped as well)
m1_fullseqs_filtered = m1_fullseqs[m1_fullseqs['branch_len'] <= 15]


## Make wide tables of positions and amino acids in sequences from and to
# Number of position columns declared for the wide tables and the mutation flags below
m1_seq_len = 252
columns = ["Trait"] + ["pos"+str(i) for i in range(1, m1_seq_len + 1)]

# Branch label of the form "<trait_from>-<trait_to>"
m1_labelled = m1_fullseqs_filtered.assign(Trait=m1_fullseqs_filtered.trait_from + "-" + m1_fullseqs_filtered.trait_to)
m1_drop_cols = ["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to"]

# NOTE(review): .iloc[1:] discards the first remaining row. If it was meant to remove a header
# line, the branch_len filter above has already dropped that line (its branch_len would be NaN),
# so this would be discarding a real branch -- TODO confirm against the input file.
m1_fullseqs_trait_to = m1_labelled.drop(columns=m1_drop_cols + ["seq_from"]).iloc[1:]
m1_fullseqs_trait_from = m1_labelled.drop(columns=m1_drop_cols + ["seq_to"]).iloc[1:]

# Convert to wide format: one row per branch, the label followed by one character per column
data = [[trait] + list(seq) for trait, seq in zip(m1_fullseqs_trait_to.Trait, m1_fullseqs_trait_to.seq_to)]
m1_fullseqs_trait_to_wide = pd.DataFrame(data=data, columns=columns)
m1_aa_columns_to = m1_fullseqs_trait_to_wide.iloc[:, 1:]

data = [[trait] + list(seq) for trait, seq in zip(m1_fullseqs_trait_from.Trait, m1_fullseqs_trait_from.seq_from)]
m1_fullseqs_trait_from_wide = pd.DataFrame(data=data, columns=columns)
m1_aa_columns_from = m1_fullseqs_trait_from_wide.iloc[:, 1:]


# Create a new DataFrame for mutation information:
# the seq_to table followed by one 0/1 flag per position (1 = residue differs between from and to)
m1_fullseq_wide_mut = pd.concat([m1_fullseqs_trait_to_wide, (m1_aa_columns_from != m1_aa_columns_to).astype(int)], axis=1)

# Rename the mutation columns
mutation_columns_m1 = [f'pos{i}_mut' for i in range(1, m1_seq_len + 1)]
m1_fullseq_wide_mut.columns = list(m1_fullseqs_trait_to_wide.columns) + mutation_columns_m1
m1_aa_columns_mut = m1_fullseq_wide_mut.iloc[:, 1:]

##### one hot encode

In [72]:
# One-hot encode the residue columns; one_hot_encode copies the *_mut flag columns through as they are.
m1_aa_onehot = one_hot_encode(m1_aa_columns_mut)

# Put the textual branch label next to the encoded features.
m1_aa_onehot_traits = pd.concat([m1_fullseq_wide_mut.Trait, m1_aa_onehot], axis=1)

# Swap the textual label for its integer code.
trait_codes = {"human-human": 0, "swine-swine": 1, "human-swine": 2, "swine-human": 3}
m1_aa_onehot_traits2 = m1_aa_onehot_traits.assign(
    Trait_num=m1_aa_onehot_traits.Trait.replace(trait_codes)
).drop(columns=["Trait"])

# assign() appended Trait_num as the last column; move it to the front.
feature_columns = list(m1_aa_onehot_traits2.columns[:-1])
m1_onehot = m1_aa_onehot_traits2.loc[:, ['Trait_num'] + feature_columns]

##### Filter away uninformative sites

In [73]:
# Keep only the columns that contain both a 0 and a 1 somewhere in the table.
# The Trait_num label column goes through the same test as the feature columns, so it is
# kept only when both label 0 and label 1 occur in the data.
m1_onehot_filtered = filter_uninformative_sites(m1_onehot)

##### Save as csv

In [74]:
# Write the filtered M1 table to mp/m1_onehot_filtered.csv (path is relative to the working directory).
folder_name = "mp"
# exist_ok=True creates the folder when missing and is a no-op when it is already there,
# which avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, "m1_onehot_filtered.csv")
# index=False: the row index is not written to the file.
m1_onehot_filtered.to_csv(file_path, index=False)

### M2

In [75]:
# loading data
# Column names are passed via `names`, so pandas reads every line of the file as data
# (a header line, if the file has one, ends up as the first row).
m2_fullseqs = pd.read_csv("/Users/kman/Desktop/ancestral_reconstruction_project_final/results/anclib/mp/m2_branchinfo_aa.txt", sep="\t",
                                    names=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from", "seq_to", "branch_type"])

# Convert 'branch_len' column to numeric; values that cannot be parsed become NaN
m2_fullseqs['branch_len'] = pd.to_numeric(m2_fullseqs['branch_len'], errors='coerce')
# Keep rows where 'branch_len' <= 15 (NaN never satisfies the comparison, so those rows are dropped as well)
m2_fullseqs_filtered = m2_fullseqs[m2_fullseqs['branch_len'] <= 15]


## Make wide tables of positions and amino acids in sequences from and to
# Number of position columns declared for the wide tables and the mutation flags below
m2_seq_len = 98
columns = ["Trait"] + ["pos"+str(i) for i in range(1, m2_seq_len + 1)]

# Branch label of the form "<trait_from>-<trait_to>"
m2_labelled = m2_fullseqs_filtered.assign(Trait=m2_fullseqs_filtered.trait_from + "-" + m2_fullseqs_filtered.trait_to)
m2_drop_cols = ["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to"]

# NOTE(review): .iloc[1:] discards the first remaining row. If it was meant to remove a header
# line, the branch_len filter above has already dropped that line (its branch_len would be NaN),
# so this would be discarding a real branch -- TODO confirm against the input file.
m2_fullseqs_trait_to = m2_labelled.drop(columns=m2_drop_cols + ["seq_from"]).iloc[1:]
m2_fullseqs_trait_from = m2_labelled.drop(columns=m2_drop_cols + ["seq_to"]).iloc[1:]

# Convert to wide format: one row per branch, the label followed by one character per column
data = [[trait] + list(seq) for trait, seq in zip(m2_fullseqs_trait_to.Trait, m2_fullseqs_trait_to.seq_to)]
m2_fullseqs_trait_to_wide = pd.DataFrame(data=data, columns=columns)
m2_aa_columns_to = m2_fullseqs_trait_to_wide.iloc[:, 1:]

data = [[trait] + list(seq) for trait, seq in zip(m2_fullseqs_trait_from.Trait, m2_fullseqs_trait_from.seq_from)]
m2_fullseqs_trait_from_wide = pd.DataFrame(data=data, columns=columns)
m2_aa_columns_from = m2_fullseqs_trait_from_wide.iloc[:, 1:]


# Create a new DataFrame for mutation information:
# the seq_to table followed by one 0/1 flag per position (1 = residue differs between from and to)
m2_fullseq_wide_mut = pd.concat([m2_fullseqs_trait_to_wide, (m2_aa_columns_from != m2_aa_columns_to).astype(int)], axis=1)

# Rename the mutation columns
mutation_columns_m2 = [f'pos{i}_mut' for i in range(1, m2_seq_len + 1)]
m2_fullseq_wide_mut.columns = list(m2_fullseqs_trait_to_wide.columns) + mutation_columns_m2
m2_aa_columns_mut = m2_fullseq_wide_mut.iloc[:, 1:]

##### one hot encode

In [76]:
# One-hot encode the residue columns; one_hot_encode copies the *_mut flag columns through as they are.
m2_aa_onehot = one_hot_encode(m2_aa_columns_mut)

# Put the textual branch label next to the encoded features.
m2_aa_onehot_traits = pd.concat([m2_fullseq_wide_mut.Trait, m2_aa_onehot], axis=1)

# Swap the textual label for its integer code.
trait_codes = {"human-human": 0, "swine-swine": 1, "human-swine": 2, "swine-human": 3}
m2_aa_onehot_traits2 = m2_aa_onehot_traits.assign(
    Trait_num=m2_aa_onehot_traits.Trait.replace(trait_codes)
).drop(columns=["Trait"])

# assign() appended Trait_num as the last column; move it to the front.
feature_columns = list(m2_aa_onehot_traits2.columns[:-1])
m2_onehot = m2_aa_onehot_traits2.loc[:, ['Trait_num'] + feature_columns]

##### Filter away uninformative sites

In [77]:
# Keep only the columns that contain both a 0 and a 1 somewhere in the table.
# The Trait_num label column goes through the same test as the feature columns, so it is
# kept only when both label 0 and label 1 occur in the data.
m2_onehot_filtered = filter_uninformative_sites(m2_onehot)

##### Save as csv

In [78]:
# Write the filtered M2 table to mp/m2_onehot_filtered.csv (path is relative to the working directory).
folder_name = "mp"
# exist_ok=True creates the folder when missing and is a no-op when it is already there,
# which avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, "m2_onehot_filtered.csv")
# index=False: the row index is not written to the file.
m2_onehot_filtered.to_csv(file_path, index=False)

### NS1

In [79]:
# loading data
# Column names are passed via `names`, so pandas reads every line of the file as data
# (a header line, if the file has one, ends up as the first row).
ns1_fullseqs = pd.read_csv("/Users/kman/Desktop/ancestral_reconstruction_project_final/results/anclib/ns/ns1_branchinfo_aa.txt", sep="\t",
                                    names=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from", "seq_to", "branch_type"])

# Convert 'branch_len' column to numeric; values that cannot be parsed become NaN
ns1_fullseqs['branch_len'] = pd.to_numeric(ns1_fullseqs['branch_len'], errors='coerce')
# Keep rows where 'branch_len' <= 15 (NaN never satisfies the comparison, so those rows are dropped as well)
ns1_fullseqs_filtered = ns1_fullseqs[ns1_fullseqs['branch_len'] <= 15]


## Make wide tables of positions and amino acids in sequences from and to
# Number of position columns declared for the wide tables and the mutation flags below
ns1_seq_len = 237
columns = ["Trait"] + ["pos"+str(i) for i in range(1, ns1_seq_len + 1)]

# Branch label of the form "<trait_from>-<trait_to>"
ns1_labelled = ns1_fullseqs_filtered.assign(Trait=ns1_fullseqs_filtered.trait_from + "-" + ns1_fullseqs_filtered.trait_to)
ns1_drop_cols = ["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to"]

# NOTE(review): .iloc[1:] discards the first remaining row. If it was meant to remove a header
# line, the branch_len filter above has already dropped that line (its branch_len would be NaN),
# so this would be discarding a real branch -- TODO confirm against the input file.
ns1_fullseqs_trait_to = ns1_labelled.drop(columns=ns1_drop_cols + ["seq_from"]).iloc[1:]
ns1_fullseqs_trait_from = ns1_labelled.drop(columns=ns1_drop_cols + ["seq_to"]).iloc[1:]

# Convert to wide format: one row per branch, the label followed by one character per column
data = [[trait] + list(seq) for trait, seq in zip(ns1_fullseqs_trait_to.Trait, ns1_fullseqs_trait_to.seq_to)]
ns1_fullseqs_trait_to_wide = pd.DataFrame(data=data, columns=columns)
ns1_aa_columns_to = ns1_fullseqs_trait_to_wide.iloc[:, 1:]

data = [[trait] + list(seq) for trait, seq in zip(ns1_fullseqs_trait_from.Trait, ns1_fullseqs_trait_from.seq_from)]
ns1_fullseqs_trait_from_wide = pd.DataFrame(data=data, columns=columns)
ns1_aa_columns_from = ns1_fullseqs_trait_from_wide.iloc[:, 1:]


# Create a new DataFrame for mutation information:
# the seq_to table followed by one 0/1 flag per position (1 = residue differs between from and to)
ns1_fullseq_wide_mut = pd.concat([ns1_fullseqs_trait_to_wide, (ns1_aa_columns_from != ns1_aa_columns_to).astype(int)], axis=1)

# Rename the mutation columns
mutation_columns_ns1 = [f'pos{i}_mut' for i in range(1, ns1_seq_len + 1)]
ns1_fullseq_wide_mut.columns = list(ns1_fullseqs_trait_to_wide.columns) + mutation_columns_ns1
ns1_aa_columns_mut = ns1_fullseq_wide_mut.iloc[:, 1:]

##### one hot encode

In [80]:
# One-hot encode the residue columns; one_hot_encode copies the *_mut flag columns through as they are.
ns1_aa_onehot = one_hot_encode(ns1_aa_columns_mut)

# Put the textual branch label next to the encoded features.
ns1_aa_onehot_traits = pd.concat([ns1_fullseq_wide_mut.Trait, ns1_aa_onehot], axis=1)

# Swap the textual label for its integer code.
trait_codes = {"human-human": 0, "swine-swine": 1, "human-swine": 2, "swine-human": 3}
ns1_aa_onehot_traits2 = ns1_aa_onehot_traits.assign(
    Trait_num=ns1_aa_onehot_traits.Trait.replace(trait_codes)
).drop(columns=["Trait"])

# assign() appended Trait_num as the last column; move it to the front.
feature_columns = list(ns1_aa_onehot_traits2.columns[:-1])
ns1_onehot = ns1_aa_onehot_traits2.loc[:, ['Trait_num'] + feature_columns]

##### Filter away uninformative sites

In [81]:
# Keep only the columns that contain both a 0 and a 1 somewhere in the table.
# The Trait_num label column goes through the same test as the feature columns, so it is
# kept only when both label 0 and label 1 occur in the data.
ns1_onehot_filtered = filter_uninformative_sites(ns1_onehot)

##### Save as csv

In [82]:
# Write the filtered NS1 table to ns/ns1_onehot_filtered.csv (path is relative to the working directory).
folder_name = "ns"
# exist_ok=True creates the folder when missing and is a no-op when it is already there,
# which avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, "ns1_onehot_filtered.csv")
# index=False: the row index is not written to the file.
ns1_onehot_filtered.to_csv(file_path, index=False)

### NEP

In [83]:
# loading data
# Column names are passed via `names`, so pandas reads every line of the file as data
# (a header line, if the file has one, ends up as the first row).
nep_fullseqs = pd.read_csv("/Users/kman/Desktop/ancestral_reconstruction_project_final/results/anclib/ns/nep_branchinfo_aa.txt", sep="\t",
                                    names=["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to", "seq_from", "seq_to", "branch_type"])

# Convert 'branch_len' column to numeric; values that cannot be parsed become NaN
nep_fullseqs['branch_len'] = pd.to_numeric(nep_fullseqs['branch_len'], errors='coerce')
# Keep rows where 'branch_len' <= 15 (NaN never satisfies the comparison, so those rows are dropped as well)
nep_fullseqs_filtered = nep_fullseqs[nep_fullseqs['branch_len'] <= 15]


## Make wide tables of positions and amino acids in sequences from and to
# Number of position columns declared for the wide tables and the mutation flags below
nep_seq_len = 122
columns = ["Trait"] + ["pos"+str(i) for i in range(1, nep_seq_len + 1)]

# Branch label of the form "<trait_from>-<trait_to>"
nep_labelled = nep_fullseqs_filtered.assign(Trait=nep_fullseqs_filtered.trait_from + "-" + nep_fullseqs_filtered.trait_to)
nep_drop_cols = ["node_from", "node_to", "branch_len", "trait_from", "trait_to", "traitprob_from", "traitprob_to"]

# NOTE(review): .iloc[1:] discards the first remaining row. If it was meant to remove a header
# line, the branch_len filter above has already dropped that line (its branch_len would be NaN),
# so this would be discarding a real branch -- TODO confirm against the input file.
nep_fullseqs_trait_to = nep_labelled.drop(columns=nep_drop_cols + ["seq_from"]).iloc[1:]
nep_fullseqs_trait_from = nep_labelled.drop(columns=nep_drop_cols + ["seq_to"]).iloc[1:]

# Convert to wide format: one row per branch, the label followed by one character per column
data = [[trait] + list(seq) for trait, seq in zip(nep_fullseqs_trait_to.Trait, nep_fullseqs_trait_to.seq_to)]
nep_fullseqs_trait_to_wide = pd.DataFrame(data=data, columns=columns)
nep_aa_columns_to = nep_fullseqs_trait_to_wide.iloc[:, 1:]

data = [[trait] + list(seq) for trait, seq in zip(nep_fullseqs_trait_from.Trait, nep_fullseqs_trait_from.seq_from)]
nep_fullseqs_trait_from_wide = pd.DataFrame(data=data, columns=columns)
nep_aa_columns_from = nep_fullseqs_trait_from_wide.iloc[:, 1:]


# Create a new DataFrame for mutation information:
# the seq_to table followed by one 0/1 flag per position (1 = residue differs between from and to)
nep_fullseq_wide_mut = pd.concat([nep_fullseqs_trait_to_wide, (nep_aa_columns_from != nep_aa_columns_to).astype(int)], axis=1)

# Rename the mutation columns
mutation_columns_nep = [f'pos{i}_mut' for i in range(1, nep_seq_len + 1)]
nep_fullseq_wide_mut.columns = list(nep_fullseqs_trait_to_wide.columns) + mutation_columns_nep
nep_aa_columns_mut = nep_fullseq_wide_mut.iloc[:, 1:]

##### one hot encode

In [84]:
# One-hot encode the residue columns; one_hot_encode copies the *_mut flag columns through as they are.
nep_aa_onehot = one_hot_encode(nep_aa_columns_mut)

# Put the textual branch label next to the encoded features.
nep_aa_onehot_traits = pd.concat([nep_fullseq_wide_mut.Trait, nep_aa_onehot], axis=1)

# Swap the textual label for its integer code.
trait_codes = {"human-human": 0, "swine-swine": 1, "human-swine": 2, "swine-human": 3}
nep_aa_onehot_traits2 = nep_aa_onehot_traits.assign(
    Trait_num=nep_aa_onehot_traits.Trait.replace(trait_codes)
).drop(columns=["Trait"])

# assign() appended Trait_num as the last column; move it to the front.
feature_columns = list(nep_aa_onehot_traits2.columns[:-1])
nep_onehot = nep_aa_onehot_traits2.loc[:, ['Trait_num'] + feature_columns]

##### Filter away uninformative sites

In [85]:
# Keep only the columns that contain both a 0 and a 1 somewhere in the table.
# The Trait_num label column goes through the same test as the feature columns, so it is
# kept only when both label 0 and label 1 occur in the data.
nep_onehot_filtered = filter_uninformative_sites(nep_onehot)

##### Save as csv

In [86]:
# Write the filtered NEP table to ns/nep_onehot_filtered.csv (path is relative to the working directory).
folder_name = "ns"
# exist_ok=True creates the folder when missing and is a no-op when it is already there,
# which avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, "nep_onehot_filtered.csv")
# index=False: the row index is not written to the file.
nep_onehot_filtered.to_csv(file_path, index=False)