# Get objects for gggenomes plot

This notebook creates objects required by gggenomes to do a gene synteny and homology figure. More details on https://github.com/MGXlab/genes_synteny.

In [2]:
#Activate print all activities
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Open alv_genes_blast

In [3]:
import pandas as pd
alv_genes = pd.read_csv('../objects/alv_genes1.csv', header = 0) 
alv_genes.head()

Unnamed: 0,bin_id,seq_id,start,end,strand,feat_id,name
0,DD5b,46,373085,373903,+,sumT_DD5b,sumT
1,DSM1208,1,3717908,3719089,+,moeA_1_DSM1208,moeA_1
2,DSM1208,1,3719535,3719975,+,moaE_1_DSM1208,moaE_1
3,DSM1208,1,4313511,4313753,+,moaD_DSM1208,moaD
4,DSM1208,1,4313763,4314173,+,moaE_2_DSM1208,moaE_2


In [4]:
#Remove the underscore and number of paralog of all genes
alv_genes['feat_id'] = alv_genes['feat_id'].str.replace(r'_1', '', regex=True)
alv_genes['feat_id'] = alv_genes['feat_id'].str.replace(r'_2', '', regex=True)

alv_genes['name'] = alv_genes['name'].str.replace(r'_1', '', regex=True)
alv_genes['name'] = alv_genes['name'].str.replace(r'_2', '', regex=True)

#Alvaro said moaC and moaC2 are NOT the same!
#alv_genes['feat_id'] = alv_genes['feat_id'].str.replace(r'moaC2', 'moaC', regex=True)
#alv_genes['name'] = alv_genes['name'].str.replace(r'moaC2', 'moaC', regex=True)
alv_genes.head()

Unnamed: 0,bin_id,seq_id,start,end,strand,feat_id,name
0,DD5b,46,373085,373903,+,sumT_DD5b,sumT
1,DSM1208,1,3717908,3719089,+,moeA_DSM1208,moeA
2,DSM1208,1,3719535,3719975,+,moaE_DSM1208,moaE
3,DSM1208,1,4313511,4313753,+,moaD_DSM1208,moaD
4,DSM1208,1,4313763,4314173,+,moaE_DSM1208,moaE


In [5]:
alv_genes.to_csv('../objects/alv_genes.csv', index = False)

# Create alv_seqs_blast

Create object alv_seqs based on information of: https://thackl.github.io/gggenomes/articles/gggenomes.html#sequence-track---there-can-be-only-one

Columns: seq_id,bin_id,length

In [6]:
#Put list in order of phylogenetic tree
sp_lst = ['ZONMW-30', 'ZONMW-20', 'HI1', 'DSM1208', 'DSM15718', 'DD5b', 'UW101', 'IR1']
sp_lst

['ZONMW-30', 'ZONMW-20', 'HI1', 'DSM1208', 'DSM15718', 'DD5b', 'UW101', 'IR1']

In [7]:
#Create an empty DataFrame with specified column names
alv_seqs = pd.DataFrame(columns=["bin_id", "seq_id", "length"])

empty = 0

# Iterate through the list (each species is already in the correct order) 
for bin_id in sp_lst:
    
    #Get ortholog genes for this genome ID
    tmp = alv_genes[alv_genes['bin_id'] == bin_id]
    
    #Get list of seq_ids (contigs)
    seq_lst = tmp['seq_id'].unique().tolist()
    
    #If tmp is empty, it means the name correspondence is not correct
    if (tmp.empty):
        print("Error found in looking for orthologs in alv_genes_blast!")
    else:

        #Get min and max lengths for every bin_id (contig)
        for seq in seq_lst:

            #Get ortholog genes for this bin_id
            tmp2 = tmp[tmp['seq_id'] == seq]
        
            #Get min and max reported lengths for each bin_id of this seq_id 
            max = tmp2[['start','end']].max().max()
            min = tmp2[['start','end']].min().min()
        
            width = max - min + 1
            #I will set the length of the contig as the max position plus ten
            length = int(max + 10)

            # Append a new row to the DataFrame with the values
            new_row = pd.Series({"bin_id": bin_id, "seq_id": str(seq), "length": length})
            alv_seqs = pd.concat([alv_seqs, pd.DataFrame([new_row])], ignore_index=True)

In [8]:
alv_seqs

#Swap rows of indexes 2 and row 3 - To change the order of contigs of species IR1, put the longest first
alv_seqs.iloc[[10, 11]] = alv_seqs.iloc[[11, 10]].values

alv_seqs

Unnamed: 0,bin_id,seq_id,length
0,ZONMW-30,4,172314
1,ZONMW-30,7,11335
2,ZONMW-30,13,19644
3,ZONMW-20,10,176073
4,ZONMW-20,17,121851
5,HI1,1,3188588
6,DSM1208,1,4337566
7,DSM15718,30,51564
8,DD5b,46,373913
9,UW101,1,5613620


Unnamed: 0,bin_id,seq_id,length
0,ZONMW-30,4,172314
1,ZONMW-30,7,11335
2,ZONMW-30,13,19644
3,ZONMW-20,10,176073
4,ZONMW-20,17,121851
5,HI1,1,3188588
6,DSM1208,1,4337566
7,DSM15718,30,51564
8,DD5b,46,373913
9,UW101,1,5613620


In [9]:
alv_seqs.to_csv('../objects/alv_seqs.csv', index = False)

# Create alv_seqs_blast2 and alv_genes_blast2 with shortened genome length

To improve visualization, the length of the genomes in alv_seqs and the start and end of genes in alv_genes is shortened.  

In [10]:
import pandas as pd

alv_genes = pd.read_csv('../objects/alv_genes.csv', header = 0) #Not anchored
alv_seqs = pd.read_csv('../objects/alv_seqs.csv', header = 0, index_col=False)
alv_genes.head()
alv_seqs.head()

Unnamed: 0,bin_id,seq_id,start,end,strand,feat_id,name
0,DD5b,46,373085,373903,+,sumT_DD5b,sumT
1,DSM1208,1,3717908,3719089,+,moeA_DSM1208,moeA
2,DSM1208,1,3719535,3719975,+,moaE_DSM1208,moaE
3,DSM1208,1,4313511,4313753,+,moaD_DSM1208,moaD
4,DSM1208,1,4313763,4314173,+,moaE_DSM1208,moaE


Unnamed: 0,bin_id,seq_id,length
0,ZONMW-30,4,172314
1,ZONMW-30,7,11335
2,ZONMW-30,13,19644
3,ZONMW-20,10,176073
4,ZONMW-20,17,121851


In [11]:
#Get list of species in the right order
sp_lst = list(dict.fromkeys(alv_seqs['bin_id'] ))
sp_lst

['ZONMW-30', 'ZONMW-20', 'HI1', 'DSM1208', 'DSM15718', 'DD5b', 'UW101', 'IR1']

In [12]:
##### ON USE ######

###################

#This code subtracts starts and ends of genomes
# based on global starts and ends

import numpy as np

#Copy dataframes
alv_genes2 = alv_genes.copy()
alv_seqs2 = alv_seqs.copy()

#Go through every species (bin_id)
for sp in sp_lst:
    
    #Select all genes of the current species
    alv_genes_bin = alv_genes[alv_genes['bin_id'] == sp]
    
    #Go through every seq_id (contig)
    seq_lst = alv_genes_bin['seq_id'].unique().tolist()
    
    #Go through every contig (seq_id)
    for seq_id in seq_lst:

        #Select all genes of the current species(bin_id) and seq_id
        alv_genes_seq = alv_genes_bin[alv_genes_bin['seq_id'] == seq_id]        
        
        #Get min reported positions of all genes
        min = alv_genes_seq[['start','end']].min().min()
        
        ### Substract min length from sequences' start and end ###
        alv_genes2.loc[(alv_genes2['bin_id'] == sp) & (alv_genes2['seq_id'] == seq_id), 'start' ] -=  min
        alv_genes2.loc[(alv_genes2['bin_id'] == sp) & (alv_genes2['seq_id'] == seq_id), 'end' ] -=  min
        
        ### Substract min length from genomes ###
        alv_seqs2.loc[(alv_seqs2['bin_id'] == sp) & (alv_seqs2['seq_id'] == seq_id), 'length' ] -=  min - 1       
        
#alv_genes2['start'] = alv_genes2['start'].astype(int)
#alv_genes2['end'] = alv_genes2['end'].astype(int)
#alv_seqs2['length'] = alv_seqs2['start'].astype(int)

In [13]:
alv_genes2.head(n=10)
alv_seqs2.head(n=10)

Unnamed: 0,bin_id,seq_id,start,end,strand,feat_id,name
0,DD5b,46,0,818,+,sumT_DD5b,sumT
1,DSM1208,1,0,1181,+,moeA_DSM1208,moeA
2,DSM1208,1,1627,2067,+,moaE_DSM1208,moaE
3,DSM1208,1,595603,595845,+,moaD_DSM1208,moaD
4,DSM1208,1,595855,596265,+,moaE_DSM1208,moaE
5,DSM1208,1,596428,597327,+,moaA_DSM1208,moaA
6,DSM1208,1,597332,597805,+,moaC_DSM1208,moaC
7,DSM1208,1,597789,598385,+,mobA_DSM1208,mobA
8,DSM1208,1,598382,599455,+,moeZ_DSM1208,moeZ
9,DSM1208,1,618458,619648,-,moeA_DSM1208,moeA


Unnamed: 0,bin_id,seq_id,length
0,ZONMW-30,4,154620
1,ZONMW-30,7,2428
2,ZONMW-30,13,1690
3,ZONMW-20,10,77708
4,ZONMW-20,17,595
5,HI1,1,2392577
6,DSM1208,1,619659
7,DSM15718,30,805
8,DD5b,46,829
9,UW101,1,7527


In [14]:
len(alv_genes2[alv_genes2['start'] >= 0])
len(alv_genes2[alv_genes2['start'] < 0])

52

0

In [15]:
alv_seqs2.to_csv('../objects/alv_seqs2.csv', index = False)
alv_genes2.to_csv('../objects/alv_genes2.csv', index = False)

# Create alv_ava_blast2

alv_ava contains the homology between contigs.

In [16]:
import pandas as pd

alv_seqs2 = pd.read_csv('../objects/alv_seqs2.csv', header = 0, index_col=False)
alv_seqs.head()

Unnamed: 0,bin_id,seq_id,length
0,ZONMW-30,4,172314
1,ZONMW-30,7,11335
2,ZONMW-30,13,19644
3,ZONMW-20,10,176073
4,ZONMW-20,17,121851


In [17]:
#New ON USE###
##################

#Create empty DataFrame with specified column names
alv_ava2 = pd.DataFrame(columns=["bin_id", "seq_id", "bin_id2", "seq_id2"])

#Get list of species in the right order
sp_lst = list(dict.fromkeys(alv_seqs['bin_id'] ))
sp_lst

#For every line of alv_species, assign genome homology/orthology
for i in range(len(sp_lst)):
    
    #Do not make comparisons with the last species
    if(i < 7):

        bin_id = sp_lst[i]
        bin_id2 = sp_lst[i + 1]

        #Select all seqs (contigs) of the current bin/species
        alv_seqs2_bin = alv_seqs2[alv_seqs2['bin_id'] == bin_id]

        #Select all seqs (contigs) of the next bin/species
        alv_seqs2_bin2 = alv_seqs2[alv_seqs2['bin_id'] == bin_id2]
        
        #Get list of every seq_id (contig) of the current bin/species
        seq_lst = alv_seqs2_bin['seq_id'].unique().tolist()

        #Get list of every seq_id (contig) of the next bin/species
        seq_lst2 = alv_seqs2_bin2['seq_id'].unique().tolist()
    
        #Go through every contig (seq_id)
        for seq_id in seq_lst:

            #Go through every contig (seq_id)
            for seq_id2 in seq_lst2:
            
                # Append a new row to the DataFrame with the values
                new_row = pd.Series({"bin_id": bin_id, "seq_id": seq_id, "bin_id2": bin_id2, "seq_id2": seq_id2})
                alv_ava2 = pd.concat([alv_ava2, pd.DataFrame([new_row])], ignore_index=True)

alv_ava2.head()        

['ZONMW-30', 'ZONMW-20', 'HI1', 'DSM1208', 'DSM15718', 'DD5b', 'UW101', 'IR1']

Unnamed: 0,bin_id,seq_id,bin_id2,seq_id2
0,ZONMW-30,4,ZONMW-20,10
1,ZONMW-30,4,ZONMW-20,17
2,ZONMW-30,7,ZONMW-20,10
3,ZONMW-30,7,ZONMW-20,17
4,ZONMW-30,13,ZONMW-20,10


In [18]:
alv_ava2.to_csv('../objects/alv_ava2.csv', index = False)

# Create alv_prot_ava_blast2

alv_ava contains the homology between genes.

In [19]:
import pandas as pd

alv_genes2.to_csv('../objects/alv_genes2.csv', index = False)      
alv_genes2.head()
len(alv_genes2)

Unnamed: 0,bin_id,seq_id,start,end,strand,feat_id,name
0,DD5b,46,0,818,+,sumT_DD5b,sumT
1,DSM1208,1,0,1181,+,moeA_DSM1208,moeA
2,DSM1208,1,1627,2067,+,moaE_DSM1208,moaE
3,DSM1208,1,595603,595845,+,moaD_DSM1208,moaD
4,DSM1208,1,595855,596265,+,moaE_DSM1208,moaE


52

In [20]:
#Create empty DataFrame with specified column names
alv_prot_ava2 = pd.DataFrame(columns=["bin_id", "feat_id",
                                      "bin_id2", "feat_id2"])

pd.set_option('mode.chained_assignment', None)
import numpy as np

#Get list of species in the right order
sp_lst = list(dict.fromkeys(alv_seqs['bin_id'] ))

#For every line of alv_species, get genes and assign homologies
for i in range(len(sp_lst)):
    
    #Get species and next species
    sp = sp_lst[i]

    #print(i, len(sp_lst))
    
    #Do not make comparisons with the last species
    if(i < (len(sp_lst) - 1) ):    

        sp2 = sp_lst[i + 1]
        #print(sp, sp2)
        
        #Select all genes of the current species
        alv_genes_bin = alv_genes[alv_genes['bin_id'] == sp]

        #Select all genes of the next species
        alv_genes_bin2 = alv_genes[alv_genes['bin_id'] == sp2]
        
        #Go through every gene of current species and compare to all genes of next species. Assign homologies 
        for x in range (len(alv_genes_bin)):
        
            #Get information for current gene
            row1 = alv_genes_bin.iloc[x]
            feat_id1 = row1.loc['feat_id']
            name1 = row1.loc['name']

            for x2 in range (len(alv_genes_bin2)):
            
                #Get information for current element
                row2 = alv_genes_bin2.iloc[x2]
                feat_id2 = row2.loc['feat_id']
                name2 = row2.loc['name']

                #Found homology!
                if(name1 == name2):
                    
                    #Add homology to file
                    homology_row = pd.Series({"bin_id": sp, "feat_id": feat_id1, 
                                              "bin_id2": sp2, "feat_id2": feat_id2})
                    
                    alv_prot_ava2 = pd.concat([alv_prot_ava2, pd.DataFrame([homology_row])], ignore_index=True) # spacer
alv_prot_ava2.head()

Unnamed: 0,bin_id,feat_id,bin_id2,feat_id2
0,ZONMW-30,moaE_ZONMW-30,ZONMW-20,moaE_ZONMW-20
1,ZONMW-30,moaA_ZONMW-30,ZONMW-20,moaA_ZONMW-20
2,ZONMW-30,moeA_ZONMW-30,ZONMW-20,moeA_ZONMW-20
3,ZONMW-30,mobA_ZONMW-30,ZONMW-20,mobA_ZONMW-20
4,ZONMW-30,moeA_ZONMW-30,ZONMW-20,moeA_ZONMW-20


In [21]:
alv_prot_ava2 = alv_prot_ava2.drop_duplicates()

In [22]:
alv_prot_ava2.to_csv('../objects/alv_prot_ava2.csv', index = False)

# Decrease distances between elements

Create: alv_ava_blast3 and alv_genes_blast3.  

To improve visualization of the synteny, all large spaces between elements (genes or operons) are to be substituted by a spacer element of X bp surrounded by Y bp.   

The only object to change is alv_all2.csv, all others are to be kept the same.

In [24]:
#Activate print all activities
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [25]:
import pandas as pd

alv_genes2 = pd.read_csv('../objects/alv_genes2.csv', header = 0)
alv_seqs2 = pd.read_csv('../objects/alv_seqs2.csv', header = 0, index_col=False)
alv_genes2.head()
alv_seqs2.head()

Unnamed: 0,bin_id,seq_id,start,end,strand,feat_id,name
0,DD5b,46,0,818,+,sumT_DD5b,sumT
1,DSM1208,1,0,1181,+,moeA_DSM1208,moeA
2,DSM1208,1,1627,2067,+,moaE_DSM1208,moaE
3,DSM1208,1,595603,595845,+,moaD_DSM1208,moaD
4,DSM1208,1,595855,596265,+,moaE_DSM1208,moaE


Unnamed: 0,bin_id,seq_id,length
0,ZONMW-30,4,154620
1,ZONMW-30,7,2428
2,ZONMW-30,13,1690
3,ZONMW-20,10,77708
4,ZONMW-20,17,595


In [26]:
pd.set_option('mode.chained_assignment', None)
import numpy as np

#Numbers below add a really long spacer between genes
#max_dist = 10000
#spacer = 2500
#spacer_width = 1000
#Numbers below offer a better visualization, with almost no spacers between genes
max_dist = 5000
spacer = 500
spacer_width = 250

alv_genes3 = pd.DataFrame(columns=alv_genes2.columns)

#Work through every species and contig in order. If elements are farther away than max_dist, 
# remove the distance and add 2*spacer + spacer_width
for index, row in alv_seqs2.iterrows():

    #Get current bin (species) and seq (contig)
    bin_id = row['bin_id']
    seq_id = row['seq_id']
    
    #Get all elements belonging to seq and bin (genes, operons)
    el = alv_genes2[(alv_genes2['bin_id'] == bin_id) & (alv_genes2['seq_id'] == seq_id)]

    #Put elements in ascending order of start position
    el_ordered = el.sort_values(by='start', ascending=True)
    #Add first row with fake element starting on 1bp (this is to cut the large spaces in the begining)
    #header:
    #bin_id	seq_id	start	end	strand	feat_id	name
    first_row = pd.DataFrame({"bin_id": [bin_id], "seq_id": [seq_id], "start": [1], "end": [100], "strand": ['+'],
                              "feat_id": ['START'], "name": ['START']})
    el_ordered.reset_index(drop=True, inplace=True)
    el_ordered = pd.concat([first_row, el_ordered]).reset_index(drop=True)
    el_ordered.reset_index(drop=True, inplace=True)    
    
    #Compare 1st with second line/element, 2nd with 3rd and so on
    #For every line (gene/operon) of alv_species, re-assign start and end positions whenever necessary
    for i in range (len(el_ordered)):
      
        #Do the comparison only until the last element
        if(i < (len(el_ordered) - 1)):
        
            #Get information for current element
            row1 = el_ordered.iloc[i]
            start1 = row1['end']
        
            #Get information for next element
            row2 = el_ordered.iloc[i + 1]
            start2 = row2['start']
            end2 = row2['end']
            width2 = end2 - start2 + 1
                
            #Check how far elements are from each other (end of element 1 and start of element 2)
            distance = (start2 - start1)
            
            #If too far, include spacer, otherwise keep elements as they are
            if (distance >  max_dist):
                
                #Prepare spacer
                spacer_row = row1.copy()
                spacer_row.loc['start'] = start1 + spacer 
                spacer_row.loc['end'] = start1 + spacer + spacer_width + 1
                spacer_row.loc['strand'] = '+'
                spacer_row.loc['feat_id'] = 'z_spacer'
                spacer_row.loc['width'] = spacer_width 
                spacer_row.loc['name'] = 'z_spacer'
                #Append spacer
                alv_genes3 = pd.concat([alv_genes3, pd.DataFrame([spacer_row])], ignore_index=True) # spacer
                
                #Prepare new row2
                new_row2 = row2                
                new_row2.loc['start'] = start1 + (2 * spacer) + spacer_width + 1
                new_row2.loc['end'] = (start1 + (2 * spacer) + spacer_width + 1) + width2 + 1
                #Append row2
                alv_genes3 = pd.concat([alv_genes3, pd.DataFrame([new_row2])], ignore_index=True) # spacer

                i2 = i + 1
                #Change el_ordered for the changed row permanently
                el_ordered.loc[i2, 'start'] = start1 + (2 * spacer) + spacer_width + 1
                el_ordered.loc[i2, 'end'] = (start1 + (2 * spacer) + spacer_width + 1) + width2 + 1
                
                #Re-set starts and ends of all further elements of the dataframe (genes and operons of this species)
                for i3 in range ((i2+1), len(el_ordered)):
                    
                    el_ordered.loc[i3, 'start'] = (el_ordered.at[i3, 'start']) - distance + 2 * spacer + spacer_width + 1
                    el_ordered.loc[i3, 'end'] = (el_ordered.at[i3, 'end']) - distance + 2 * spacer + spacer_width + 1
            
            else:
                alv_genes3 = pd.concat([alv_genes3, pd.DataFrame([row1])], ignore_index=True)
                alv_genes3 = pd.concat([alv_genes3, pd.DataFrame([row2])], ignore_index=True)

#Remove duplicates since the elements are printed twice (once as row1 and once as row2)
alv_genes3.head()

alv_genes3 = alv_genes3.drop_duplicates()
alv_genes3 = alv_genes3[~alv_genes3['feat_id'].str.contains('START')]
#Remove unnecessary column
alv_genes3 = alv_genes3.drop(['width'], axis=1)

alv_genes3.head()

Unnamed: 0,bin_id,seq_id,start,end,strand,feat_id,name,width
0,ZONMW-30,4,1,100,+,START,START,
1,ZONMW-30,4,0,443,+,moaE_ZONMW-30,moaE,
2,ZONMW-30,4,0,443,+,moaE_ZONMW-30,moaE,
3,ZONMW-30,4,444,1340,+,moaC2_ZONMW-30,moaC2,
4,ZONMW-30,4,444,1340,+,moaC2_ZONMW-30,moaC2,


Unnamed: 0,bin_id,seq_id,start,end,strand,feat_id,name
1,ZONMW-30,4,0,443,+,moaE_ZONMW-30,moaE
3,ZONMW-30,4,444,1340,+,moaC2_ZONMW-30,moaC2
5,ZONMW-30,4,1333,2325,+,moaA_ZONMW-30,moaA
7,ZONMW-30,4,2325,3491,+,moeA_ZONMW-30,moeA
8,ZONMW-30,4,3991,4242,+,z_spacer,z_spacer


In [27]:
alv_genes3.to_csv('../objects/alv_genes3.csv', index = False)

# Decrease global distances of genomes in alv_seqs3

Following on the code block above, I decreased the total genome length to improve visualization.

In [29]:
#Activate print all activities
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [30]:
import pandas as pd

alv_seqs2 = pd.read_csv('../objects/alv_seqs2.csv', header = 0, index_col=False)
alv_genes3 = pd.read_csv('../objects/alv_genes3.csv', header = 0, index_col=False)
alv_seqs2.head()
alv_genes3.head()

Unnamed: 0,bin_id,seq_id,length
0,ZONMW-30,4,154620
1,ZONMW-30,7,2428
2,ZONMW-30,13,1690
3,ZONMW-20,10,77708
4,ZONMW-20,17,595


Unnamed: 0,bin_id,seq_id,start,end,strand,feat_id,name
0,ZONMW-30,4,0,443,+,moaE_ZONMW-30,moaE
1,ZONMW-30,4,444,1340,+,moaC2_ZONMW-30,moaC2
2,ZONMW-30,4,1333,2325,+,moaA_ZONMW-30,moaA
3,ZONMW-30,4,2325,3491,+,moeA_ZONMW-30,moeA
4,ZONMW-30,4,3991,4242,+,z_spacer,z_spacer


In [31]:
alv_seqs3 = alv_seqs2.copy()

for index, row in alv_seqs2.iterrows():

    #Get current bin (species) and seq (contig)
    bin_id = row['bin_id']
    seq_id = row['seq_id']
    
    #Get all elements belonging to seq and bin (genes, operons)
    el = alv_genes3[(alv_genes3['bin_id'] == bin_id) & (alv_genes3['seq_id'] == seq_id)]
    
    #Get maximum values 
    max = el['end'].max()

    #print(seq_id,alv_seqs3.loc[index, 'length'], max)
    
    #Substitute values in alv_seqs3
    alv_seqs3.loc[index, 'length'] = max + 10

In [32]:
alv_seqs3.to_csv('../objects/alv_seqs3.csv', index = False)