# transform `pseudogenes.csv` in gff3 

This is needed to standardise the addition of attributes at the end of the screening for core and dispensable genes, so that one of the input will be always an annotation file in gff3 format (ncbi annotations are in gff3)

In [45]:
import os
os.chdir('/home/lia/A_pangenome')
import pandas as pd

In [46]:
!head pseudogenes.csv | column -t

Chromosome  start   end     coverage  Genewise_score  position    size  Strand  Type  Pater        Species     Stop_codon  Frameshift
Chr1        6144    6269    0.09      56.39           UTR         129   plus    FRAG  AT1G01010.1  A.thaliana  0           0
Chr1        47087   47325   0.06      74.09           Intergenic  206   plus    FRAG  AT1G02730.1  A.thaliana  1           3
Chr1        89932   90153   0.02      24.75           Intergenic  51    plus    FRAG  AT5G41740.2  A.thaliana  1           3
Chr1        259011  259151  0.06      48.95           Intergenic  84    plus    FRAG  AT1G01690.1  A.thaliana  0           0
Chr1        266562  267495  0.99      68.96           UTR         979   plus    DUP   AT4G00525.1  A.thaliana  0           0
Chr1        426018  426176  0.06      17.28           Intergenic  36    plus    FRAG  AT3G04430.1  A.thaliana  1           1
Chr1        578305  578484  0.04      47.13           Intron_cds  102   plus    FRAG  AT1G05120.1  A.thaliana  0    

In [47]:
df = pd.read_csv('pseudogenes.csv', sep='\t')
df.head()

Unnamed: 0,Chromosome,start,end,coverage,Genewise_score,position,size,Strand,Type,Pater,Species,Stop_codon,Frameshift
0,Chr1,6144,6269,0.09,56.39,UTR,129,plus,FRAG,AT1G01010.1,A.thaliana,0,0
1,Chr1,47087,47325,0.06,74.09,Intergenic,206,plus,FRAG,AT1G02730.1,A.thaliana,1,3
2,Chr1,89932,90153,0.02,24.75,Intergenic,51,plus,FRAG,AT5G41740.2,A.thaliana,1,3
3,Chr1,259011,259151,0.06,48.95,Intergenic,84,plus,FRAG,AT1G01690.1,A.thaliana,0,0
4,Chr1,266562,267495,0.99,68.96,UTR,979,plus,DUP,AT4G00525.1,A.thaliana,0,0


In [48]:
df["source"] = "bed_to_gff3"
df["type"] = "pseudogene"
df["score"] = "."
df["score2"] = "."
df

Unnamed: 0,Chromosome,start,end,coverage,Genewise_score,position,size,Strand,Type,Pater,Species,Stop_codon,Frameshift,source,type,score,score2
0,Chr1,6144,6269,0.09,56.39,UTR,129,plus,FRAG,AT1G01010.1,A.thaliana,0,0,bed_to_gff3,pseudogene,.,.
1,Chr1,47087,47325,0.06,74.09,Intergenic,206,plus,FRAG,AT1G02730.1,A.thaliana,1,3,bed_to_gff3,pseudogene,.,.
2,Chr1,89932,90153,0.02,24.75,Intergenic,51,plus,FRAG,AT5G41740.2,A.thaliana,1,3,bed_to_gff3,pseudogene,.,.
3,Chr1,259011,259151,0.06,48.95,Intergenic,84,plus,FRAG,AT1G01690.1,A.thaliana,0,0,bed_to_gff3,pseudogene,.,.
4,Chr1,266562,267495,0.99,68.96,UTR,979,plus,DUP,AT4G00525.1,A.thaliana,0,0,bed_to_gff3,pseudogene,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4670,Chr5,610030,610113,0.04,15.65,Intron_cds,51,minus,FRAG,AT3G28410.1,A.thaliana,0,0,bed_to_gff3,pseudogene,.,.
4671,Chr5,489952,491331,0.11,91.26,Intergenic,1350,minus,DUP,AT5G02320.1,A.thaliana,0,0,bed_to_gff3,pseudogene,.,.
4672,Chr5,240395,240439,0.03,52.75,Intron_cds,105,minus,FRAG,AT4G00020.2,A.thaliana,0,0,bed_to_gff3,pseudogene,.,.
4673,Chr5,238426,238539,0.02,38.62,Intron_cds,69,minus,FRAG,AT4G00020.2,A.thaliana,0,0,bed_to_gff3,pseudogene,.,.


In [49]:
# Rearrange the columns (not strictly necessary, but I like to have the assembly_name column first)
# Get a list of all the column names
cols = list(df.columns)
# Remove 'assembly_name' from the list
cols.remove('Chromosome')
cols.remove('source')
cols.remove('type')
cols.remove('start')
cols.remove('end')
cols.remove('score')
cols.remove('Strand')
cols.remove('score2')
# Create a new list with 'assembly_name' first, followed by the other columns
cols = ['Chromosome', 'source', 'type', 'start', 'end', 'score', 'Strand', 'score2'] + cols
# Reorder the DataFrame
df = df[cols]
df.head()

Unnamed: 0,Chromosome,source,type,start,end,score,Strand,score2,coverage,Genewise_score,position,size,Type,Pater,Species,Stop_codon,Frameshift
0,Chr1,bed_to_gff3,pseudogene,6144,6269,.,plus,.,0.09,56.39,UTR,129,FRAG,AT1G01010.1,A.thaliana,0,0
1,Chr1,bed_to_gff3,pseudogene,47087,47325,.,plus,.,0.06,74.09,Intergenic,206,FRAG,AT1G02730.1,A.thaliana,1,3
2,Chr1,bed_to_gff3,pseudogene,89932,90153,.,plus,.,0.02,24.75,Intergenic,51,FRAG,AT5G41740.2,A.thaliana,1,3
3,Chr1,bed_to_gff3,pseudogene,259011,259151,.,plus,.,0.06,48.95,Intergenic,84,FRAG,AT1G01690.1,A.thaliana,0,0
4,Chr1,bed_to_gff3,pseudogene,266562,267495,.,plus,.,0.99,68.96,UTR,979,DUP,AT4G00525.1,A.thaliana,0,0


In [50]:
# Specify the columns to be joined
cols_to_join = ['coverage', 'Genewise_score', 'position', 'size', 'Type', 'Pater', 'Species', 'Stop_codon', 'Frameshift']

# Create the new column
df['attributes'] = df[cols_to_join].apply(lambda row: ';'.join(f'{col}={row[col]}' for col in cols_to_join), axis=1)

# Drop the original columns
df = df.drop(columns=cols_to_join)

# substitute plus and minus with + and -
df['Strand'] = df['Strand'].replace({'plus': '+', 'minus': '-'})

#change the name of the chromosome to adapt them to those used in the pangenome
df['Chromosome'] = df['Chromosome'].replace({'Chr1': 'CP002684.1', 'Chr2': 'CP002685.1' , 'Chr3': 'CP002686.1', 'Chr4': 'CP002687.1', 'Chr5': 'CP002688.1'})

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['attributes'] = df[cols_to_join].apply(lambda row: ';'.join(f'{col}={row[col]}' for col in cols_to_join), axis=1)


Unnamed: 0,Chromosome,source,type,start,end,score,Strand,score2,attributes
0,CP002684.1,bed_to_gff3,pseudogene,6144,6269,.,+,.,coverage=0.09;Genewise_score=56.39;position=UT...
1,CP002684.1,bed_to_gff3,pseudogene,47087,47325,.,+,.,coverage=0.06;Genewise_score=74.09;position=In...
2,CP002684.1,bed_to_gff3,pseudogene,89932,90153,.,+,.,coverage=0.02;Genewise_score=24.75;position=In...
3,CP002684.1,bed_to_gff3,pseudogene,259011,259151,.,+,.,coverage=0.06;Genewise_score=48.95;position=In...
4,CP002684.1,bed_to_gff3,pseudogene,266562,267495,.,+,.,coverage=0.99;Genewise_score=68.96;position=UT...


In [51]:
print(df['Chromosome'].unique())

['CP002684.1' 'CP002685.1' 'CP002686.1' 'CP002687.1' 'CP002688.1']


In [43]:
!head GCA_000001735.2_TAIR10.1_genomic.gff | column -t

##gff-version             3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        
#!gff-spec-version        1.21                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [52]:
df.to_csv('pseudogenes.gff', sep='\t', index=False, header=False)

In [53]:
!head pseudogenes.gff | column -t

CP002684.1  bed_to_gff3  pseudogene  6144    6269    .  +  .  coverage=0.09;Genewise_score=56.39;position=UTR;size=129;Type=FRAG;Pater=AT1G01010.1;Species=A.thaliana;Stop_codon=0;Frameshift=0
CP002684.1  bed_to_gff3  pseudogene  47087   47325   .  +  .  coverage=0.06;Genewise_score=74.09;position=Intergenic;size=206;Type=FRAG;Pater=AT1G02730.1;Species=A.thaliana;Stop_codon=1;Frameshift=3
CP002684.1  bed_to_gff3  pseudogene  89932   90153   .  +  .  coverage=0.02;Genewise_score=24.75;position=Intergenic;size=51;Type=FRAG;Pater=AT5G41740.2;Species=A.thaliana;Stop_codon=1;Frameshift=3
CP002684.1  bed_to_gff3  pseudogene  259011  259151  .  +  .  coverage=0.06;Genewise_score=48.95;position=Intergenic;size=84;Type=FRAG;Pater=AT1G01690.1;Species=A.thaliana;Stop_codon=0;Frameshift=0
CP002684.1  bed_to_gff3  pseudogene  266562  267495  .  +  .  coverage=0.99;Genewise_score=68.96;position=UTR;size=979;Type=DUP;Pater=AT4G00525.1;Species=A.thaliana;Stop_codon=0;Frameshift=0
CP002684.1  bed_to_gff

In [54]:
!tail pseudogenes.gff | column -t

CP002688.1  bed_to_gff3  pseudogene  1341997  1342098  .  -  .  coverage=0.04;Genewise_score=19.96;position=Intergenic;size=51;Type=FRAG;Pater=AT3G23280.1;Species=A.thaliana;Stop_codon=0;Frameshift=0
CP002688.1  bed_to_gff3  pseudogene  1031516  1032032  .  -  .  coverage=0.4;Genewise_score=221.81;position=Intergenic;size=460;Type=FRAG;Pater=AT5G03960.1;Species=A.thaliana;Stop_codon=3;Frameshift=4
CP002688.1  bed_to_gff3  pseudogene  911089   911463   .  -  .  coverage=0.99;Genewise_score=90.11;position=Intergenic;size=379;Type=SE;Pater=AT5G38440.1;Species=A.thaliana;Stop_codon=0;Frameshift=0
CP002688.1  bed_to_gff3  pseudogene  706016   706120   .  -  .  coverage=0.29;Genewise_score=67.74;position=Intergenic;size=99;Type=FRAG;Pater=AT5G03030.1;Species=A.thaliana;Stop_codon=0;Frameshift=0
CP002688.1  bed_to_gff3  pseudogene  614386   614598   .  -  .  coverage=0.31;Genewise_score=89.77;position=UTR;size=207;Type=FRAG;Pater=AT3G53490.1;Species=A.thaliana;Stop_codon=0;Frameshift=0
CP0026