# Replace non-UTF-8 characters in the Araport11 GFF and ensure the "attributes" column is uncorrupted

## Description
The GFF as downloaded from Araport contains nasty (non-UTF-8) symbols, and ";" characters inside attribute values.



Here, we remove them (or replace them where possible) to prevent downstream software from freaking out.

## Library import
We import all the required Python libraries

In [1]:
import codecs, gzip, shutil

# IO
from pathlib import Path

# Test results with BedTools
from pybedtools import BedTool
from pybedtools.helpers import cleanup as bedtools_cleanup

## Path definitions

In [2]:
# Directory against which every input and output path in this notebook is
# resolved (the current working directory).
base_dir = Path("./")

In [3]:
# Input: the gzipped GFF3 file to clean.
tair_gff = base_dir  / "Araport11_GFF3_genes_transposons.current.gff.gz"
# Output: the cleaned GFF, written uncompressed.
new_tair_gff = base_dir  / "Araport11_GFF3_genes_transposons.current_utf8.gff"

## Replacement dictionary
Manually created based on symbols present in Araport11 GFF. 
Not infallible.

In [4]:
# Byte-level substitutions: each key (a byte sequence found in the raw GFF)
# is replaced by its ASCII value. `clean_line` applies the entries one after
# another in insertion order, so ORDER MATTERS: a multi-byte sequence must be
# listed before any single byte it contains (e.g. b"\xce\xb1" before b"\xb1"),
# otherwise the shorter key would break up the longer one first.
utf8_replacements = {
    b"\xe2\x8f\x91": b"Delta9",        # Δ9
    b" \xf0\x95\x91\x85": b"Delta9",   # Δ9 (key includes the leading space)

    b"\xf0\x95\x92\xb6?": b"-2'",      # −2 and a prime?
    b"\xe2\x93\xb5": b"Beta1",         # β1
    b"\xe2\x88\x92": b"-",             # −
    b"\xe2\x80\xb2": b"'",             # ′
    b"\xe2\x86\x92": b" to ",          # →
    b"\xf0\x98\xb0": b" to ",          # ➝

    b"\xae": b"fi",                    # presumably a mangled "fi" ligature - TODO confirm
    b"\xce\xb1": b"Alpha",             # α
    b"\xce\xb2": b"Beta",              # β
    b"\xce\xb3": b"gamma",             # γ
    b"\xce\xb4": b"delta",             # δ
    b"\xce\xb5": b"epsilon",           # ε
    b"\xce\xb6": b"zeta",              # ζ
    b"\xcf\x89": b"omega",             # ω

    b"\xa1\xaf": b"'",                 # ¡¯
    b"\x96": b"-",                     # –
    b"\x91": b"'",                     # ‘
    b"\xb0": b" degrees ",             # °
    b"\xdf": b"Beta",                  # ß
    b"\xb1": b"+/-",                   # ±
    b"\x92": b"'",                     # ’
    b"\xd7": b"x",                     # ×
    b"\xb5": b"mu",                    # µ
    b"\xad": b"",                      # soft hyphen (dropped)
    b"\xa0": b" ",                     # non-breaking space
    b"\xc4": b"",                      # Ä (dropped)
    b"\xa5": b"3",                     # NOTE(review): meaning unclear from the table alone - confirm
    b"\xa6": b"4",                     # NOTE(review): meaning unclear from the table alone - confirm
    b"\xe9": b"e",                     # é
}

# Read and rewrite


## Cleaning functions


In [7]:
def try_utf8(data):
    """Decode *data* as UTF-8, returning the text or ``None`` if it is invalid.

    Args:
        data: the bytes to decode.

    Returns:
        The decoded ``str``, or ``None`` when decoding raises
        ``UnicodeDecodeError``.
    """
    try:
        decoded = data.decode('utf-8')
    except UnicodeDecodeError:
        decoded = None
    return decoded

def clean_line(line):
    """Return *line* with every ``utf8_replacements`` substitution applied.

    The substitutions run one after another in the dictionary's iteration
    order, each working on the result of the previous one.

    Args:
        line: a raw line as bytes.

    Returns:
        The line as bytes with all matching keys replaced by their values.
    """
    for bad_bytes, substitute in utf8_replacements.items():
        line = line.replace(bad_bytes, substitute)
    return line


# Attribute tags that may legitimately follow a ";" in the attributes column.
# The first group is the original, file-specific list; the second group adds
# the tags reserved by the GFF3 specification that were missing (without
# b'Parent', the ";" in "ID=x;Parent=y" was rewritten to ",").
# NOTE(review): any further file-specific tag must be added here, otherwise
# the ";" in front of it is treated as free text and replaced.
araport_attribute_names = [
    b'ID', b'Name', b'full_name', b'computational_description', b'locus',
    b'locus_type', b'symbol', b'Note', b'Dbxref',
    b'Parent', b'Alias', b'Target', b'Gap', b'Derives_from',
    b'Ontology_term', b'Is_circular',
]


def clean_attributes(line):
    """Replace ";" characters that sit inside attribute values with ",".

    The text after the last tab of *line* is taken as the attributes column.
    A ";" there is kept only when what follows it (ignoring surrounding
    whitespace) starts with ``<tag>=`` for a tag in
    ``araport_attribute_names``. Any other ";" is assumed to be part of a
    free-text value and becomes ",". A ";" with nothing after it (a trailing
    terminator) is left alone.

    Args:
        line: one feature line as bytes, tab-separated.

    Returns:
        The line as bytes. Everything before the last tab is untouched; a line
        containing no tab at all is returned unchanged.
    """
    rest_of_line, tab, attributes = line.rpartition(b"\t")
    if not tab:
        # Not a tab-separated record: there is no attributes column to fix.
        return line

    # Require "tag=" rather than a bare prefix, so free text such as
    # "; Notes on ..." or "; ID card" is not mistaken for a new attribute.
    valid_starts = tuple(name + b"=" for name in araport_attribute_names)
    semicolon, comma = ord(b";"), ord(b",")

    cleaned = bytearray(attributes)
    for i, c in enumerate(attributes):
        if c != semicolon:
            continue
        next_text = attributes[i + 1:].strip()
        if next_text and not next_text.startswith(valid_starts):
            # Same-length substitution, so later indices stay valid.
            cleaned[i] = comma

    return rest_of_line + tab + bytes(cleaned)

In [8]:
# Characters (as latin-1 code points) seen on lines that are STILL not valid
# UTF-8 after the dictionary substitutions, i.e. bytes that have no entry in
# utf8_replacements yet.
not_utf8 = set()
with gzip.open(tair_gff, 'rb') as f_in, open(new_tair_gff, 'wb') as f_out:
    for line in f_in:
        if (line.strip() == b"") or line.startswith(b"#"):
            # Blank and comment lines are copied without substitutions;
            # undecodable bytes are simply dropped.
            f_out.write(line.decode('utf-8', 'ignore').encode("utf-8"))
            continue

        cleaned_line = clean_line(line)
        if try_utf8(cleaned_line) is None:
            # Compare the cleaned line (not the raw one) with what survives a
            # lenient UTF-8 decode, so bytes that already have a substitution
            # are not reported again.
            survivors = cleaned_line.decode('utf-8', 'ignore').encode("utf-8")
            not_utf8.update(
                set(cleaned_line.decode('latin-1')) - set(survivors.decode('latin-1'))
            )

        cleaned_line = clean_attributes(cleaned_line)

        # Final safety net: whatever is still undecodable is dropped.
        f_out.write(cleaned_line.decode('utf-8', 'ignore').encode("utf-8"))

In [9]:
# Anything printed here still needs a manual entry in utf8_replacements.
# Each output line shows the raw byte, a blank slot for the replacement, and
# the character itself.
for leftover in not_utf8:
    print(leftover.encode("latin1"), ":  ,\t\t #", leftover)

In [10]:
# Write a gzipped copy of the cleaned GFF next to it (same name plus ".gz").
with open(new_tair_gff, 'rb') as plain, gzip.open(f'{new_tair_gff}.gz', 'wb') as packed:
    shutil.copyfileobj(plain, packed)

# Test with bedtools

_Not needed to run._

A ";" inside an attribute value makes filtering on attributes impossible. See if we caught them all...

In [11]:
# Call the pybedtools cleanup helper before building new BedTool objects.
# NOTE(review): presumably this removes temporary files left by earlier
# pybedtools calls in this session - confirm against the pybedtools docs.
bedtools_cleanup()

In [12]:
# Wrap the cleaned, uncompressed GFF in a BedTool for the attribute check.
araport_gff = BedTool(new_tair_gff)
# araport_gff = araport_gff.sort()
# len(araport_gff)

In [13]:
# Feature IDs that `name_filter` keeps.
list_of_ids = ['AT1G28450', 'AT1G28460']

In [14]:
# Prints the first "bad" feature and stops (the exception is re-raised).
# NOTE(review): the original comment said to use the printed feature to update
# `note_replacement_values`, a name that is not defined in the visible code -
# presumably the cleaning rules are meant; confirm.

def name_filter(feature):
    """Return True when the feature's "ID" attribute is in ``list_of_ids``.

    If anything raises while the attribute is read or compared, the feature
    is printed and the exception then propagates unchanged.
    """
    try:
        return feature.attrs["ID"] in list_of_ids
    except:
        # Whatever went wrong, show the offending feature, then re-raise.
        print(feature)
        raise


# Run the filter over every feature and save the result to a new BedTool.
araport_gff.filter(name_filter).saveas()

<BedTool(/tmp/pybedtools.7ftve8z5.tmp)>

# Make gene only subset

This can also be done with bedtools.

In [15]:
# Output: the gene-only subset of the cleaned GFF, written uncompressed.
gene_tair_gff = base_dir  / "Araport11_GFF3_genes_transposons.current_utf8_genes.gff"

In [16]:
# Copy every comment line plus every feature whose third column (the feature
# type) is exactly "gene" into the gene-only file, counting the genes written.
#
# A single pass over the file is used on purpose: a nested loop over the same
# file handle would consume the first feature line without ever testing it.
gene_count = 0
with open(new_tair_gff, 'rb') as f_in, open(gene_tair_gff, 'wb') as f_out:
    for line in f_in:
        if line.startswith(b"#"):
            f_out.write(line)
            continue

        fields = line.split(b"\t")
        # Blank or malformed lines have fewer than three columns; skip them
        # instead of failing with an IndexError.
        if len(fields) > 2 and fields[2] == b"gene":
            f_out.write(line)
            gene_count += 1
print(gene_count)

33243


In [17]:
# Write a gzipped copy of the gene-only GFF next to it (same name plus ".gz").
with open(gene_tair_gff, 'rb') as plain, gzip.open(f'{gene_tair_gff}.gz', 'wb') as packed:
    shutil.copyfileobj(plain, packed)

## Delete noncompressed files

In [18]:
# Delete the two uncompressed GFF files, in the same order as before.
for uncompressed in (new_tair_gff, gene_tair_gff):
    uncompressed.unlink()