In [1]:
from os import walk
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np

In [2]:
# Directory that is scanned for the .kaf annotation files (must end with a
# separator because file names are appended to it by plain concatenation).
# Raw string: sequences such as "\e" or "\A" are not valid escapes and raise a
# SyntaxWarning on recent Python versions when written in a normal literal.
path = r'S:\ebao\ABSA\Data\OpeNER\opinion_annotations_en\kaf\hotel/'

# walk() yields the top-level directory first, so take just that entry instead
# of walking the whole tree; it yields nothing at all when the directory
# cannot be listed, which is reported explicitly here.
_top_level = next(walk(path), None)
if _top_level is None:
    raise FileNotFoundError('Cannot list annotation directory: ' + path)
# Sorted because the order returned by the filesystem is unspecified and the
# row order of the final table follows the file order.
files = sorted(_top_level[2])

In [3]:
def get_root(file):
    """Parse the XML document ``file`` and return its top-level element.

    ``file`` is passed straight to ``ET.parse``.
    """
    tree = ET.parse(file)
    return tree.getroot()

In [4]:
def get_text(root):
    """Return the text of every word form in document order.

    Looks at the direct ``text`` children of ``root`` and collects the
    ``.text`` of each of their direct ``wf`` children.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root element of a parsed annotation file.

    Returns
    -------
    list
        One entry per ``wf`` element; empty when there is no ``text`` layer.
    """
    # findall() replaces Element.getchildren(), which was removed in Python 3.9.
    return [wf.text for layer in root.findall('text') for wf in layer.findall('wf')]

In [5]:
def get_term(root):
    """Return a dict mapping a word id to the id of the term that spans it.

    For every ``term`` under a direct ``terms`` child of ``root`` the key is
    the ``id`` of the first ``target`` in the term's first ``span`` and the
    value is the term's ``tid`` attribute. Only the first target of each term
    is recorded.

    Raises
    ------
    IndexError
        If a term has no ``span`` or the span has no ``target``.
    KeyError
        If the ``id`` or ``tid`` attribute is missing.
    """
    # findall() replaces Element.getchildren(), which was removed in Python 3.9.
    terms = [term for layer in root.findall('terms') for term in layer.findall('term')]
    tw_map = {
        term.findall('span')[0].findall('target')[0].attrib['id']: term.attrib['tid']
        for term in terms
    }
    return tw_map

In [6]:
def get_opinions(root):
    """Return one ``(target_ids, expression_ids, polarity)`` tuple per opinion.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root element of a parsed annotation file.

    Returns
    -------
    list of tuple
        ``target_ids`` lists the ``id`` of every ``target`` in the first span
        of the opinion's first ``opinion_target``; it is ``[None]`` when the
        opinion has no target, no span, or an empty span. ``expression_ids``
        is the same list for the first ``opinion_expression`` and ``polarity``
        is that element's ``polarity`` attribute.

    Raises
    ------
    IndexError
        If an opinion has no ``opinion_expression`` or the expression has no
        ``span``.
    KeyError
        If a ``target`` lacks ``id`` or the expression lacks ``polarity``.
    """
    triples = []
    # findall()/find() replace Element.getchildren(), removed in Python 3.9.
    for layer in root.findall('opinions'):
        for opinion in layer.findall('opinion'):
            # Compare against None explicitly: an Element without children is falsy.
            target = opinion.find('opinion_target')
            span = target.find('span') if target is not None else None
            t_id = [t.attrib['id'] for t in span.findall('target')] if span is not None else []
            if not t_id:
                # Always hand back a one-element placeholder so callers can
                # safely inspect the first entry.
                t_id = [None]

            expression = opinion.findall('opinion_expression')[0]
            e_id = [e.attrib['id'] for e in expression.findall('span')[0].findall('target')]
            polarity = expression.attrib['polarity']
            triples.append((t_id, e_id, polarity))
    return triples

In [7]:
def id2words(opinions, root):
    """Replace the ids in opinion triples by the words they point to.

    Parameters
    ----------
    opinions : iterable
        Triples ``(target_ids, expression_ids, polarity)`` as produced by
        ``get_opinions``.
    root : xml.etree.ElementTree.Element
        Root element of the same document; its word forms are read with
        ``get_text``.

    Returns
    -------
    numpy.ndarray
        One row ``[target, expression, polarity]`` per opinion, words joined
        by single spaces. ``target`` is ``None`` when the opinion has no
        target. An empty input gives an empty array of shape ``(0, 3)`` so the
        result can always be framed with three columns.
    """
    text = get_text(root)

    def lookup(ident):
        # The number after the last 't' is used as a 1-based position in text.
        # NOTE(review): this assumes id 'tN' corresponds to the N-th word form
        # of the document -- confirm against the annotation format.
        return text[int(ident.split('t')[-1]) - 1]

    w_opins = []
    for o in opinions:
        target = [lookup(x) if x else None for x in o[0]]
        exp = [lookup(x) for x in o[1]]
        polarity = o[2]
        # An empty id list is treated like a missing target instead of
        # failing on target[0].
        if target and target[0]:
            w_opins.append([' '.join(target), ' '.join(exp), polarity])
        else:
            w_opins.append([None, ' '.join(exp), polarity])

    if not w_opins:
        # np.array([]) would be one-dimensional and cannot take three columns.
        return np.empty((0, 3), dtype=object)
    return np.array(w_opins)

In [8]:
def extract_data(file,colnames=['TARGET','OTE','POLARITY']):
    """Load one annotation file into a DataFrame of opinions.

    Prints ``file``, parses it, and returns one row per opinion with the
    columns named in ``colnames`` (target words, expression words, polarity).
    A file without any opinion gives an empty frame with those columns
    instead of failing in the DataFrame constructor.

    Parameters
    ----------
    file : path or file object
        Passed to ``get_root``.
    colnames : list of str
        Names for the three output columns; it is never modified here.

    Returns
    -------
    pandas.DataFrame
    """
    print(file)
    root = get_root(file)
    opinions_id = get_opinions(root)
    if not opinions_id:
        # Same guard as iter_data: there is nothing to convert.
        return pd.DataFrame(columns=colnames)

    opinions = id2words(opinions_id, root)
    return pd.DataFrame(opinions, columns=colnames)

In [9]:
def iter_data(file,colnames=['TARGET','OTE','POLARITY']):
    """Yield the opinions of one annotation file as a DataFrame, if it has any.

    This is a generator producing at most one item: a frame with the columns
    ``colnames`` when the file contains opinions, and nothing at all when it
    does not. That lets callers flatten many files in a single comprehension.
    """
    parsed = get_root(file)
    triples = get_opinions(parsed)
    if not triples:
        # No opinions in this file: finish without yielding.
        return
    yield pd.DataFrame(id2words(triples, parsed), columns=colnames)

In [10]:
# Stack the per-file opinion frames into one table with a fresh 0..n-1 index.
# Files without opinions add nothing because iter_data yields no frame for them.
df = pd.concat(
    [frame for name in files for frame in iter_data(path + name)],
    axis=0,
    ignore_index=True,
)

In [20]:
# Persist the combined table without the index column.
# Raw string: "\e", "\A", "\D" and "\O" are not valid escape sequences and
# raise a SyntaxWarning on recent Python versions in a normal literal.
df.to_csv(r'S:\ebao\ABSA\Data\OpeNER\OpeNER_TOP.csv', index=False, encoding='utf-8')

In [11]:
# (rows, columns) of the combined opinion table.
df.shape

(4150, 3)

In [12]:
# Shape after discarding every row that has a missing value in any column;
# df itself is left unchanged.
df.dropna().shape

(3850, 3)

In [13]:
# Opinions that have an expression but no explicit target.
df.loc[df['TARGET'].isnull()]

Unnamed: 0,TARGET,OTE,POLARITY
33,,Will definitely be retuning,StrongPositive
41,,"walking distance to Sol , Gran Via etc.",Positive
63,,are satisfied,Positive
83,,with a shopping center on the other side of th...,Positive
84,,Close to the beach,Positive
85,,Very good,StrongPositive
101,,Practical,Positive
110,,can easily find connections to every tourist a...,Positive
129,,would love to go back,Positive
130,,One of the best holidays,StrongPositive


In [15]:
# Is any opinion expression missing?
df['OTE'].isnull().any()

False

In [16]:
# Is any polarity label missing?
df['POLARITY'].isnull().any()

False

----
# Debugging Section


In [114]:
# Inspect a single annotation file in isolation.
# Raw string: the backslashes in this Windows path are not valid escape
# sequences and raise a SyntaxWarning on recent Python versions otherwise.
extract_data(r'S:\ebao\ABSA\Data\OpeNER\opinion_annotations_en\kaf\hotel/english00002_0685261321182f93763efabe4099a840.kaf')

S:\ebao\ABSA\Data\OpeNER\opinion_annotations_en\kaf\hotel/english00002_0685261321182f93763efabe4099a840.kaf


Unnamed: 0,TARGET,OTE,POLARITY
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,True,False,False
5,False,False,False
6,False,False,False


In [93]:
# Annotation file used by the debugging cells below.
# Raw string: the backslashes in this Windows path are not valid escape
# sequences and raise a SyntaxWarning on recent Python versions otherwise.
f = r'S:\ebao\ABSA\Data\OpeNER\opinion_annotations_en\kaf\hotel/english00002_0685261321182f93763efabe4099a840.kaf'

In [94]:
# Parse the file named by f and keep its root element for the cells below.
root = get_root(f)

In [95]:
# Show the document as one string: its word forms joined by single spaces.
' '.join(get_text(root))

'Nicest hotel ever ! This was my favourite holiday place I have ever been to ! Will definitely be retuning , advise people to go here for a great holiday ! People are great ! Staff are great and the whole holiday package was fantastic !'

In [96]:
# Report whether the file contains at least one opinion.
print(bool(get_opinions(root)))

True


In [97]:
# Every <opinion> element under the root's <opinions> children.
# findall() replaces Element.getchildren(), which was removed in Python 3.9.
opinions = [e for c in root.findall('opinions') for e in c.findall('opinion')]
opinions

[<Element 'opinion' at 0x000000603EE4CB88>,
 <Element 'opinion' at 0x000000603EE4C598>,
 <Element 'opinion' at 0x000000603EE4C908>,
 <Element 'opinion' at 0x000000603EE4C5E8>,
 <Element 'opinion' at 0x000000603EE4C728>,
 <Element 'opinion' at 0x000000603EAF1868>,
 <Element 'opinion' at 0x000000603EAF16D8>]

In [102]:
# Read the target ids of the fifth opinion.
# findall() replaces Element.getchildren(), which was removed in Python 3.9.
# t_id is assigned on both paths so the following cell can always read it.
try:
    targets = opinions[4].findall('opinion_target')[0].findall('span')[0].findall('target')
    t_id = [t.attrib['id'] for t in targets] or [None]
except IndexError:
    # The opinion has no opinion_target (or the target has no span).
    t_id = [None]

In [104]:
# Prints True only when the first target id is truthy; nothing is printed
# when t_id is the [None] placeholder.
if t_id[0]:
    print(True)

In [95]:
# Attributes of the first <span> inside the second child of the first child
# of the root's last child. Elements are indexed directly because
# Element.getchildren() was removed in Python 3.9.
root[-1][0][1].findall('span')[0].attrib

{}

In [80]:
# Direct children of the root's last child that are tagged 'opinion_target'.
# Iterating the element replaces Element.getchildren(), removed in Python 3.9.
[o for o in root[-1] if o.tag == 'opinion_target']

[]

In [None]:
def get_opinions(root):
    """Return one ``(target_ids, expression_ids, polarity)`` tuple per opinion.

    NOTE(review): this cell redefines ``get_opinions`` and shadows the
    definition earlier in the notebook when it is run; it is kept
    behaviourally in line with that definition so a full run gives the same
    results either way.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root element of a parsed annotation file.

    Returns
    -------
    list of tuple
        ``target_ids`` lists the ``id`` of every ``target`` in the first span
        of the opinion's first ``opinion_target``; it is ``[None]`` when the
        opinion has no target, no span, or an empty span. ``expression_ids``
        is the same list for the first ``opinion_expression`` and ``polarity``
        is that element's ``polarity`` attribute.

    Raises
    ------
    IndexError
        If an opinion has no ``opinion_expression`` or the expression has no
        ``span``.
    KeyError
        If a ``target`` lacks ``id`` or the expression lacks ``polarity``.
    """
    triples = []
    # findall()/find() replace Element.getchildren(), removed in Python 3.9.
    for layer in root.findall('opinions'):
        for opinion in layer.findall('opinion'):
            # Compare against None explicitly: an Element without children is falsy.
            target = opinion.find('opinion_target')
            span = target.find('span') if target is not None else None
            t_id = [t.attrib['id'] for t in span.findall('target')] if span is not None else []
            if not t_id:
                # Opinions without a target are valid; use a placeholder
                # instead of raising IndexError.
                t_id = [None]

            expression = opinion.findall('opinion_expression')[0]
            e_id = [e.attrib['id'] for e in expression.findall('span')[0].findall('target')]
            polarity = expression.attrib['polarity']
            triples.append((t_id, e_id, polarity))
    return triples