# Freeling

Fent ús de Freeling analitzarem els textos.

In [1]:
import json
import os
from os import listdir
from os.path import join
import re
import requests

from tqdm.notebook import tqdm

# Directories whose entries are listed and analysed further down in this notebook.
tweetsPath = '../data/tweets'
speechesPath = '../data/speeches'


def freelingAnalizer(text, logs=False, timeout=None):
    """Send *text* to the Freeling HTTP service and return its output lines.

    Args:
        text: Text to analyse; it is sent UTF-8 encoded as the request body.
        logs: When true, print the input text and the cleaned response.
        timeout: Optional timeout in seconds handed to ``requests.post``.
            ``None`` (the default) waits indefinitely, as the function
            always did.

    Returns:
        A list with one string per non-empty line of the response body.
        An empty response yields an empty list.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    if logs:
        print(text)
    headers = {'Content-Type': 'text/plain'}

    response = requests.post(
        'http://my-freeling-api:8080',
        headers=headers,
        data=text.encode('utf-8'),
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of parsing an error page as tokens.
    response.raise_for_status()

    content = response.content.decode('utf-8')
    # Split on '\n' only (not str.splitlines) so that unusual Unicode line
    # separators inside a token cannot break a line apart, and drop every
    # blank line so callers never receive empty entries.
    parts = [line for line in content.split('\n') if line]

    if logs:
        print('\n'.join(parts))

    return parts


In [2]:
class Morfo:
    """One analysed token: word form, lemma, part-of-speech tag and probability.

    Every ``is*`` predicate looks only at the first character of ``pos``.
    Values are stored exactly as received (no type conversion).
    """

    def __init__(self, word, lema, pos, prob):
        # Plain instance attributes on purpose: the instance ``__dict__`` is
        # what ends up being serialised when no ``toJSON`` method exists.
        self.word = word
        self.lema = lema
        self.pos = pos
        self.prob = prob

    def _hasCategory(self, letter):
        """Return True when the tag starts with *letter* (False for an empty tag)."""
        return self.pos.startswith(letter)

    def isAdjective(self):
        """Return True for tags starting with 'A'."""
        return self._hasCategory('A')

    def isConjunction(self):
        """Return True for tags starting with 'C'."""
        return self._hasCategory('C')

    def isDeterminer(self):
        """Return True for tags starting with 'D'."""
        return self._hasCategory('D')

    def isNoun(self):
        """Return True for tags starting with 'N'."""
        return self._hasCategory('N')

    def isPronoun(self):
        """Return True for tags starting with 'P'."""
        return self._hasCategory('P')

    def isAdverb(self):
        """Return True for tags starting with 'R'."""
        return self._hasCategory('R')

    def isAdposition(self):
        """Return True for tags starting with 'S'."""
        return self._hasCategory('S')

    def isVerb(self):
        """Return True for tags starting with 'V'."""
        return self._hasCategory('V')

    def isNumber(self):
        """Return True for tags starting with 'Z'.

        'Z' is the number category of the FreeLing tagset; the previous
        check against 'N' made this method a duplicate of ``isNoun``.
        """
        return self._hasCategory('Z')

    def isDate(self):
        """Return True for tags starting with 'W'.

        'W' is the date category of the FreeLing tagset; the previous
        check against 'D' made this method a duplicate of ``isDeterminer``.
        """
        return self._hasCategory('W')

    def isInterjection(self):
        """Return True for tags starting with 'I'."""
        return self._hasCategory('I')

    def isPunctuation(self):
        """Return True for tags starting with 'F'."""
        return self._hasCategory('F')

    def isOther(self):
        """Return True for tags starting with 'Z'.

        NOTE(review): this is the same letter ``isNumber`` tests. It is left
        unchanged because ``isNonSignificant`` depends on it.
        """
        return self._hasCategory('Z')

    def isNonSignificant(self):
        """Return True for conjunctions, determiners, pronouns, adpositions and 'other'."""
        return (
            self.isConjunction()
            or self.isDeterminer()
            or self.isPronoun()
            or self.isAdposition()
            or self.isOther()
        )

    def __str__(self):
        return f'<{self.word}, {self.lema}, {self.pos}, {self.prob}>'
        
def getMorfo(freelingResponse):
    """Parse one Freeling output line into a ``Morfo``.

    The line is split on single spaces and must produce exactly four
    fields (word, lemma, tag, probability); any other field count
    returns ``None``.
    """
    fields = freelingResponse.split(' ')
    if len(fields) == 4:
        word, lema, pos, prob = fields
        return Morfo(word, lema, pos, prob)
    return None

In [3]:
# Smoke test: analyse one sample sentence and show the raw response lines.
freelingResponse = freelingAnalizer('Todos los niños nacen artistas.')
print(freelingResponse)

# Parse every response line and print the resulting objects one per line.
morfos = list(map(getMorfo, freelingResponse))
for m in morfos:
    print(m)

['Todos todo DI0MP0 0.70665', 'los el DA0MP0 0.992728', 'niños niño NCMP000 0.998721', 'nacen nacer VMIP3P0 1', 'artistas artista NCCP000 0.992424', '. . Fp 1']
<Todos, todo, DI0MP0, 0.70665>
<los, el, DA0MP0, 0.992728>
<niños, niño, NCMP000, 0.998721>
<nacen, nacer, VMIP3P0, 1>
<artistas, artista, NCCP000, 0.992424>
<., ., Fp, 1>


In [4]:
def freelingGetMorfos(text):
    """Analyse *text* with Freeling and parse each returned line with ``getMorfo``.

    Returns a list with one entry per response line; an entry is ``None``
    when ``getMorfo`` could not parse that line.
    """
    return [getMorfo(line) for line in freelingAnalizer(text)]

# Same sample sentence as before, this time through the one-step helper.
morfos = freelingGetMorfos('Todos los niños nacen artistas.')
for m in morfos:
    print(m)

<Todos, todo, DI0MP0, 0.70665>
<los, el, DA0MP0, 0.992728>
<niños, niño, NCMP000, 0.998721>
<nacen, nacer, VMIP3P0, 1>
<artistas, artista, NCCP000, 0.992424>
<., ., Fp, 1>


In [5]:
# Build the full path of every entry found in each corpus directory.
tweetsPaths = [join(tweetsPath, name) for name in listdir(tweetsPath)]
speechesPaths = [join(speechesPath, name) for name in listdir(speechesPath)]

# Single work list: tweets first, then speeches.
files = [*tweetsPaths, *speechesPaths]
# Preview of both ends of the list.
# NOTE(review): files[-6:-1] stops before the final element, so the very
# last path is not shown - confirm whether files[-5:] was intended.
[*files[:5], '...', *files[-6:-1]]

['../data/tweets/1201675455042072578.json',
 '../data/tweets/1201710024931840000.json',
 '../data/tweets/1201717491141136390.json',
 '../data/tweets/1201718618737786882.json',
 '../data/tweets/1201724641959665664.json',
 '...',
 '../data/speeches/17212-DSCD-14-PL-002.json',
 '../data/speeches/17213-DSCD-14-PL-002.json',
 '../data/speeches/17214-DSCD-14-PL-002.json',
 '../data/speeches/17215-DSCD-14-PL-002.json',
 '../data/speeches/17216-DSCD-14-PL-002.json']

In [6]:
# Total number of file paths collected in `files`.
len(files)

61743

In [7]:
def dumper(obj):
    """``default=`` hook for ``json.dump``: turn *obj* into something serialisable.

    Uses ``obj.toJSON()`` when the object provides that method, otherwise
    returns the instance ``__dict__``.

    Errors raised inside ``toJSON`` propagate instead of being silently
    replaced by the ``__dict__`` fallback.

    Raises:
        TypeError: If *obj* has neither ``toJSON`` nor ``__dict__``. This is
            the exception the ``json`` module expects from a ``default``
            hook that cannot handle a value.
    """
    to_json = getattr(obj, 'toJSON', None)
    if callable(to_json):
        return to_json()
    try:
        # vars() returns obj.__dict__ and raises TypeError when there is none.
        return vars(obj)
    except TypeError:
        raise TypeError(
            f'Object of type {type(obj).__name__} is not JSON serializable'
        ) from None

def analize(path):
    """Add the Freeling analysis of a JSON document's text to that document.

    The file at *path* is read as JSON. If it already has a ``'freeling'``
    key nothing happens. Otherwise the value under ``'text'`` is analysed
    (a final '.' is appended when it does not end in '.', '!' or '?'), the
    result is stored under ``'freeling'`` and the file is rewritten.

    Args:
        path: Path of the JSON file to update in place.

    Returns:
        None.

    Raises:
        KeyError: If the document has no ``'text'`` key.
    """
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if 'freeling' in data:
        return

    text = data['text']
    # Raw string: the previous pattern relied on invalid escape sequences.
    if not re.search(r'[.!?]$', text):
        text += '.'

    data['freeling'] = freelingGetMorfos(text)

    # Serialise BEFORE opening the file for writing: opening with 'w'
    # truncates it, so a serialisation error would otherwise destroy the
    # original document.
    serialized = json.dumps(data, default=dumper, indent=4)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(serialized)

        
# Run the analysis over every collected file, with a progress bar.
# analize() returns early for files that already contain a 'freeling' key.
for file in tqdm(files):
    analize(file)

  0%|          | 0/61743 [00:00<?, ?it/s]

In [13]:
# Collect the files whose stored analysis contains no usable entry.
fails = []
for file in tqdm(files):
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # A missing key means the file was never analysed; report it as well
    # instead of aborting the whole scan with a KeyError.
    fl = data.get('freeling') or []
    # A failed analysis shows up as an empty list or as a list holding only
    # nulls (response lines getMorfo could not parse), so testing for an
    # empty list alone is not enough.
    if all(entry is None for entry in fl):
        fails.append(file)

for fail in fails:
    print(fail)

  0%|          | 0/61743 [00:00<?, ?it/s]