In [14]:
import asyncio
import json
import os
import pickle
import re
from functools import partial
from itertools import chain

import numpy as np
import pandas as pd
from aiohttp import ClientSession
from aiohttp import ClientConnectorError
# Imports the Google Cloud client library
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

# Instantiates one module-level Natural Language API client; every request
# issued by get_entity_data (defined further down) goes through this object.
# NOTE(review): presumably picks up ambient Google Cloud credentials from the
# environment -- confirm before running on a fresh machine.
client = language.LanguageServiceClient()

In [2]:
# Load the four pickled article tables and stack them row-wise into a single
# frame. ignore_index=True discards the per-file indexes and renumbers the
# rows from 0, which is what the positional .iloc lookups later rely on.
# NOTE(review): pickle files can execute code on load -- only read trusted files.
dat = pd.concat(
    [
        pd.read_pickle(pkl_path)
        for pkl_path in (
            "../data/elmondo_es.pkl",
            "../data/elmondo_es_eco.pkl",
            "../data/elmondo_es_sp.pkl",
            "../data/cnn_es.pkl",
        )
    ],
    axis=0,
    ignore_index=True,
)

In [3]:
# Preview the first rows of the combined frame to sanity-check the load.
dat.head()

Unnamed: 0,headline,keyfacts,content,tags,time
0,Una jueza del Tribunal Supremo suspende parcia...,[ Decidió atender a los grupos conservadores y...,"El martes, antes de presidir la fiesta de fin ...",[],2014-01-02
1,'La revolución cubana sigue sin compromisos co...,[ 'Jamás hemos cedido ni cederemos ante agresi...,El presidente Raúl Castro reveló que se está i...,[],2014-01-02
2,La NSA trabaja en un ordenador cuántico capaz ...,[ La información proviene de los documentos de...,La Agencia de Seguridad Nacional (NSA) trabaja...,[],2014-01-03
3,Último adiós a la ex Miss Venezuela Mónica Spe...,[ Mónica Spear y su marido fueron asesinados e...,Esta semana Venezuela ha recibido una noticia ...,[],2014-01-10
4,Michoacán pone en jaque al Gobierno de Peña Nieto,[ El Gobierno envía más policías y militares y...,La situación en el Estado mexicano de Michoacá...,[],2014-01-14


In [4]:
# (rows, columns) of the combined frame.
dat.shape

(37702, 5)

In [5]:
def has_proper_mentions(mentions):
    """Tell whether any mention in ``mentions`` has ``type == 1``.

    Args:
        mentions: iterable of objects exposing a ``type`` attribute.

    Returns:
        The NumPy boolean produced by ``np.any``; falsy for an empty iterable.
    """
    proper_flags = []
    for item in mentions:
        # 1 is the value the rest of this notebook treats as a proper mention.
        proper_flags.append(item.type == 1)
    return np.any(proper_flags)

def process_response(response):
    """Summarise the entities of an analysis response as plain dictionaries.

    Only entities that have at least one proper mention (``mention.type == 1``,
    as decided by ``has_proper_mentions``) are kept.

    Args:
        response: object exposing ``entities``; every entity must provide
            ``name``, an integer ``type`` and ``mentions``, where each mention
            has ``type`` and ``text.content``.

    Returns:
        A list with one dict per retained entity, holding:
            'entity_name': the entity's ``name``;
            'entity_type': the label looked up in the module-level
                ``entity_type`` table ('UNKNOWN' when the numeric type has no
                entry in that table);
            'mentions': the distinct proper-mention texts, sorted so the
                output is reproducible from run to run.
    """
    records = []
    for ent in response.entities:
        if not has_proper_mentions(ent.mentions):
            continue

        # The label table only has entries 0..7. The API enum defines further
        # values beyond that range, which used to raise IndexError here and
        # abort the whole batch; fall back to the first ('UNKNOWN') label.
        if 0 <= ent.type < len(entity_type):
            label = entity_type[ent.type]
        else:
            label = entity_type[0]

        proper_texts = {mention.text.content
                        for mention in ent.mentions if mention.type == 1}
        records.append({'entity_name': ent.name,
                        'entity_type': label,
                        # sorted(): set iteration order is arbitrary for strings
                        'mentions': sorted(proper_texts)})
    return records

# Label table indexed by an entity's integer ``type`` (see process_response).
entity_type = (
    'UNKNOWN',        # 0
    'PERSON',         # 1
    'LOCATION',       # 2
    'ORGANIZATION',   # 3
    'EVENT',          # 4
    'WORK_OF_ART',    # 5
    'CONSUMER_GOOD',  # 6
    'OTHER',          # 7
)

# Label table for mention types. Not referenced by the visible code, which
# compares ``mention.type`` against the literal 1 instead.
# NOTE(review): presumably index 1 ('PROPER') is that same literal -- confirm.
mention_type = (
    'TYPE_UNKNOWN',   # 0
    'PROPER',         # 1
    'COMMON',         # 2
)

async def get_entity_data(idx, session):
    """Run entity analysis on row ``idx`` of the global ``dat`` frame.

    The analysed text is the row's ``keyfacts`` items joined by newlines,
    a blank line, then the row's ``content``.

    Args:
        idx: positional row index into ``dat`` (used with ``.iloc``).
        session: accepted for interface compatibility; not used in the body.

    Returns:
        ``(idx, entities)`` where ``entities`` is the ``process_response``
        output, or ``(idx, None)`` after three failed attempts.

    NOTE(review): ``client.analyze_entities`` is called synchronously (it is
    not awaited), so tasks scheduled with this coroutine run one after another.
    NOTE(review): only aiohttp's ``ClientConnectorError`` triggers a retry;
    whether the Google client can raise it is unclear -- any other exception
    propagates to the caller. Confirm the intended exception type.
    """
    for _attempt in range(3):
        try:
            # Assemble the text to analyse: key facts first, then the body.
            key_fact_lines = "\n".join(dat['keyfacts'].iloc[idx])
            text = key_fact_lines + '\n\n' + dat['content'].iloc[idx]

            document = types.Document(
                content=text,
                type=enums.Document.Type.PLAIN_TEXT,
            )

            # Detect the entities mentioned in the text.
            response = client.analyze_entities(document=document)
            return idx, process_response(response)
        except ClientConnectorError:
            # Connection-level failure: use up one attempt and try again.
            continue

    # Reached only when every attempt ended in a connector error.
    print("Connector error occurred! Connection Failed!")
    return idx, None

async def gather_results(curr, step):
    """Schedule entity extraction for one batch of rows and await all of it.

    Args:
        curr: first row index of the batch.
        step: number of consecutive rows in the batch.

    Returns:
        The list produced by ``asyncio.gather``: one ``get_entity_data``
        result per index in ``range(curr, curr + step)``, in that order.
    """
    async with ClientSession() as session:
        # One task per row; the session is handed through to each coroutine.
        pending = [
            asyncio.ensure_future(get_entity_data(row_idx, session))
            for row_idx in range(curr, curr + step)
        ]
        # Wait for the whole batch before the session is closed.
        return await asyncio.gather(*pending)


def process_df(future, curr, step):
    """Save one finished batch of entity results as a JSON file.

    Used as a done-callback on the future that wraps ``gather_results``.

    Args:
        future: completed future whose ``result()`` is an iterable of
            ``(idx, entities)`` pairs.
        curr: first row index of the batch; used in the output file name.
        step: number of rows in the batch; ``curr + step`` is the second
            number in the output file name.

    Raises:
        RuntimeError: if the batch contains no results at all.

    Side effects:
        Writes ``../data/mondo_entities/entities_<curr>_<curr+step>.json`` and
        prints a confirmation line. JSON object keys are always strings, so
        the integer row indexes are stored as text.
    """
    cache = dict(future.result())
    if not cache:
        raise RuntimeError("Empty response!")

    out_path = "../data/mondo_entities/entities_{0}_{1}.json".format(curr, curr + step)
    # The context manager flushes and closes the handle even if json.dump fails;
    # the previous bare open() left that to garbage collection.
    with open(out_path, "w") as out_file:
        json.dump(cache, out_file)
    print("got it! ({0}, {1})".format(curr, curr + step))

In [7]:
# Row range [start, end) to process and the maximum number of rows per batch.
start = 37700
end = 37702
step = 100
abandoned = []  # not used by the visible code; kept for compatibility

loop = asyncio.get_event_loop()
for curr in range(start, end, step):
    # Clamp the batch to the rows that are actually left. This must be
    # measured from `curr`, not from `start`; otherwise the last batch of a
    # multi-batch run would reach past `end` and index rows that do not exist.
    inc = min(step, end - curr)
    print("loading data from {0} to {1}".format(curr, curr + inc))
    future = asyncio.ensure_future(gather_results(curr, inc))
    # process_df writes the batch to disk once the future completes.
    # NOTE(review): exceptions raised inside a done-callback are reported by
    # the event loop rather than re-raised here -- confirm that is intended.
    future.add_done_callback(partial(process_df, curr=curr, step=inc))
    loop.run_until_complete(future)

loading data from 37700 to 37702
got it! (37700, 37702)


In [8]:
# Write the combined frame to ../data/full_es.pkl.
# NOTE(review): pandas documents DataFrame.to_pickle as returning None, so
# `full_es` is presumably bound to None rather than to the frame -- confirm
# whether anything depends on this name.
full_es = dat.to_pickle("../data/full_es.pkl")

In [9]:
# Directory that process_df writes its per-batch entity JSON files into.
ENTITY_ROOT = "../data/mondo_entities/"
# Bare entry names (no directory prefix) of everything currently in that directory.
fn = os.listdir(ENTITY_ROOT)

In [10]:
# Merge every per-batch JSON file into one mapping of row index -> entities.
# Keys are strings here because JSON object keys are always strings.
entities = dict()
# sorted(): os.listdir order is arbitrary, so fix the merge order to make the
# result reproducible.
for file in sorted(fn):
    # Skip anything that is not a batch file (the batch files are written
    # with a ".json" suffix); stray entries would make json.load fail.
    if not file.endswith(".json"):
        continue
    with open(os.path.join(ENTITY_ROOT, file), "r") as f:
        entities.update(json.load(f))

In [15]:
# Persist the merged entity mapping. The with-block guarantees the file is
# flushed and closed; the previous bare open() never closed its handle.
with open("../data/es_processed_entities.pkl", "wb") as pkl_out:
    pickle.dump(entities, pkl_out)