In [1]:
import ijson # $pip install ijson
import pandas as pd
import numpy as np

from nltk import word_tokenize, pos_tag, wordnet
from nltk.stem.wordnet import WordNetLemmatizer

import zenodotus as z
import noun_phrase_extractor as npe

# Importando dados de origem

In [2]:
# Read the source sentences into a DataFrame.
datapath = "raw.json"

sentences = []

# ijson parses bytes, so the file is opened in binary mode: a text-mode handle
# makes ijson emit a DeprecationWarning and ties decoding to the locale encoding.
# The "item" path component addresses each element of the `topics` list.
# The file is reopened for each prefix because every pass streams it to the end.
# Subjects are collected first, then questions.
for prefix in ('topics.item.subject', 'topics.item.question'):
    with open(datapath, 'rb') as f:
        sentences.extend(ijson.items(f, prefix))

source = pd.DataFrame({'sentence': sentences})

In [3]:
# Preview the first five loaded sentences.
source.head(n=5)

Unnamed: 0,sentence
0,Bad rap actors good rap actors list?
1,list of hispanic actors?
2,LiST OF ACTORS AND ACTRESSES?
3,Who are a and b list actors?
4,List of famous black actors?


In [4]:
# Is there missing data? Count the rows whose sentence is the empty string.
is_empty = source['sentence'] == ''
source[is_empty].count()

sentence    21
dtype: int64

In [5]:
# Drop the rows whose sentence is the empty string.
# The explicit .copy() makes `source` an independent frame rather than a
# boolean-filtered slice, so adding columns to it in later cells does not
# raise SettingWithCopyWarning.
source = source[source['sentence'] != ''].copy()
# After the removal: the count of empty sentences should now be 0.
source[source['sentence'] == ''].count()

sentence    0
dtype: int64

In [10]:
# Tokenization: lower-case each sentence, then split it into tokens.
source['tokenized'] = source['sentence'].apply(
    lambda sentence: word_tokenize(sentence.lower())
)

In [11]:
# Preview the first five rows after tokenization.
source.head(n=5)

Unnamed: 0,sentence,tokenized,tagged,noun-phrases
0,Bad rap actors good rap actors list?,"[bad, rap, actors, good, rap, actors, list, ?]","[(Bad, NNP), (rap, NN), (actor, NNS), (good, J...","[[bad, rap, actor, good, rap, actor, list]]"
1,list of hispanic actors?,"[list, of, hispanic, actors, ?]","[(list, NN), (of, IN), (hispanic, JJ), (actor,...","[[list], [hispanic, actor]]"
2,LiST OF ACTORS AND ACTRESSES?,"[list, of, actors, and, actresses, ?]","[(LiST, NN), (OF, IN), (ACTORS, NNP), (AND, NN...","[[list, act, ors and actresses?]]"
3,Who are a and b list actors?,"[who, are, a, and, b, list, actors, ?]","[(Who, WP), (are, VBP), (a, DT), (and, CC), (b...","[[list, actor]]"
4,List of famous black actors?,"[list, of, famous, black, actors, ?]","[(List, NN), (of, IN), (famous, JJ), (black, J...","[[list], [famous, black, actor]]"


In [12]:
# Part-of-speech tag every tokenized sentence.
source['tagged'] = source['tokenized'].apply(pos_tag)

# Lemmatize only the token and keep the tag assigned before lemmatization,
# e.g. (actors, NNS) -> (actor, NNS).
# The column is rebuilt in one assignment instead of writing element by element
# through source['tagged'][i] = ...: that chained assignment raises
# SettingWithCopyWarning and writes nothing back under pandas copy-on-write.
Lemmatizer = WordNetLemmatizer()
source['tagged'] = source['tagged'].apply(
    lambda tagged: [(Lemmatizer.lemmatize(token), tag) for token, tag in tagged]
)

# Noun phrases are extracted from the raw sentence, not from the tokens.
source['noun-phrases'] = source['sentence'].apply(npe.extract)

In [13]:
# Preview the first five rows with the tagged and noun-phrase columns.
source.head(n=5)

Unnamed: 0,sentence,tokenized,tagged,noun-phrases
0,Bad rap actors good rap actors list?,"[bad, rap, actors, good, rap, actors, list, ?]","[(bad, JJ), (rap, NN), (actor, NNS), (good, JJ...","[[bad, rap, actor, good, rap, actor, list]]"
1,list of hispanic actors?,"[list, of, hispanic, actors, ?]","[(list, NN), (of, IN), (hispanic, JJ), (actor,...","[[list], [hispanic, actor]]"
2,LiST OF ACTORS AND ACTRESSES?,"[list, of, actors, and, actresses, ?]","[(list, NN), (of, IN), (actor, NNS), (and, CC)...","[[list, act, ors and actresses?]]"
3,Who are a and b list actors?,"[who, are, a, and, b, list, actors, ?]","[(who, WP), (are, VBP), (a, DT), (and, CC), (b...","[[list, actor]]"
4,List of famous black actors?,"[list, of, famous, black, actors, ?]","[(list, NN), (of, IN), (famous, JJ), (black, J...","[[list], [famous, black, actor]]"


In [14]:
# Flatten every noun phrase of every sentence into a single list of words.
words = [
    word
    for phrases in source['noun-phrases']
    for phrase in phrases
    for word in phrase
]

# One row per distinct word (repetitions removed, first occurrence kept).
nellkb = pd.DataFrame({'word': words}).drop_duplicates()

In [15]:
# Ask zenodotus, word by word, whether the word is a category.
nellkb['is_category'] = nellkb['word'].apply(z.isCategory)

In [None]:
# Words flagged as categories, as a plain list.
is_yes = nellkb['is_category'] == 'yes'
categories = nellkb.loc[is_yes, 'word'].tolist()
# Distribution of the is_category flag.
nellkb['is_category'].value_counts()

In [None]:
# For every sentence, print whether each noun-phrase word also appears among
# the sentence's (lemmatized, lower-cased) tagged tokens.
for irow, row in source.iterrows():
    print(row['sentence'])
    # Temporary frame of this sentence's (word, tag) pairs, used for the lookup.
    tagged_data_frame = pd.DataFrame(row['tagged'], columns=['word', 'tag'])

    # Lower-case the tokens.
    tagged_data_frame['word'] = tagged_data_frame['word'].str.lower()

    # `x in series` tests the Series index labels, not its values, so
    # membership is checked against a set of the actual token strings.
    sentence_words = set(tagged_data_frame['word'])

    for iphrase, phrase in enumerate(row['noun-phrases']):
        for iword, word in enumerate(phrase):
            print(word in sentence_words)

In [None]:
# Build the (word, category) pairs: inside each noun phrase, every word that is
# a NELL category is paired with every other, different word of the same phrase.
#
# Planned richer schema, not implemented yet: sentence, word, word_tag,
# word_is_category, nell_category, nell_category_tag, distance_from_category,
# number_of_occurrences, is_category_candidate.
#
# Pairs are accumulated in a plain list and the DataFrame is built once at the
# end; calling pd.concat inside the loop re-copies the whole frame on every
# phrase. As a result `dataset` has a fresh 0..n-1 index instead of repeated
# per-phrase labels, and always has the 'word' and 'category' columns, even
# when no pair is found.
pair_records = []
for irow, row in source.iterrows():
    for iphrase, phrase in enumerate(row['noun-phrases']):
        print(irow, iphrase, phrase)
        # Look each word up in the NELL base; it counts as a category when the
        # lookup returns a category name.
        is_category = [
            'yes' if z.getCategory(word)['category_name'] is not None else 'no'
            for word in phrase
        ]
        df = pd.DataFrame({'word': phrase, 'is_category': is_category})
        # Named `category_words` so the `categories` list built in an earlier
        # cell is not overwritten.
        category_words = df.loc[df['is_category'] == 'yes', 'word']
        for category in category_words:
            for word in df.loc[df['word'] != category, 'word']:
                pair_records.append((word, category))

dataset = pd.DataFrame(pair_records, columns=['word', 'category'])
dataset.head()

In [None]:
# Distinct words that were paired with at least one category.
dataset.loc[:, 'word'].unique()

In [None]:
# Display the distinct (word, category) pairs; `dataset` itself is not modified.
dataset.drop_duplicates(keep='first')