In [1]:
import ijson # $pip install ijson
import pandas as pd
import numpy as np

from nltk import word_tokenize, pos_tag, wordnet
from nltk.stem.wordnet import WordNetLemmatizer

import zenodotus as z
import noun_phrase_extractor as npe

# Importando dados de origem

In [2]:
#Leitura do DataFrame
datapath = "raw.json"

sentences = []

#Subjects
with open(datapath, 'r') as f:
    objects = ijson.items(f, 'topics.item.subject')
    sentences = list(objects)

#Questions
with open(datapath, 'r') as f:
    objects2 = ijson.items(f, 'topics.item.question')
    sentences.extend(list(objects2))

    
source = pd.DataFrame({'sentence':sentences})

In [3]:
#existem dados faltantes?
source[source['sentence']==''].count()

sentence    21
dtype: int64

In [4]:
#removendo...
source = source[source['sentence']!='']
#depois da remoção
source[source['sentence']==''].count()

sentence    0
dtype: int64

In [5]:
#tokenização
source['tokenized'] = source.apply(lambda row: word_tokenize(row['sentence'].lower()), axis=1)

#Faz o tagging
source['tagged'] = source.apply(lambda row: pos_tag(row['tokenized']),axis=1)

#Lematiza somente a o token, mantendo a tag original do token antes da lematização (actors, NNS) -> (actor, NNS)
Lemmatizer = WordNetLemmatizer()
for i in source['tagged'].index:
    lista_aux = []
    for tupla in source['tagged'][i]:
        lista_aux.append( (Lemmatizer.lemmatize(tupla[0]), tupla[1]) )
    source['tagged'][i] = lista_aux

#Usa o noun-phrase extractor
source['noun-phrases'] = source.apply(lambda row: npe.extract(row['sentence']), axis=1)

In [6]:
source.head()

Unnamed: 0,sentence,tokenized,tagged,noun-phrases
0,Bad rap actors good rap actors list?,"[bad, rap, actors, good, rap, actors, list, ?]","[(bad, JJ), (rap, NN), (actor, NNS), (good, JJ...","[[bad, rap, actor, good, rap, actor, list]]"
1,list of hispanic actors?,"[list, of, hispanic, actors, ?]","[(list, NN), (of, IN), (hispanic, JJ), (actor,...","[[list], [hispanic, actor]]"
2,LiST OF ACTORS AND ACTRESSES?,"[list, of, actors, and, actresses, ?]","[(list, NN), (of, IN), (actor, NNS), (and, CC)...","[[list, act, ors and actresses?]]"
3,Who are a and b list actors?,"[who, are, a, and, b, list, actors, ?]","[(who, WP), (are, VBP), (a, DT), (and, CC), (b...","[[list, actor]]"
4,List of famous black actors?,"[list, of, famous, black, actors, ?]","[(list, NN), (of, IN), (famous, JJ), (black, J...","[[list], [famous, black, actor]]"


In [7]:
#Monta um dataset com cada palavra de 'noun-phrases' em uma linha
words = []
for i,row in source.iterrows():
    for phrase in row['noun-phrases']:
        for word in phrase:
            words.append(word)
nellkb = pd.DataFrame({'word':words})

#Removendo repetições
nellkb = nellkb.drop_duplicates()

In [8]:
#Cria uma lista com cada palavra do dataset que é uma categoria da NELL

#Tenta abrir o arquivo com as categorias
try:
    with open('categories.txt', 'r') as filehandle:
        categories = []
        for line in filehandle:
            # Remove '\n', que é o último caracter de cada string
            category = line[:-1]

            # Adiciona o item à lista
            categories.append(category)
        
#Se ele não existe, cria um
except:
    #Procura cada palavra na knowledge base da NELL
    nellkb['is_category'] = nellkb.apply(lambda row: z.isCategory(row['word']),axis=1)
    categories = nellkb[nellkb['is_category'] == 'yes']['word'].tolist()
    #Salva as palavras que são categoria em um arquivo
    with open('categories.txt', 'w') as filehandle:  
        for category in categories:
            filehandle.write('%s\n' %category)

In [35]:
#Cria uma nova coluna 'is_category' no dataset nellkb com 'yes' e 'no'
aux_list = []
for i in nellkb.index:
    if nellkb['word'][i] in categories:
        aux_list.append('yes')
    else:
        aux_list.append('no')

nellkb['is_category'] = aux_list
nellkb.head()

Unnamed: 0,word,is_category
0,bad,no
1,rap,no
2,actor,yes
3,good,no
6,list,no


In [9]:
#
for irow, row in source.iterrows():
    print(source['sentence'][irow])
    #Gerando um dataframe provisório para busca nos dados
    tagged_data_frame = pd.DataFrame(source['tagged'][irow],columns=['word','tag'])
    
    #transformando em minúsculas
    tagged_data_frame['word'] = tagged_data_frame.apply(lambda row: row['word'].lower(),axis=1)
    
    #print(tagged_data_frame.head())
       
    for iphrase, phrase in enumerate(row['noun-phrases']):
        for iword, word in enumerate(phrase):
            print(word in tagged_data_frame['word'])

Bad rap actors good rap actors list?
False
False
False
False
False
False
False
list of hispanic actors?
False
False
False
LiST OF ACTORS AND ACTRESSES?
False
False
False
Who are a and b list actors?
False
False
List of famous black actors?
False
False
False
False
How many nerdy/geeky actors can you list?
False
False
False
False
Can anyone give me a list of good actors?
False
False
False
False
Give me a list of attractive iranian actors.?
False
False
False
False
Can anyone give me a list of actors or directors who have a myspace?
False
False
False
False
False
A-list actors or character actors?
False
False
False
False
List of comic actors?
False
False
False
Can some give me a list of Black Martial artist actors ?
False
False
False
False
False
List some hot young actors?
False
False
False
Cute, young male actor for a character list?
False
False
False
False
False
False
Where can i find a list to identify tv ad actors?
False
False
False
False
MALE ACTORS HOT LIST?
False
False
False
Scottish

In [52]:
dataset = pd.DataFrame()

for word in nellkb['word']:
    for sentence in source['sentence']:
        if word in sentence:

bad
bad
rap
rap
rap
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
actor
ac

In [56]:
nellkb

Unnamed: 0,word,is_category
0,bad,no
1,rap,no
2,actor,yes
3,good,no
6,list,no
8,hispanic,no
11,act,no
12,ors and actresses?,no
16,famous,no
17,black,no


In [None]:
#data = {'sentence':[],'word':[],'word_tag':[],'word_is_category':[]
#                          ,'nell_category':[],'nell_category_tag':[],'distance_from_category':[]
#                         ,'number_of_occurrences':[],'is_category_candidate':[]}
dataset = pd.DataFrame()
for irow, row in source.iterrows():
    for iphrase, phrase in enumerate(row['noun-phrases']):
        print(irow,iphrase,phrase)
        is_category = []
        #Buscando categorias na base da NELL e Gerando o Dataframe
        #for word in phrase:
        #    cat = z.getCategory(word)
        #    #Verifica se a palavra é uma categoria da NELL
        #    if(cat['category_name'] is not None): 
        #        is_category.append('yes')
        #    else:
        #       is_category.append('no')
        df = pd.DataFrame({'word':phrase, 'is_category':is_category})
        words_list = []
        categories = []
        for category in df[df['is_category'] == 'yes']['word'].items():
            for word in df[df['word'] != category[1]]['word'].items():
                words_list.append(word[1])
                categories.append(category[1])
        dfr = pd.DataFrame({'word':words_list, 'category':categories})
        dataset = pd.concat([dataset,dfr])
        print(dataset.head())

In [None]:
dataset

In [None]:
dataset.drop_duplicates()