In [1]:
import ijson # $pip install ijson
import pandas as pd
import numpy as np

from nltk import word_tokenize, pos_tag, wordnet
from nltk.stem.snowball import SnowballStemmer

import zenodotus as z
import noun_phrase_extractor as npe

# Importando dados de origem

In [2]:
# Load the raw sentences into a DataFrame.
datapath = "raw.json"


def _read_topic_field(path, field):
    """Return every ``topics[*].<field>`` value found in the JSON file at ``path``.

    The file is streamed with ijson and opened in binary mode: ijson parses
    bytes, and a text-mode handle both triggers its deprecation warning and
    makes decoding depend on the platform's default encoding.
    """
    with open(path, 'rb') as f:
        # In an ijson prefix, "item" stands for each element of a JSON array.
        return list(ijson.items(f, 'topics.item.' + field))


# All subjects come first, followed by all questions (one row per value).
sentences = _read_topic_field(datapath, 'subject')
sentences.extend(_read_topic_field(datapath, 'question'))

source = pd.DataFrame({'sentence': sentences})

In [3]:
# Preview the first rows of the freshly loaded sentences.
source.head()

Unnamed: 0,sentence
0,Bad rap actors good rap actors list?
1,list of hispanic actors?
2,LiST OF ACTORS AND ACTRESSES?
3,Who are a and b list actors?
4,List of famous black actors?


In [4]:
# Inspect a single raw sentence by position (row 201, first column).
source.iloc[201,0]

'5 feet or under. Good Actors, like Danny Divido.'

In [5]:
# Is there missing data? Show the rows whose sentence is the empty string.
source[source['sentence'].eq('')]

Unnamed: 0,sentence
6,
62,
126,
133,
142,
147,
148,
166,
167,
168,


In [6]:
# Drop the rows whose sentence is the empty string.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not operate on a slice of the original frame
# (which raises SettingWithCopyWarning).
source = source[source['sentence'] != ''].copy()

In [7]:
# After the removal: no empty-string sentence should be left.
source[source['sentence'].eq('')]

Unnamed: 0,sentence


In [8]:
# Tokenization: run word_tokenize on every sentence.
# Mapping over the single input column avoids materialising a row Series per
# record, which is what DataFrame.apply(axis=1) does, and always yields a Series.
source['tokenized'] = source['sentence'].map(word_tokenize)

In [9]:
# Preview the frame with the new 'tokenized' column.
source.head()

Unnamed: 0,sentence,tokenized
0,Bad rap actors good rap actors list?,"[Bad, rap, actors, good, rap, actors, list, ?]"
1,list of hispanic actors?,"[list, of, hispanic, actors, ?]"
2,LiST OF ACTORS AND ACTRESSES?,"[LiST, OF, ACTORS, AND, ACTRESSES, ?]"
3,Who are a and b list actors?,"[Who, are, a, and, b, list, actors, ?]"
4,List of famous black actors?,"[List, of, famous, black, actors, ?]"


In [10]:
# POS tagging: run pos_tag on every token list.
# Mapping over the single input column avoids materialising a row Series per
# record, which is what DataFrame.apply(axis=1) does, and always yields a Series.
source['tagged'] = source['tokenized'].map(pos_tag)

In [11]:
# Preview the frame with the new 'tagged' column.
source.head()

Unnamed: 0,sentence,tokenized,tagged
0,Bad rap actors good rap actors list?,"[Bad, rap, actors, good, rap, actors, list, ?]","[(Bad, NNP), (rap, NN), (actors, NNS), (good, ..."
1,list of hispanic actors?,"[list, of, hispanic, actors, ?]","[(list, NN), (of, IN), (hispanic, JJ), (actors..."
2,LiST OF ACTORS AND ACTRESSES?,"[LiST, OF, ACTORS, AND, ACTRESSES, ?]","[(LiST, NN), (OF, IN), (ACTORS, NNP), (AND, NN..."
3,Who are a and b list actors?,"[Who, are, a, and, b, list, actors, ?]","[(Who, WP), (are, VBP), (a, DT), (and, CC), (b..."
4,List of famous black actors?,"[List, of, famous, black, actors, ?]","[(List, NN), (of, IN), (famous, JJ), (black, J..."


In [12]:
# Noun-phrase extraction: run npe.extract_cleanned on every raw sentence.
# Mapping over the single input column avoids materialising a row Series per
# record, which is what DataFrame.apply(axis=1) does, and always yields a Series.
source['noun-phrases'] = source['sentence'].map(npe.extract_cleanned)

In [13]:
# Preview the frame with the new 'noun-phrases' column.
source.head()

Unnamed: 0,sentence,tokenized,tagged,noun-phrases
0,Bad rap actors good rap actors list?,"[Bad, rap, actors, good, rap, actors, list, ?]","[(Bad, NNP), (rap, NN), (actors, NNS), (good, ...","[[bad, rap, actor, good, rap, actor, list]]"
1,list of hispanic actors?,"[list, of, hispanic, actors, ?]","[(list, NN), (of, IN), (hispanic, JJ), (actors...","[[hispanic, actor]]"
2,LiST OF ACTORS AND ACTRESSES?,"[LiST, OF, ACTORS, AND, ACTRESSES, ?]","[(LiST, NN), (OF, IN), (ACTORS, NNP), (AND, NN...","[[list, act, ors and actresses?]]"
3,Who are a and b list actors?,"[Who, are, a, and, b, list, actors, ?]","[(Who, WP), (are, VBP), (a, DT), (and, CC), (b...","[[list, actor]]"
4,List of famous black actors?,"[List, of, famous, black, actors, ?]","[(List, NN), (of, IN), (famous, JJ), (black, J...","[[famous, black, actor]]"


In [14]:
# Flatten every noun phrase of every sentence into one list of words
# (duplicates kept, original order preserved).
words = [
    word
    for phrases in source['noun-phrases']
    for phrase in phrases
    for word in phrase
]
nellkb = pd.DataFrame({'word': words})

In [15]:
# Keep a single row per distinct row value (first occurrence wins);
# the surviving rows keep their original index labels.
nellkb = nellkb.drop_duplicates()

In [18]:
# Ask zenodotus, word by word, whether the word is a category.
# Mapping over the single input column avoids materialising a row Series per
# record, which is what DataFrame.apply(axis=1) does, and always yields a Series.
nellkb['is_category'] = nellkb['word'].map(z.isCategory)

In [41]:
# List the words flagged 'yes' as categories, then check that 'actor' is among them.
categories = nellkb.loc[nellkb['is_category'] == 'yes', 'word'].tolist()
'actor' in categories

True

In [None]:
# For every sentence, print it and then, for each noun-phrase word, whether
# that word occurs among the sentence's lower-cased tagged tokens.
for irow, row in source.iterrows():
    print(row['sentence'])
    # Temporary per-sentence frame of (word, tag) pairs used for the lookup.
    tagged_data_frame = pd.DataFrame(row['tagged'], columns=['word', 'tag'])

    # TODO: lemmatize the tokens.
    # Lower-case the tokens.
    tagged_data_frame['word'] = tagged_data_frame['word'].str.lower()

    # `x in Series` tests the index labels, not the values, so the membership
    # test must be run against the values themselves.
    sentence_words = set(tagged_data_frame['word'])

    for phrase in row['noun-phrases']:
        for word in phrase:
            print(word in sentence_words)

In [None]:
# Build the (word, category) dataset: inside every noun phrase, each word that
# is a NELL category is paired with every other (different) word of the phrase.
#
# Planned columns, not produced yet: sentence, word, word_tag, word_is_category,
# nell_category, nell_category_tag, distance_from_category,
# number_of_occurrences, is_category_candidate.
pair_records = []
for irow, row in source.iterrows():
    for iphrase, phrase in enumerate(row['noun-phrases']):
        print(irow, iphrase, phrase)
        # Words of this phrase for which the lookup returns a category name.
        category_words = [
            word for word in phrase
            if z.getCategory(word)['category_name'] is not None
        ]
        for category in category_words:
            for word in phrase:
                if word != category:
                    pair_records.append({'word': word, 'category': category})

# Create the frame once from the collected records instead of concatenating
# inside the loop (which re-copies all previous rows on every iteration).
# The rows get one unique RangeIndex; explicit columns keep the frame usable
# even when no pair was found.
dataset = pd.DataFrame(pair_records, columns=['word', 'category'])
dataset.head()

In [None]:
# Distinct values of the 'word' column, in order of first appearance.
dataset['word'].unique()

In [None]:
# Display the distinct rows; the result is not assigned, so `dataset` itself is unchanged.
dataset.drop_duplicates()