In [1]:
from Modules.indexer import load_index
from Modules.cleaner import Cleaner
import pandas as pd
import re
from collections import defaultdict
from nltk.tokenize import TweetTokenizer
import numpy as np

In [None]:
# Path of the JSON index file, relative to the notebook's working directory.
INDEX_PATH = '../Data/index/index.json'
# Loaded once; parse_query reads this module-level `index` directly.
# NOTE(review): the query functions treat it as term -> {document -> list}; confirm against load_index.
index = load_index(INDEX_PATH)

In [None]:
# CSV with the parsed songs; get_song reads the 'Lyric' column of the table it is given.
MUSICS_PARSED_PATH = '../Data/music_info/musics_parsed.csv'
musics_parsed = pd.read_csv(MUSICS_PARSED_PATH)

In [None]:
# Shared Cleaner instance; its clean_and_tokenize method is applied to lyrics and to query terms.
cleaner = Cleaner()

In [None]:
# Tokenizer used by parse_query to split the raw query string
# (strip_handles and reduce_len are both switched off).
tokenizer = TweetTokenizer(strip_handles=False,reduce_len=False)

In [None]:
def easy_string(string):
    """Surround every parenthesis in ``string`` with a space on each side.

    Returns a new string; all characters other than ``(`` and ``)`` are kept
    exactly as they are.
    """
    parenthesis = re.compile(r'(\(|\))')
    padded = parenthesis.sub(r' \1 ', string)
    return padded

In [None]:
def get_dict_and_index_query(x,index):
    """Return the postings for ``x`` together with the set of their keys.

    ``x`` is either a term (a ``str``) that is looked up in ``index``, or an
    object that already is a postings mapping, which is handed back unchanged.

    Returns a ``(postings, keys)`` tuple. A term that is missing from
    ``index`` yields ``({}, set())``.
    """
    if type(x) is not str:
        # Not a term: x itself is the postings and its keys form the set.
        return x, set(x)

    if x not in index:
        return {}, set()

    postings = index[x]
    return postings, set(postings)

In [None]:
def get_song(music_index,music_parsed):
    """Return the cleaned, tokenized lyric of one song.

    ``music_index`` is the row label looked up with ``.loc`` in
    ``music_parsed``; the ``'Lyric'`` value of that row is passed to the
    module-level ``cleaner``.
    """
    song_row = music_parsed.loc[music_index]
    lyric = song_row['Lyric']
    return cleaner.clean_and_tokenize(lyric)
    

In [None]:
def AND_query(index,query):
    """Keep the documents matched by every element of ``query``.

    Each element is resolved with ``get_dict_and_index_query`` (a term that
    is looked up in ``index``, or an already-built postings mapping).

    Returns a ``defaultdict(list)`` mapping each document present for all
    elements to the concatenation, in query order, of the lists stored for it.
    Raises ``IndexError`` when ``query`` is empty.
    """
    resolved = [get_dict_and_index_query(term, index) for term in query]

    # Start from the documents of the first element and narrow them down.
    common_docs = resolved[0][1]
    for _, term_docs in resolved[1:]:
        common_docs.intersection_update(term_docs)

    merged = defaultdict(list)
    for doc in common_docs:
        for postings, _ in resolved:
            if doc in postings:
                merged[doc].extend(postings[doc])

    return merged


In [None]:
def OR_query(index,query):
    """Keep the documents matched by at least one element of ``query``.

    Each element is resolved with ``get_dict_and_index_query`` (a term that
    is looked up in ``index``, or an already-built postings mapping).

    Returns a ``defaultdict(list)`` mapping each matched document to the
    concatenation, in query order, of the lists stored for it by the elements
    that contain it. Raises ``IndexError`` when ``query`` is empty.
    """
    resolved = [get_dict_and_index_query(term, index) for term in query]

    # Grow the document set of the first element with those of the others.
    any_docs = resolved[0][1]
    for _, term_docs in resolved[1:]:
        any_docs.update(term_docs)

    merged = defaultdict(list)
    for doc in any_docs:
        for postings, _ in resolved:
            if doc in postings:
                merged[doc].extend(postings[doc])

    return merged

In [None]:
def NOT_query(index,term):
    """Return every document of ``index`` that ``term`` does not match.

    ``term`` is resolved with ``get_dict_and_index_query``. The universe of
    documents is the union of the keys of all entries of ``index``.

    Returns a ``defaultdict(list)`` whose keys are the remaining documents,
    each mapped to its own empty list.
    """
    _, excluded_docs = get_dict_and_index_query(term, index)

    # Collect every document that appears anywhere in the index.
    all_docs = set()
    for postings in index.values():
        all_docs.update(set(postings))

    remaining_docs = all_docs.difference(excluded_docs)

    return defaultdict(list, ((doc, []) for doc in remaining_docs))

In [None]:
def NEAR_query(index,query):
    """Keep the documents in which the two operands occur side by side.

    ``query[0]`` and ``query[1]`` are resolved with
    ``get_dict_and_index_query``. A document qualifies when it is present for
    both operands and some value stored for the first operand differs by
    exactly one from a value stored for the second.

    Returns a ``defaultdict(list)`` mapping each qualifying document to the
    first operand's list followed by the second operand's list.
    """
    first_postings, first_docs = get_dict_and_index_query(query[0], index)
    second_postings, second_docs = get_dict_and_index_query(query[1], index)

    # Only documents containing both operands can qualify.
    first_docs.intersection_update(second_docs)

    adjacent = defaultdict(list)
    for doc in first_docs:
        second_positions = second_postings[doc]
        first_positions = first_postings[doc]
        is_adjacent = any(
            (position + 1 in second_positions) or (position - 1 in second_positions)
            for position in first_positions
        )
        if is_adjacent:
            adjacent[doc] = first_positions + second_positions

    return adjacent

In [None]:
def _resolve_operand(index, token):
    """Turn one raw query token into an operand for the *_query functions.

    Tokens that are not strings (postings produced by an earlier step) are
    returned unchanged. Strings go through ``cleaner.clean_and_tokenize``:
    a single resulting term is returned as that term (so it is looked up in
    the index), several terms are AND-ed into postings, and ``None`` is
    returned when cleaning leaves nothing.
    """
    if type(token) != str:
        return token
    terms = cleaner.clean_and_tokenize(token)
    if len(terms) > 1:
        return AND_query(index, terms)
    if len(terms) == 1:
        return terms[0]
    return None


def _reduce_binary_operator(index, query_tokens, operator, query_function):
    """Replace, in place, every ``left <operator> right`` triple by its result."""
    while operator in query_tokens:
        position = query_tokens.index(operator)
        if position == 0 or position + 1 >= len(query_tokens):
            # Without this check index -1 would silently pick the last token.
            raise ValueError(
                "{} operator needs an operand on both sides".format(operator)
            )
        # The right-hand operand is resolved and passed first, then the left one.
        operands = [
            _resolve_operand(index, query_tokens[position + 1]),
            _resolve_operand(index, query_tokens[position - 1]),
        ]
        # An operand that cleaned away to nothing matches no document.
        operands = [{} if operand is None else operand for operand in operands]
        query_tokens[position - 1:position + 2] = [query_function(index, operands)]


def solve_query(index,query):
    """Evaluate a flat (parenthesis-free) list of query tokens.

    ``query`` is a list whose items are raw token strings (terms or the
    operators ``NOT``, ``NEAR``, ``OR``) and/or postings produced by earlier
    evaluations. Operators are applied in the order NOT, NEAR, OR; whatever
    is left is combined with ``AND_query``. ``query`` itself is not modified.

    Returns the postings of the whole expression: the result of a ``*_query``
    function, the single already-evaluated item when nothing else is left, or
    an empty ``defaultdict(list)`` when no term survives cleaning.

    Raises ``ValueError`` when an operator lacks an operand.
    """
    query_tokens = query.copy()

    while 'NOT' in query_tokens:
        position = query_tokens.index('NOT')
        if position + 1 >= len(query_tokens):
            raise ValueError("NOT operator needs an operand after it")
        operand = _resolve_operand(index, query_tokens[position + 1])
        if operand is None:
            # Nothing left to negate: drop the operator together with its operand.
            del query_tokens[position:position + 2]
            continue
        query_tokens[position:position + 2] = [NOT_query(index, operand)]

    _reduce_binary_operator(index, query_tokens, 'NEAR', NEAR_query)
    _reduce_binary_operator(index, query_tokens, 'OR', OR_query)

    # Implicit AND between everything that is left.
    remaining = []
    for token in query_tokens:
        if type(token) == str:
            remaining += cleaner.clean_and_tokenize(token)
        else:
            remaining.append(token)

    if not remaining:
        return defaultdict(list)
    if len(remaining) == 1 and type(remaining[0]) != str:
        # A single already-evaluated item is the answer as it stands.
        return remaining[0]
    return AND_query(index, remaining)

In [None]:
def parse_query(query):
    """Evaluate a boolean query string against the module-level ``index``.

    The string is split with the module-level ``tokenizer`` after its
    parentheses have been padded by ``easy_string``. Every parenthesized
    group is evaluated with ``solve_query`` as soon as it closes (so inner
    groups are evaluated before the groups that contain them) and replaced by
    its result; the top-level tokens are evaluated last. An empty group
    ``()`` contributes nothing.

    Returns the postings of the whole query; an empty query yields an empty
    ``defaultdict(list)``.

    Raises ``ValueError`` when the parentheses are unbalanced.
    """
    tokens = tokenizer.tokenize(easy_string(query))

    # One list of pending tokens per open parenthesis; groups[0] is the top level.
    groups = [[]]
    for token in tokens:
        if token == '(':
            groups.append([])
        elif token == ')':
            if len(groups) == 1:
                raise ValueError("unbalanced parentheses: ')' without a matching '('")
            finished = groups.pop()
            if finished:
                groups[-1].append(solve_query(index, finished))
        else:
            groups[-1].append(token)

    if len(groups) > 1:
        raise ValueError("unbalanced parentheses: '(' without a matching ')'")
    if not groups[0]:
        return defaultdict(list)

    result = solve_query(index, groups[0])
    if type(result) == str:
        # A lone term can come back as a plain string; always hand back postings.
        terms = cleaner.clean_and_tokenize(result)
        result = AND_query(index, terms) if terms else defaultdict(list)
    return result

In [None]:
# Sample query strings mixing the NOT / OR / NEAR operators, parentheses and plain terms.
teste = "NOT 30$40 OR NOT 70 oi NEAR linda"
teste_2 = "(Moça NEAR bonita) OR R7"

In [None]:
# Evaluate the parenthesized sample query against the loaded index.
result = parse_query(teste_2)

In [None]:
# Show the cleaned tokens of the lyric stored under row label 139608.
get_song(139608,musics_parsed)

In [None]:
def tf_idf(doc_number,word_info,N):
    """Compute the TF-IDF weight of one term in one document.

    Parameters
    ----------
    doc_number
        Key of the document inside ``word_info``.
    word_info
        Mapping from document key to the list stored for the term in that
        document; the length of that list is used as the term frequency.
    N
        Total number of documents in the collection.

    Returns
    -------
    ``log2(1 + f) * log2(N / n)`` where ``f`` is the term frequency in the
    document (0 when the document is not in ``word_info``) and ``n`` is the
    number of documents in ``word_info``. When ``word_info`` is empty the
    weight is 0.
    """
    documents_with_term = len(word_info)
    if documents_with_term == 0:
        # The term occurs nowhere: no weight, and N / 0 is avoided.
        return np.float64(0.0)

    term_frequency = len(word_info.get(doc_number, ()))
    return np.log2(1 + term_frequency) * np.log2(N / documents_with_term)

In [None]:
# TF-IDF of the term 'oi' in document '2465', using the number of rows of musics_parsed as the collection size.
tf_idf('2465',index['oi'],len(musics_parsed))