# Programa com Funcoes

## Caminhos

In [1]:
import os

# Starting folder: the directory the notebook is run from.
path = os.getcwd()

# Sub-folders. os.path.join picks the separator of the current OS (the
# previous hard-coded '\\' only produced valid paths on Windows). The empty
# last component keeps a trailing separator, because the rest of the notebook
# builds file names with plain concatenation (e.g. pathfixo + 'file.csv').
pathin = os.path.join(path, 'Entrada', '')
pathfixo = os.path.join(path, 'Fixo', '')
pathout = os.path.join(path, 'Saida', '')
pathparcial = os.path.join(path, 'Parcial', '')
pathaux = os.path.join(path, 'Auxiliar', '')

## Pacotes

In [2]:
import dill
import pandas as pd
import numpy as np

import nltk
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer

from time import gmtime, strftime

import re

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV

import unidecode
from unicodedata import normalize

import pygtrie

## Funcoes "Fixas"

In [3]:
########################################################################################
# Accent removal
########################################################################################

def rem_acentos(x):
    """Return ``x`` with accents removed and every non-ASCII character dropped.

    The string is decomposed with Unicode NFKD normalization, so an accented
    letter becomes a base letter followed by combining marks; encoding to
    ASCII with ``'ignore'`` then discards the marks (and any other character
    that has no ASCII form).
    """
    decomposed = normalize('NFKD', x)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('ASCII')

########################################################################################
# Helpers used by the stemming step
########################################################################################

def Tokenize(sentence):
    """Lower-case ``sentence`` and split it into word tokens.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize.

    Returns
    -------
    The token sequence produced by ``nltk.word_tokenize``.
    """
    # The NLTK function is ``word_tokenize``; the previous ``word_tokenizer``
    # attribute does not exist and raised AttributeError on every call.
    # NOTE(review): word_tokenize uses its default language model here --
    # confirm that matches the language of the corpus.
    return nltk.word_tokenize(sentence.lower())

def Stemming(sentence):
    """Stem every token of an already-tokenized sentence.

    Parameters
    ----------
    sentence : iterable of str
        Tokens to stem; each one is lower-cased before stemming.

    Returns
    -------
    list
        One stem per input token, in the same order.
    """
    # A new RSLPStemmer instance is created on each call.
    rslp = RSLPStemmer()
    return [rslp.stem(token.lower()) for token in sentence]

########
# Stem #
########
def stem(col):
    """Stem every text in a pandas Series and return the stemmed sentences.

    Parameters
    ----------
    col : pandas.Series of str
        Texts to stem (the original comments state that accents were removed
        beforehand).

    Returns
    -------
    pandas.Series of str
        For each input text, its stems joined by single spaces.
    """
    # Replace a word-final "oes" with "ao" (e.g. "cartoes" -> "cartao"), so
    # that plural and singular forms line up once accents are gone.
    col = col.apply(lambda text: re.sub(r'oes\b', 'ao', text))

    # Split each text into tokens.
    col = col.apply(Tokenize)

    # Reduce each token to its stem.
    col = col.apply(Stemming)

    # Back to a sentence. The stems must be joined with a space: joining with
    # an empty string glued every stem of a text into one unreadable word.
    col = col.apply(lambda tokens: " ".join(str(token) for token in tokens))

    return col

########################################################################################
# Stop-word removal (exact whole-token match)
########################################################################################

def limpa_nomes(text, wordsremove):
    """Return ``text`` without the tokens listed in ``wordsremove``.

    Parameters
    ----------
    text : str
        Text to clean; it is split on any run of whitespace.
    wordsremove : iterable of str
        Tokens to drop. Only exact, whole-token matches are removed. A ``set``
        or ``frozenset`` is used as-is, so callers that clean many texts can
        build it once and avoid rebuilding the lookup on every call.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' when none survive).
    """
    # Only exact membership is needed, so a hash set replaces the trie that
    # used to be rebuilt from scratch on every call.
    if isinstance(wordsremove, (set, frozenset)):
        blocked = wordsremove
    else:
        blocked = set(wordsremove)

    # Kept tokens are separated by a space; concatenating them with an empty
    # string merged the whole text into a single word.
    return ' '.join(token for token in text.split() if token not in blocked)

## Funcao de Preprocessamento (mudar conforme a base de dados)

In [4]:
def PreProcess(data, textcol, maxchar, minchar, wordstoremove):
    """Clean the text column of ``data`` and add text-size features.

    The DataFrame is modified in place and also returned. Cleaning steps, in
    order: lower-casing and accent removal, removal of the tokens in
    ``wordstoremove``, removal of literal ``\\n`` / ``\\t`` sequences, removal
    of tokens starting with ``@``, replacement of every character outside
    ``a-z`` with a space, removal of too-short and too-long words, and
    whitespace normalization. Progress messages with a UTC timestamp are
    printed along the way.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column. Every value of that column must be a
        string (``x.lower()`` is called on each one).
    textcol : str
        Name of the text column.
    maxchar : int
        Longest word length to keep; longer words are removed.
    minchar : int
        Shortest word length to keep; shorter words are removed. Values of 1
        or less disable this filter.
    wordstoremove : iterable of str
        Tokens to drop (compared after lower-casing and accent removal).

    Returns
    -------
    pandas.DataFrame
        ``data`` with the cleaned text and the new columns ``length``,
        ``words``, ``avg_word_length``, ``min_word_length`` and
        ``max_word_length``.
    """

    def _log(message):
        # Print a progress message followed by the current UTC timestamp.
        print(message)
        print(strftime('%Y-%m-%d %H:%M:%S', gmtime()))

    # Lower-case everything and strip accents.
    data[textcol] = data[textcol].apply(lambda x: x.lower()).apply(rem_acentos)

    _log('Fim da Remocao de Acentos')

    # Optional: strip masks, greetings and filler expressions, e.g.
    #data[textcol] = data[textcol].apply(lambda x: re.sub(r'<div>', ' ', x))

    # Remove the unwanted tokens. The removal list is deduplicated once,
    # outside the per-row loop.
    blocked = frozenset(wordstoremove)
    data[textcol] = data[textcol].apply(lambda x: limpa_nomes(x, blocked))

    _log('Fim da Remocao de Nomes')

    # Remove literal backslash-n / backslash-t sequences (real newlines and
    # tabs are handled by the a-z filter below). They are replaced by a space
    # so the words on either side are not merged into one.
    for escaped in (r'\n', r'\t'):
        data[textcol] = data[textcol].apply(lambda x: x.replace(escaped, ' '))

    # Remove @user mentions. This has to run before the a-z filter: once '@'
    # has been turned into a space the mention can no longer be recognised.
    data[textcol] = data[textcol].apply(
        lambda txt: ' '.join(word for word in txt.split(' ') if not word.startswith('@')))

    # Remove punctuation, digits, etc.: only lower-case letters remain.
    data[textcol] = data[textcol].apply(lambda x: re.sub(r'[^a-z]', ' ', x))

    # Optional: keep only the text after / before a given marker
    #sep = 'Mascara Inicio'
    #data[textcol] = data[textcol].apply(lambda x: x.split(sep, 1)[-1])
    #sep = 'Mascara Fim'
    #data[textcol] = data[textcol].apply(lambda x: x.split(sep, 1)[0])

    # Remove words shorter than minchar. The quantifier must read
    # {1,minchar-1}; it is skipped when minchar <= 1 because {1,0} is not a
    # valid repeat range.
    if minchar > 1:
        short_words = r'\b[a-z]{1,' + str(minchar - 1) + r'}\b'
        data[textcol] = data[textcol].apply(lambda x: re.sub(short_words, ' ', x))

    # Remove words longer than maxchar (no upper bound on the match length).
    long_words = r'\b[a-z]{' + str(maxchar + 1) + r',}\b'
    data[textcol] = data[textcol].apply(lambda x: re.sub(long_words, ' ', x))

    # Collapse repeated spaces and trim both ends.
    data[textcol] = data[textcol].apply(lambda x: re.sub(r' +', ' ', x))
    data[textcol] = data[textcol].apply(lambda x: x.strip())

    _log('Fim da Remocao de Palavras Curtas e Longas')

    ####################################################################################
    # Feature creation
    ####################################################################################

    #########################################
    # Feature creation: text size
    #########################################

    # Length of every word of every text, computed once and reused below.
    # split() without arguments yields an empty list for an empty text, so
    # such rows count 0 words instead of 1.
    word_lengths = data[textcol].apply(lambda x: [len(t) for t in x.split()])

    # Total number of characters
    data['length'] = data[textcol].apply(len)
    # Total number of words
    data['words'] = word_lengths.apply(len)

    # Mean / minimum / maximum word length (0 for texts left with no words)
    data['avg_word_length'] = word_lengths.apply(lambda n: np.mean(n) if n else 0.0)
    data['min_word_length'] = word_lengths.apply(lambda n: np.min(n) if n else 0)
    data['max_word_length'] = word_lengths.apply(lambda n: np.max(n) if n else 0)

    _log('Fim da Contagem de Palavras')

    #########################################
    # Feature creation: keyword flags (optional)
    #########################################

    #words0 = ['ccf', 'oi']
    #words1 = '|'.join(words0)
    #data['DummyCCF'] = np.where(data[textcol].str.contains(words1, na = False, case = True), 1, 0)

    ####################################################################################
    # Stemming (optional)
    ####################################################################################

    #data[textcol] = stem(data[textcol])

    ####################################################################################
    # Dropping a categorical column (optional)
    ####################################################################################

    #data = data.drop(['categorica'], axis = 1)

    return data

## Scale das Variaveis

In [5]:
def dicMeanSd(data, variables):
    """Collect the mean and standard deviation of the given columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns.
    variables : iterable of str
        Column names to summarise.

    Returns
    -------
    dict
        For each name ``v``, the keys ``'mean_' + v`` (``data[v].mean()``)
        and ``'sd_' + v`` (``data[v].std()``), in the order of ``variables``.
    """
    stats = {}

    for name in variables:
        column = data[name]
        stats['mean_' + name] = column.mean()
        stats['sd_' + name] = column.std()

    return stats

In [6]:
def Scale(data, variables, dic):
    """Standardise the given columns of ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are overwritten with the scaled values.
    variables : iterable of str
        Column names to scale.
    dic : dict
        Must hold ``'mean_' + name`` and ``'sd_' + name`` for every name in
        ``variables`` (the layout produced by ``dicMeanSd``).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with each column replaced by
        ``(column - mean) / sd``.
    """
    for name in variables:
        centre = dic['mean_' + name]
        spread = dic['sd_' + name]
        data[name] = (data[name] - centre) / spread
    return data

## Lista de Nomes, Adverbios, Palavras a Retirar

In [7]:
def _valores_normalizados(tabela, coluna):
    # Lower-case one column, strip its accents and return the distinct values.
    return tabela[coluna].apply(lambda x: x.lower()).apply(rem_acentos).unique()

# Given names
nomes = pd.read_csv(pathfixo + 'Nomes IBGE.csv', encoding = 'latin1')
nomes2 = _valores_normalizados(nomes, 'Nomes')

# Surnames
sobrenomes = pd.read_csv(pathfixo + 'Sobrenomes.csv', encoding = 'latin1')
sobrenomes2 = _valores_normalizados(sobrenomes, 'Sobrenomes')

# Adverbs
adverbios = pd.read_csv(pathfixo + 'Adverbios.csv', encoding = 'latin1')
adverbios2 = _valores_normalizados(adverbios, 'Adverbios')

# Additional words
outras = pd.read_csv(pathfixo + 'Outras.csv', encoding = 'latin1')
outras2 = _valores_normalizados(outras, 'Palavras')

# Stop words (switch to the Portuguese list when needed)
#stop = pd.DataFrame({'stop': stopwords.words('portuguese')})
stop = pd.DataFrame({'stop': stopwords.words('english')})

# Final removal list: stop words + adverbs + additional words, deduplicated
# (np.unique also returns the values sorted).
# To strip personal names as well, add nomes2 and sobrenomes2 to the list below.
NomesEPalavras = np.unique(np.concatenate([stop['stop'].values, adverbios2, outras2]))

## Salvando

In [8]:
# Create the output folder when it is missing; otherwise open() below fails
# with FileNotFoundError on a fresh checkout.
os.makedirs(pathaux, exist_ok=True)

# Serialize the helper functions and the removal list as a single tuple.
# Readers must unpack it in this exact order.
with open(pathaux + 'Functions.pickle', 'wb') as f:
    dill.dump((rem_acentos, stem, limpa_nomes, PreProcess, dicMeanSd, Scale, NomesEPalavras), f)