## __Text mining y Procesamiento de Lenguaje Natural (NLP)__

__Profesor__: Anthony D. Cho

__Tema__: Regla de asociación

__Método__: APRIORI

***

__Dependencias__

```bash
    python -m pip install nltk spacy
    python -m spacy download en_core_web_sm
    python -m spacy download es_core_news_sm
    
    python -m pip install mlxtend
```

## Librerías

In [None]:
from glob import glob
import re
from pandas import DataFrame

from string import punctuation
from spacy.lang.es.stop_words import STOP_WORDS
from spacy import load

from sklearn.feature_extraction.text import TfidfVectorizer
from mlxtend.frequent_patterns import apriori, association_rules

## Language model instance: load the spaCy pipeline 'es_core_news_sm'
## (the Spanish model downloaded in the dependencies step above)
nlp = load('es_core_news_sm')

## Carga de documentos

In [None]:
## Locate every file of interest: doc*.txt, one directory level below the
## current working directory
path_docs = glob('*/doc*.txt')

## Parallel lists: document contents and their ids (name of the file)
corpus, doc_id = [], []

## Start of the document loading process
if path_docs:
    for file in path_docs:

        ## Read and store the text; the context manager closes the file
        ## handle even if reading/decoding fails
        with open(file, 'r', encoding='utf-8') as handle:
            corpus.append(handle.read())

        ## The id is the file name up to its first dot. Split on both '/'
        ## and '\' so the directory part is dropped on POSIX and on Windows
        ## (splitting on '\' alone left "dir/doc1" as the id on POSIX).
        file_name = re.split(r'[\\/]', file)[-1]

        ## Store the id
        doc_id.append(file_name.split('.')[0])
else:
    print('No corpus have found.')

#### Preprocesamiento

Extracción de entidades

In [None]:
## Cleaned text of every document, in the same order as `corpus`
cleanTexts = []

## Pattern extraction: capture the word of each "word/TAG" pair whose
## part-of-speech tag is PROPN or NOUN.
## Raw strings are used for every regex so that '\w', '\d' and '\s' are not
## parsed as (invalid) string escape sequences.
pattern = r'(\w+)/(PROPN|NOUN)'

## Loop invariants, built once instead of once per document
pos_regex = re.compile(pattern)
noise_regex = re.compile(r'[\"\¿\°\d+]')   # '"', '¿', '°', digits and '+'
spaces_regex = re.compile(r'\s+')
drop_punctuation = str.maketrans('', '', punctuation)

for raw_text in corpus:

    ## Extract the entities: tag each token as "text/POS", then keep only
    ## the words matched by the noun pattern
    tagged = ' '.join(f'{token.text}/{token.pos_}' for token in nlp(raw_text))
    nouns = [word for word, _tag in pos_regex.findall(tagged)]
    text = ' '.join(nouns)

    ## Remove numbers and punctuation
    text = noise_regex.sub('', text)
    text = text.translate(drop_punctuation)

    ## Normalisation (lower case) and stop-word removal
    tokens = [token.text for token in nlp(text.lower())]
    text = ' '.join(word for word in tokens if word not in STOP_WORDS)
    text = spaces_regex.sub(' ', text)

    ## Apply lemmatisation
    text = ' '.join(token.lemma_ for token in nlp(text))
    text = spaces_regex.sub(' ', text)

    ## Store the processed content
    cleanTexts.append(text)

## Show the processed content
cleanTexts
    

In [None]:
## Model instance. Settings: IDF weighting off, no normalisation,
## unigrams only, binary term counts.
vectorizer_settings = {
    'use_idf': False,
    'norm': None,
    'ngram_range': (1, 1),
    'binary': True,
}
model = TfidfVectorizer(**vectorizer_settings)

## Fit the model on the cleaned texts and keep the resulting TF matrix
tf_sparse = model.fit_transform(cleanTexts)

## Vocabulary created by the model (dict :: key (word), value (index))
vocabulary = model.vocabulary_

In [None]:
## Column labels: the vocabulary words ordered by their index in the matrix
features = sorted(vocabulary, key=vocabulary.get)

## Dense table of the TF matrix with the words as column names
tf_table = DataFrame(data=tf_sparse.toarray(), columns=features)
tf_table

In [None]:
## Find relevant itemsets among the first 50 term columns, cast to booleans
basket = tf_table.iloc[:, :50].astype(bool)
itemsets = apriori(basket, min_support=0.05, low_memory=True, use_colnames=True)
itemsets

In [None]:
## Derive association rules from the frequent itemsets, using lift as the
## metric with a minimum threshold of 5
rules = association_rules(itemsets, metric = 'lift', min_threshold=5)
rules

In [None]:
## Order the rules by confidence, then by lift, both in descending order
sort_keys = ['confidence', 'lift']
sorted_rules = rules.sort_values(by=sort_keys, ascending=False)

In [None]:
## Show the sorted rules, restricted to the columns of interest
shown_columns = ['antecedents', 'consequents', 'confidence', 'lift']
sorted_rules[shown_columns]