En esta notebook se realiza el preprocesado con NLTK y se guardan los archivos resultantes, para reducir el tiempo de ejecución al correr luego el algoritmo de Multinomial Naive Bayes.

#Librerías + montar drive + dataset original

In [None]:
 pip install dask[dataframe] --upgrade

Collecting fsspec>=0.6.0
  Downloading fsspec-2021.10.0-py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 5.1 MB/s 
[?25hCollecting partd>=0.3.10
  Downloading partd-1.2.0-py3-none-any.whl (19 kB)
Collecting locket
  Downloading locket-0.2.1-py2.py3-none-any.whl (4.1 kB)
Installing collected packages: locket, partd, fsspec
Successfully installed fsspec-2021.10.0 locket-0.2.1 partd-1.2.0


In [None]:
import os
import time
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import dask.dataframe as dd
#Para el preprocesado
import nltk
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus   import stopwords
from nltk.tokenize import TreebankWordTokenizer

In [None]:
# Mount Google Drive at /content/drive; the data paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Download the NLTK resources required by the preprocessing functions.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Cambie el path por la carpeta donde se encuentran descargados los archivos:

In [None]:
# Load the training and validation splits from the Drive folder.
path = "/content/drive/MyDrive/Redes TPS/TP Redes 1/"
train_file = path + "train_data.hdf5"
valid_file = path + "valid_data.hdf5"
df_train = pd.read_hdf(train_file)
df_valid = pd.read_hdf(valid_file)

#Funciones

In [None]:
# Instantiate the tokenizer, the Porter stemmer and the WordNet lemmatizer once;
# the preprocessing closure defined below reads these module-level objects.
tokenizer  = TreebankWordTokenizer()
stemmer    = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
def nltk_preprocessor_callback(**kwargs):
    """Build a text-preprocessing function configured by boolean flags.

    Keyword Args:
        is_lem: lemmatize every token as a verb (default True).
        is_stop: drop English stopwords (default True).
        is_stem: apply Porter stemming (default True).
        is_alpha: keep only purely alphabetic tokens (default True).

    Returns:
        A callable that takes one datapoint (converted with ``str``),
        tokenizes it with the module-level Treebank tokenizer, applies the
        enabled steps in the order lemmatize -> stopwords -> stem -> alpha,
        and returns the surviving tokens joined by single spaces.
    """
    # Resolve the flags once instead of on every datapoint.
    use_lem = kwargs.get('is_lem', True)
    use_stop = kwargs.get('is_stop', True)
    use_stem = kwargs.get('is_stem', True)
    use_alpha = kwargs.get('is_alpha', True)

    # Load the stopword list a single time and keep it as a set: evaluating
    # stopwords.words('english') inside the filter rebuilt the whole list for
    # every token and made each membership test a linear scan.
    stop_set = frozenset(stopwords.words('english')) if use_stop else frozenset()

    def preprocessor(datapoint):
        """Return the preprocessed, space-joined tokens of ``datapoint``."""
        tokens = tokenizer.tokenize(str(datapoint))

        if use_lem:
            tokens = [lemmatizer.lemmatize(tok, pos='v') for tok in tokens]

        if use_stop:
            # Case-sensitive comparison, exactly as in the original filter.
            tokens = [tok for tok in tokens if tok not in stop_set]

        if use_stem:
            tokens = [stemmer.stem(tok) for tok in tokens]

        if use_alpha:
            tokens = [tok for tok in tokens if tok.isalpha()]

        return ' '.join(tokens)

    return preprocessor

In [None]:
def run_nltk_preprocessor(hp, dataset=None):
    """Run the NLTK preprocessing over ``dataset['text']`` in parallel.

    Args:
        hp: mapping (dict or DataFrame row) providing the boolean entries
            'is_lem', 'is_stop', 'is_stem' and 'is_alpha'.
        dataset: pandas DataFrame containing a 'text' column.

    Returns:
        A DataFrame with a single 'text' column holding the preprocessed
        strings.

    Raises:
        TypeError: if ``dataset`` is None.
    """
    if dataset is None:
        # Fail early with a clear message instead of deep inside dask.
        raise TypeError("dataset must be a pandas DataFrame with a 'text' column")

    print('NLTK Preprocessing...')
    start = time.perf_counter()

    preprocessor = nltk_preprocessor_callback(
        is_lem=hp['is_lem'],
        is_stop=hp['is_stop'],
        is_stem=hp['is_stem'],
        is_alpha=hp['is_alpha'],
    )

    # os.cpu_count() may return None when the count cannot be determined.
    n_partitions = os.cpu_count() or 1
    ddataset = dd.from_pandas(dataset, npartitions=n_partitions)

    # Each partition is a pandas Series; apply the preprocessor element-wise
    # and gather the results using the multiprocessing scheduler.
    processed = ddataset['text'].map_partitions(
        lambda part: part.apply(preprocessor)
    ).compute(scheduler='multiprocessing')

    data = pd.DataFrame()
    data['text'] = processed

    elapsed = time.perf_counter() - start
    print('finished in', int(elapsed), 'seconds.')
    return data

#Procesamiento

En esta notebook se realizó el preprocesado con NLTK para cada una de las siguientes opciones:

In [None]:
# Boolean switches explored for the NLTK transformations; each one is tried
# both enabled and disabled.
_FLAG_NAMES = (
    'is_lem',     # whether to lemmatize
    'is_stop',    # whether to remove stopwords
    'is_stem',    # whether to apply stemming
    'is_alpha',   # whether to drop non-alphabetic tokens
    'is_lowerc',  # whether to lowercase everything
)
hiperparameters = {flag: [True, False] for flag in _FLAG_NAMES}

In [None]:
import itertools

# Enumerate every on/off combination of the five flags (2**5 = 32 rows).
# itertools.product varies the last iterable fastest, which is the same row
# order the equivalent nested loops would produce.
reg = [
    list(combo)
    for combo in itertools.product(
        hiperparameters['is_lem'],
        hiperparameters['is_stop'],
        hiperparameters['is_stem'],
        hiperparameters['is_alpha'],
        hiperparameters['is_lowerc'],
    )
]

# One row per combination. Note the last column is named 'is_lowercase'
# while the dictionary key above is 'is_lowerc'.
hp = pd.DataFrame(reg, columns=['is_lem', 'is_stop', 'is_stem', 'is_alpha', 'is_lowercase'])
hp.head(5)  # 32 combinations in total

Unnamed: 0,is_lem,is_stop,is_stem,is_alpha,is_lowercase
0,True,True,True,True,True
1,True,True,True,True,False
2,True,True,True,False,True
3,True,True,True,False,False
4,True,True,False,True,True


In [None]:
# Rebind `path` to the folder used for the NLTK outputs written below.
path = '/content/drive/MyDrive/Redes TPS/TP Redes 1/NLTK data/'

In [None]:
# (column in hp, suffix appended to the file name when that flag is on),
# listed in the order the suffixes appear in the file name.
flag_suffixes = [
    ('is_lem', '_lem'),
    ('is_stop', '_stop'),
    ('is_stem', '_stem'),
    ('is_alpha', '_alpha'),
    ('is_lowercase', '_lowerc'),
]

# Build one train and one validation file name per hyperparameter combination.
names_train = []
names_valid = []
for idx, hyperParam in hp.iterrows():
    # The suffix encodes which flags are enabled for this combination.
    suffix = ''.join(tag for column, tag in flag_suffixes if hyperParam[column])
    name = 'X_train_NLTK' + suffix + '.csv'
    name2 = 'X_valid_NLTK' + suffix + '.csv'
    names_train.append(name)
    names_valid.append(name2)
    # Uncomment to (re)generate the preprocessed files; the recorded run shows
    # this taking minutes per training split.
    # NOTE(review): to_json writes JSON although the names end in '.csv' --
    # confirm that the reader of these files expects JSON.
    #data_train = run_nltk_preprocessor(hyperParam,df_train)
    #data_valid = run_nltk_preprocessor(hyperParam,df_valid)
    #data_train.to_json(path+name)
    #data_valid.to_json(path+name2)

NLTK Preprocessing...
finished in 649 seconds.
NLTK Preprocessing...
finished in 11 seconds.
NLTK Preprocessing...
finished in 647 seconds.
NLTK Preprocessing...
finished in 11 seconds.
NLTK Preprocessing...
finished in 647 seconds.
NLTK Preprocessing...
finished in 11 seconds.
NLTK Preprocessing...
finished in 648 seconds.
NLTK Preprocessing...
finished in 11 seconds.
NLTK Preprocessing...
finished in 584 seconds.
NLTK Preprocessing...
finished in 10 seconds.
NLTK Preprocessing...
finished in 585 seconds.
NLTK Preprocessing...
finished in 10 seconds.
NLTK Preprocessing...
finished in 583 seconds.
NLTK Preprocessing...
finished in 10 seconds.
NLTK Preprocessing...
finished in 585 seconds.
NLTK Preprocessing...
finished in 10 seconds.
NLTK Preprocessing...
finished in 142 seconds.
NLTK Preprocessing...
finished in 2 seconds.
NLTK Preprocessing...
finished in 142 seconds.
NLTK Preprocessing...
finished in 2 seconds.
NLTK Preprocessing...
finished in 141 seconds.
NLTK Preprocessing...
fin

In [None]:
# Attach the generated file names to their hyperparameter combination
# (both lists were filled in hp's row order) and display the table.
hp['X_train']=names_train
hp['X_valid']=names_valid
hp

Unnamed: 0,is_lem,is_stop,is_stem,is_alpha,is_lowercase,X_train,X_valid
0,True,True,True,True,True,X_train_NLTK_lem_stop_stem_alpha_lowerc.csv,X_valid_NLTK_lem_stop_stem_alpha_lowerc.csv
1,True,True,True,True,False,X_train_NLTK_lem_stop_stem_alpha.csv,X_valid_NLTK_lem_stop_stem_alpha.csv
2,True,True,True,False,True,X_train_NLTK_lem_stop_stem_lowerc.csv,X_valid_NLTK_lem_stop_stem_lowerc.csv
3,True,True,True,False,False,X_train_NLTK_lem_stop_stem.csv,X_valid_NLTK_lem_stop_stem.csv
4,True,True,False,True,True,X_train_NLTK_lem_stop_alpha_lowerc.csv,X_valid_NLTK_lem_stop_alpha_lowerc.csv
5,True,True,False,True,False,X_train_NLTK_lem_stop_alpha.csv,X_valid_NLTK_lem_stop_alpha.csv
6,True,True,False,False,True,X_train_NLTK_lem_stop_lowerc.csv,X_valid_NLTK_lem_stop_lowerc.csv
7,True,True,False,False,False,X_train_NLTK_lem_stop.csv,X_valid_NLTK_lem_stop.csv
8,True,False,True,True,True,X_train_NLTK_lem_stem_alpha_lowerc.csv,X_valid_NLTK_lem_stem_alpha_lowerc.csv
9,True,False,True,True,False,X_train_NLTK_lem_stem_alpha.csv,X_valid_NLTK_lem_stem_alpha.csv


In [None]:
# Save the combination -> file-name lookup table next to the NLTK outputs.
hp.to_csv(path+'get_namefile.csv')