In [1]:
# Mount Google Drive inside the Colab VM; later cells read the dataset archive
# from it and write the cleaned CSVs back to it.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('ggplot')
from tqdm.notebook import trange, tqdm
import os

In [2]:
import torch
# Select the compute device once and keep a reference to it, so tensors and
# models can be moved with `.to(device)`.
# The previous version called `torch.device('cuda')` and discarded the result,
# which had no effect at all.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
from torch import nn

### Gather Data

In [4]:
# %%capture output
# Extract the dataset archive stored on the mounted Drive into the current
# working directory (-o overwrites existing files without prompting, -d sets
# the destination directory).
!unzip -o 'drive/My Drive/Colab Notebooks/nlp-sadosky/data.zip' -d ./

Archive:  drive/My Drive/Colab Notebooks/nlp-sadosky/data.zip
   creating: ./data/
   creating: ./data/raw/
  inflating: ./data/raw/test_santander.csv  
  inflating: ./data/raw/train.csv    
  inflating: ./data/raw/first_submit_santander.csv  
  inflating: ./data/raw/.gitkeep     
   creating: ./data/external/
  inflating: ./data/external/answers.csv  
  inflating: ./data/external/CREA_total.zip  
  inflating: ./data/external/.gitkeep  
  inflating: ./data/external/stopwords.txt  
  inflating: ./data/external/CREA_total.TXT  
  inflating: ./data/external/dict.json  
   creating: ./data/processed/
  inflating: ./data/processed/.gitkeep  
   creating: ./data/interim/
  inflating: ./data/interim/.gitkeep  
  inflating: ./data/interim/dictionary.gz  
   creating: ./data/results/
   creating: ./data/.ipynb_checkpoints/


In [25]:
# Root of the extracted dataset and the two sub-folders this notebook reads.
DATA_DIR = './data'
RAW_DIR = os.path.join(DATA_DIR, 'raw/')
EXTERNAL_DIR = os.path.join(DATA_DIR, 'external/')

# Input files: labelled training set and unlabelled test set.
TRAIN_DATA, TEST_DATA = (
    os.path.join(RAW_DIR, file_name)
    for file_name in ('train.csv', 'test_santander.csv')
)

# NB: despite the `_DIR` suffix this is the path of a file, not a directory.
STOPWORDS_DIR = os.path.join(EXTERNAL_DIR, 'stopwords.txt')

## Start Learning


### Explore dataset
If you explore the `./data/raw/` folder you will find the two files used in this notebook:
- train.csv
- test_santander.csv


In [26]:
# Pregunta: the question text.
# Intencion: the intent category label (values look like "Cat_303").
# NOTE: `columns` is only referenced by a commented-out line in the loading cell.

columns = ["Pregunta", "Intencion"]

In [27]:
# Sanity check: confirm the training file exists at the configured path.
!ls $TRAIN_DATA

./data/raw/train.csv


In [28]:
# Load the training set: the file is pipe-separated and UTF-8 encoded.
data = pd.read_csv(
    TRAIN_DATA,
    sep='|',
    encoding='utf-8',
)

# (Renaming the columns via `columns` is left disabled.)
# data.columns = columns

# Show five randomly drawn rows as a quick look at the data.
data.sample(n=5)

Unnamed: 0,Pregunta,Intencion
13604,fondo insuficiente en la tarjeta,Cat_303
13600,me quiero traspasar de sucursal,Cat_65
1658,caja ahorro costo mensual,Cat_96
5649,beneficio pintureria,Cat_311
1410,como puedo solicitar un creditos,Cat_223


In [29]:
# Load the test set with read_csv's default settings (no `sep`/`encoding`
# overrides, unlike the training file) and display it.
test = pd.read_csv(TEST_DATA)
test

Unnamed: 0,id,Pregunta
0,0,querer saber tarjeta sin limite
1,1,¿cuál es el límite de mi tarjeta santander?
2,2,hay beneficios en restaurantes de la costa atl...
3,3,semana realizar pagar afip monotributo volver ...
4,4,por un prestamo de mil. cuanto es el interes?
...,...,...
6697,6697,denunciar un cobro de tarjeta de credito
6698,6698,quiero pagar deuda refinanciada
6699,6699,quiero pagar de mi open credit un poquito mas ...
6700,6700,nesecito imprimir mi resumen tarjeta de credit...


In [30]:
# import unidecode
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, WordPunctTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer , TreebankWordTokenizer
import re
from toolz import pipe
from functools import partial


# def get_stopwords(path = '../../data/external/stopwords.txt'):
#     """
#      Read stopwords in spanish
#     """
#     stop_arr = stopwords.words('spanish')

#     with open(path) as f:
#         for line in f:
#             stop_arr.append(unidecode.unidecode(line.strip()))

#     stop_arr = sorted(list(set(stop_arr)))

#     return stop_arr

class Cleaner():
    """Text normaliser: lower-cases, strips acute accents and tokenises.

    The compiled patterns are class-level constants, so they are built once
    and remain readable from any instance (e.g. ``cleaner.A_TILDE_REMOVE``).
    Tokenising and detokenising are delegated to NLTK's Treebank tokenizer
    and detokenizer, created per instance in ``__init__``.
    """

    # Leading "Cat_" prefix. Not used by the methods of this class.
    BAD_CAT_REMOVE = re.compile('^Cat_')

    # One pattern per lower-case acute-accented vowel; text_cleaning replaces
    # each with the plain vowel.
    A_TILDE_REMOVE = re.compile('[á]')
    E_TILDE_REMOVE = re.compile('[é]')
    I_TILDE_REMOVE = re.compile('[í]')
    O_TILDE_REMOVE = re.compile('[ó]')
    U_TILDE_REMOVE = re.compile('[ú]')

    # A '.' that directly follows a non-space character and is directly
    # followed by a word character (e.g. "mil.cuanto"; note that it also
    # matches inside "1.5"). Written as a raw string: in a normal string
    # literal \S, \. and \w are invalid escape sequences and raise warnings.
    POINT_FOLLOWING_LETTER = re.compile(r'(?<=\S)\.(?=\w)')

    # Any character other than ASCII letters, digits, '_', space and the
    # lower-case acute vowels. Not applied by the methods of this class.
    BAD_SYMBOLS_REMOVE = re.compile('[^A-Za-z0-9_ áéíóú]')

    def __init__(self):
        """Create the Treebank tokenizer and detokenizer used by this instance."""
        # nltk.download('punkt')
        self.tk = TreebankWordTokenizer()
        self.dtk = TreebankWordDetokenizer()

    def applyRegex(self, value, regex, replacement):
        """Return ``value`` with every match of ``regex`` replaced.

        Args:
            value: text to process.
            regex: compiled pattern (anything exposing ``sub``).
            replacement: replacement passed to ``regex.sub``.
        """
        return regex.sub(replacement, value)

    def text_cleaning(self, text):
        """Lower-case ``text`` and replace á/é/í/ó/ú with a/e/i/o/u.

        Upper-case accented vowels are covered too, because lower-casing
        happens first. Symbol stripping (``BAD_SYMBOLS_REMOVE``) is not
        applied here.
        """
        cleaned = text.lower()
        for pattern, plain_vowel in (
            (self.A_TILDE_REMOVE, 'a'),
            (self.E_TILDE_REMOVE, 'e'),
            (self.I_TILDE_REMOVE, 'i'),
            (self.O_TILDE_REMOVE, 'o'),
            (self.U_TILDE_REMOVE, 'u'),
        ):
            cleaned = pattern.sub(plain_vowel, cleaned)
        return cleaned

    def sentence_cleaning(self, sentence, detokenize=False):
        """Tokenise ``sentence`` and normalise every token.

        A space is inserted after any '.' matched by
        ``POINT_FOLLOWING_LETTER`` before the sentence is tokenised; each
        token is then passed through ``text_cleaning``.

        Args:
            sentence: raw text.
            detokenize: when true, join the cleaned tokens back into a single
                string with the detokenizer.

        Returns:
            The list of cleaned tokens, or the detokenised string when
            ``detokenize`` is true.
        """
        spaced = self.POINT_FOLLOWING_LETTER.sub('. ', sentence)
        word_tokens = [self.text_cleaning(token) for token in self.tk.tokenize(spaced)]

        if detokenize:
            return self.dtk.detokenize(word_tokens)
        return word_tokens
        

In [31]:
!pip install pyspellchecker -U

Requirement already up-to-date: pyspellchecker in /usr/local/lib/python3.6/dist-packages (0.5.4)


In [32]:
from tqdm.notebook import tqdm

In [33]:
from spellchecker import SpellChecker

# Build the spell checker and add the word frequencies stored in the project's
# dictionary file on top of it.
# NOTE(review): SpellChecker() is created with its default language, so the
# library's built-in default dictionary is presumably loaded as well — confirm
# that this is intended for Spanish text.
spell = SpellChecker()
spell.word_frequency.load_dictionary('./data/interim/dictionary.gz')

def replace_unknown(r):
    """Return ``r`` with the words unknown to the spell checker corrected.

    ``r`` is tokenised and normalised by the module-level ``cleaner``; the
    tokens the module-level ``spell`` does not know are then replaced, in the
    original text, by ``spell.correction(token)``.

    Replacement is whole-token only: a token is rewritten only where it is not
    glued to another word character. A plain ``str.replace`` would also
    rewrite it inside longer, correctly spelled words (e.g. "tarj" inside
    "tarjeta"). Tokens without a correction (``None`` or empty), or whose
    correction equals the token itself, are left alone.

    Matching is done against the raw text, so a token that only matches after
    lower-casing or accent stripping is left untouched.

    Args:
        r: raw text.

    Returns:
        The text with the known corrections applied.
    """
    misspelled = spell.unknown(cleaner.sentence_cleaning(r))
    text = r
    # Fixed order (longest token first, then alphabetical) so the result does
    # not depend on set iteration order.
    for wrong in sorted(misspelled, key=lambda token: (-len(token), token)):
        fixed = spell.correction(wrong)
        if not fixed or fixed == wrong:
            continue
        # Lookarounds instead of \b so tokens that start or end with a
        # non-word character are handled as well.
        pattern = r'(?<!\w)' + re.escape(wrong) + r'(?!\w)'
        # A callable replacement keeps backslashes in `fixed` literal.
        text = re.sub(pattern, lambda _match: fixed, text)
    return text

def tqdemizado(s, pbar):
    """Advance ``pbar`` by one step, then return ``replace_unknown(s)``.

    Args:
        s: text forwarded unchanged to ``replace_unknown``.
        pbar: progress bar object exposing ``update``.

    Returns:
        Whatever ``replace_unknown`` returns for ``s``.
    """
    # The bar is ticked before the (possibly slow) correction runs.
    pbar.update(1)
    corrected = replace_unknown(s)
    return corrected


In [34]:
# Spell-correct every training question, showing a progress bar.
# `cleaner` is the module-level instance that replace_unknown looks up.
cleaner = Cleaner()

# The `with` block closes the bar even if a row raises, instead of leaving it
# open. NOTE: the "Pregunta" column is overwritten in place, so re-running
# this cell corrects already-corrected text.
with tqdm(total=len(data["Pregunta"])) as pbar:
    data["Pregunta"] = data["Pregunta"].apply(tqdemizado, args=(pbar,))

HBox(children=(FloatProgress(value=0.0, max=20104.0), HTML(value='')))




In [35]:
# Spell-correct every test question, showing a progress bar.
# `cleaner` is the module-level instance that replace_unknown looks up.
cleaner = Cleaner()

# The `with` block closes the bar even if a row raises, instead of leaving it
# open. NOTE: the "Pregunta" column is overwritten in place, so re-running
# this cell corrects already-corrected text.
with tqdm(total=len(test["Pregunta"])) as pbar:
    test["Pregunta"] = test["Pregunta"].apply(tqdemizado, args=(pbar,))

HBox(children=(FloatProgress(value=0.0, max=6702.0), HTML(value='')))




In [37]:
# Persist the spell-corrected training set to Drive (without the index column).
data.to_csv('drive/My Drive/Colab Notebooks/nlp-sadosky/train_cleaned.csv', index=False)

In [38]:
# Persist the spell-corrected test set to Drive (without the index column).
test.to_csv('drive/My Drive/Colab Notebooks/nlp-sadosky/test_cleaned.csv', index=False)