In [1]:
import numpy as np
import pandas as pd
import re
import string

from pathlib import Path
from xml.etree import ElementTree as ET
from xml.etree.ElementTree import ParseError
from zipfile import ZipFile

In [75]:
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline

In [68]:

class StringReplacePunctuation(BaseEstimator, TransformerMixin):
    """Replace ASCII punctuation in every string and drop ordinal/degree signs.

    Each character of ``string.punctuation`` is replaced by ``replace_str``;
    the characters ``ª``, ``º`` and ``°`` are removed outright.

    Parameters
    ----------
    replace_str : str, default ' '
        Text substituted for every punctuation character. It may have any
        length, including the empty string (which deletes the punctuation).
    """

    def __init__(self, replace_str = ' '):
        self.replace_str = replace_str
        self.punctuation = string.punctuation
        self.to_remove = 'ªº°'

    @property
    def translation(self):
        """Translation table for ``str.translate``, built from the current parameters.

        Computed on access rather than cached in ``__init__`` so that a later
        ``set_params(replace_str=...)`` is honoured. A dict is used instead of
        ``str.maketrans(x, y, z)`` because that form requires ``x`` and ``y``
        to have equal length, which only holds for a one-character
        ``replace_str``.
        """
        table = {ord(char): self.replace_str for char in self.punctuation}
        # Deletions are applied last so they take precedence, as in str.maketrans.
        table.update({ord(char): None for char in self.to_remove})
        return table

    def _replace_func(self, text : str):
        """Return ``text`` with the translation table applied."""
        return text.translate(self.translation)

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer is stateless.

        ``y`` is accepted and ignored so the estimator can be used inside a
        pipeline that is fitted with a target.
        """
        return self

    def transform(self, X):
        """Return a list with one cleaned string per element of ``X``."""
        table = self.translation  # build the table once for the whole batch
        return [text.translate(table) for text in X]

class RegexSubstitution(BaseEstimator, TransformerMixin):
    """Apply a single ``re.sub`` replacement to every string of the input.

    Parameters
    ----------
    old_pattern : str or compiled pattern
        Regular expression to search for.
    new_pattern : str or callable
        Replacement passed to ``re.sub`` (may contain group references).
    """

    def __init__(self, old_pattern, new_pattern):
        self.old_pattern = old_pattern
        self.new_pattern = new_pattern

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer is stateless.

        ``y`` is accepted and ignored so the estimator can be used inside a
        pipeline that is fitted with a target.
        """
        return self

    def transform(self, X):
        """Return a list with the substitution applied to each element of ``X``."""
        # Compile once per call instead of resolving the pattern for every element.
        pattern = re.compile(self.old_pattern)
        replacement = self.new_pattern
        return [pattern.sub(replacement, text) for text in X]
        

In [22]:
# Display the punctuation characters defined by the standard library's string module.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
                    
                    # Fragment of a method chain: the object these .apply calls hang off is not
                    # part of this cell (presumably a pandas Series of strings - TODO confirm).
                    # Each step rewrites one abbreviation or pattern with re.sub.
                    # NOTE(review): the patterns are non-raw strings containing \s and \d.
                    # Packaging / unit abbreviations -> padded full words.
                    .apply(lambda t : re.sub('\s+pct\s{0,}', ' pacote ', t) )
                    .apply(lambda t : re.sub('\s+cx\s{0,}', ' caixa ', t) )
                    .apply(lambda t : re.sub('\s+uni\s{1,}|\s+und\s{0,}|\s+unds\s{0,}', ' unidade ', t) )
                    .apply(lambda t : re.sub('\s+c/', ' com', t) )
                    .apply(lambda t : re.sub('\s+p/', ' para', t) )
                    # Quantity + unit -> "<number>_<unit>".
                    # NOTE(review): in the alternations below the replacement only references
                    # group 1; when the second alternative matches, the digits are in group 2,
                    # so \1 does not hold them.
                    .apply(lambda t : re.sub('(\d{1,})\s{0,}litros{0,}|(\d{1,})\s{0,}lts{0,}', r'\1_litro', t))
                    # NOTE(review): matches digits followed by any "l", not only a standalone unit.
                    .apply(lambda t : re.sub('(\d{1,})\s{0,}l', r'\1_litro', t))
                    .apply(lambda t : re.sub('(\d{1,})\s{0,}gramas{0,}|(\d{1,})\s{0,}gr\s{0,}', r'\1_grama', t))
                    .apply(lambda t : re.sub('(\d+)\s{0,}grs\s{0,}', r'\1_grama', t))
                    .apply(lambda t : re.sub('(\d{1,})\s{0,}kg|(\d{1,})\s{0,}kgs', r'\1_quilo', t))
                    # Requires a literal space after "ml", which the replacement consumes.
                    .apply(lambda t : re.sub('(\d{1,})\s{0,}ml ', r'\1_ml', t) )
                    # Remove leading digits; replace a trailing whitespace+digits run with " codigo".
                    .apply(lambda t : re.sub('^\d+', '', t))
                    .apply(lambda t : re.sub('\s+\d+$', ' codigo', t) )

In [79]:
# Negative lookahead that fails when the next character is a letter, so an
# abbreviation is only rewritten when it is a whole token:
# "30 gr" -> "30_grama", while "30 grande" is left untouched.
NOT_LETTER = r'(?![^\W\d_])'

# Text-normalisation pipeline. Step order matters:
#   1. lower-case first, because every abbreviation rule is written in lower case;
#   2. expand "c/" and "p/" BEFORE punctuation is replaced, otherwise the "/"
#      is already gone and those rules can never match;
#   3. replace punctuation and collapse whitespace;
#   4. drop leading / trailing numeric codes;
#   5. expand packaging and unit abbreviations;
#   6. collapse the double spaces left by the padded replacements, then strip.
# All regex literals are raw strings so that \s, \d and \g reach the regex
# engine instead of being treated as (invalid) string escapes.
pipe = make_pipeline(
    FunctionTransformer(lambda X : [str.lower(t) for t in X ]),
    RegexSubstitution(r'\s+c/', ' com '),
    RegexSubstitution(r'\s+p/', ' para '),
    StringReplacePunctuation(),
    RegexSubstitution(r'\s+', ' '),
    RegexSubstitution(r'^\d+', ''),
    RegexSubstitution(r'\s+\d+$', ''),
    RegexSubstitution(r'\s+pcts?' + NOT_LETTER, ' pacote '),
    RegexSubstitution(r'\s+cxs?' + NOT_LETTER, ' caixa '),
    RegexSubstitution(r'\s+unds?' + NOT_LETTER + r'\s*', ' unidade '),
    RegexSubstitution(r'(\d+)\s*(lts?)' + NOT_LETTER, r'\g<1>_litro'),
    RegexSubstitution(r'(\d+)\s*(grs?)' + NOT_LETTER, r'\g<1>_grama'),
    RegexSubstitution(r'(\d+)\s*(kgs?)' + NOT_LETTER, r'\g<1>_quilo'),
    RegexSubstitution(r'(\d+)\s*(ml)' + NOT_LETTER, r'\g<1>_ml'),
    RegexSubstitution(r'\s+', ' '),
    FunctionTransformer(lambda X : [str.strip(t) for t in X ])
)

# Every step is stateless, so the pipeline can transform without being fitted.
pipe.transform(['Carne bovina 30 gr', 'leite de vaca 1lt', 'ovos 30 und'])

['carne bovina 30_grama', 'leite de vaca 1_litro', 'ovos 30 unidade']