In [1]:
from maupassant.preprocessing.normalization import TextNormalization
from maupassant.preprocessing.tokenization import SentenceTokenization, SequenceTokenization

# Tokenization

## Transform sentence to tokens

In [2]:
# Sample input for word-level tokenization: one sentence ending in a period.
sentence = "Let me tell you something you already know."

In [3]:
# Split the sentence into a list of tokens; per the output shown below,
# the final period becomes a token of its own.
sentence_tokenizer = SentenceTokenization()
tokens = sentence_tokenizer.tokenize(sentence)

In [4]:
tokens

['Let', 'me', 'tell', 'you', 'something', 'you', 'already', 'know', '.']

## Transform tokens to sentence

In [5]:
# Rebuild a sentence string from the token list created above.
# NOTE: this rebinds `sentence`, replacing the original input string.
sentence_tokenizer = SentenceTokenization()
sentence = sentence_tokenizer.detokenize(tokens)

In [6]:
sentence

'Let me tell you something you already know.'

## Transform sequence to sentences

In [7]:
# Sample input for sentence splitting: two sentences in one string.
# Note that "ain’t" uses a typographic apostrophe (’), not an ASCII quote (').
sequence = "Let me tell you something you already know. The world ain’t all sunshine and rainbows."

In [8]:
# Split the multi-sentence string into a list with one string per sentence.
sequence_tokenizer = SequenceTokenization()
sentences = sequence_tokenizer.tokenize(sequence)

In [9]:
sentences

['Let me tell you something you already know.',
 'The world ain’t all sunshine and rainbows.']

In [10]:
# For contrast: apply the word-level tokenizer to the whole two-sentence string.
# The displayed result is a single flat token list (no grouping by sentence),
# and the typographic apostrophe in "ain’t" comes out as a separate token.
word_tokenizer = SentenceTokenization()
word_tokenizer.tokenize(sequence)

['Let',
 'me',
 'tell',
 'you',
 'something',
 'you',
 'already',
 'know',
 '.',
 'The',
 'world',
 'ain',
 '’',
 't',
 'all',
 'sunshine',
 'and',
 'rainbows',
 '.']

## Transform sentences to sequence

In [13]:
# Join the list of sentences back into a single string.
# NOTE: this rebinds `sequence`, replacing the original input string.
sequence_tokenizer = SequenceTokenization()
sequence = sequence_tokenizer.detokenize(sentences)

In [14]:
sequence

'Let me tell you something you already know. The world ain’t all sunshine and rainbows.'

# Normalization

## Text correction

In [15]:
# Input for the spell-correction demo: contains the misspellings
# "somthing" and "alrady".
text = 'Let me tell you somthing you alrady know.'

In [16]:
# Run spell correction on the text; the result shown below has
# "somthing" -> "something" and "alrady" -> "already".
normalizer = TextNormalization()
cleaned_text = normalizer.text_correction(text)

In [17]:
cleaned_text

'Let me tell you something you already know.'

## Remove emojis and emoticons

In [18]:
# Input for the emoji-removal demo: a sentence ending with a Unicode emoji.
text = 'Let me tell you something you already know 👍'

In [19]:
# Strip the emoji from the text. With how_replace="" the displayed result
# has the emoji removed and keeps the space that preceded it
# (presumably how_replace is the substitution string — TODO confirm).
normalizer = TextNormalization()
demojize_text = normalizer.text_demojis(text, how_replace="")

In [20]:
demojize_text

'Let me tell you something you already know '

In [21]:
# Input for the emoticon-removal demo: a sentence ending with the ASCII
# emoticon ":)".
text = 'Let me tell you something you already know :)'

In [22]:
# Strip the ASCII emoticon from the text. With how_replace="" the displayed
# result has ":)" removed and keeps the space that preceded it
# (presumably how_replace is the substitution string — TODO confirm).
normalizer = TextNormalization()
demoticons_text = normalizer.text_demoticons(text, how_replace="")

In [23]:
demoticons_text

'Let me tell you something you already know '

## Expand contractions

In [24]:
# Input for the contraction-expansion demo: contains the contraction "I'd"
# and the informal "yall".
text = "I'd like to know yall guys"

In [25]:
# Expand contractions; the result shown below has "I'd" -> "I would"
# and "yall" -> "you all".
normalizer = TextNormalization()
decontraction_text = normalizer.text_decontraction(text)

In [26]:
decontraction_text

'I would like to know you all guys'

## Stem words

In [27]:
# Input for the stemming demo: a single inflected word.
word = "shipping"

In [28]:
# Reduce the word to its stem; the result shown below is "ship".
normalizer = TextNormalization()
stemmed_word = normalizer.word_stemming(word)

In [29]:
stemmed_word

'ship'

# Clean a sentence

In [40]:
# Input for the combined cleaning demo: contains both a contraction ("I'd")
# and two misspellings ("somthing", "alrady").
text = "I'd like to tell you somthing you alrady know."

In [41]:
# Two-step cleaning pipeline: expand contractions first, then run spell
# correction on the expanded text.
# One normalizer instance serves both steps (assumes the methods keep no
# per-call state on the instance — TODO confirm against the library).
# NOTE: `decontraction_text` and `cleaned_text` are also assigned in the
# earlier "Normalization" sections; this cell overwrites those values.
normalizer = TextNormalization()
decontraction_text = normalizer.text_decontraction(text)
cleaned_text = normalizer.text_correction(decontraction_text)

In [42]:
cleaned_text

'I would like to tell you something you already know.'