- nltk (Natural Language Toolkit): https://www.nltk.org/
- dummy text generator: https://www.blindtextgenerator.com/lorem-ipsum

### nltk

In [1]:
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from collections import Counter

[nltk_data] Downloading package stopwords to /home/jkpark/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Sample corpus: placeholder ("blind text") paragraphs. Paragraphs are
# separated by a blank line, which is what the "\n\n" split below relies on.
documents = """Far far away, behind the word mountains, far from the countries Vokalia and Consonantia, there live the blind texts.

Separated they live in Bookmarksgrove right at the coast of the Semantics, a large language ocean.

A small river named Duden flows by their place and supplies it with the necessary regelialia.

It is a paradisematic country, in which roasted parts of sentences fly into your mouth.

Even the all-powerful Pointing has no control about the blind texts it is an almost unorthographic life One day however a small line of blind text by the name of Lorem Ipsum decided to leave for the far World of Grammar.

The Big Oxmox advised her not to do so, because there were thousands of bad Commas, wild Question Marks and devious Semikoli, but the Little Blind Text didn’t listen.

She packed her seven versalia, put her initial into the belt and made herself on the way.

When she reached the first hills of the Italic Mountains, she had a last view back on the skyline of her hometown Bookmarksgrove, the headline of Alphabet Village and the subline of her own road, the Line Lane.

Pityful a rethoric question ran over her cheek, then"""

# Each blank-line-separated paragraph of the corpus becomes one document.
doc_list = documents.split("\n\n")

# Label every document with its position in the corpus. The labels are derived
# from len(doc_list) rather than a hard-coded count: zip() stops at the shorter
# input, so a fixed range would silently drop paragraphs if the text grew.
df = pd.DataFrame(
    data=zip(doc_list, range(len(doc_list))),
    columns=["text", "target"],
)

# Plain NumPy view of the text column, used by the tokenization cells.
arr = df["text"].to_numpy()

print(arr.shape)
df

(9,)


Unnamed: 0,text,target
0,"Far far away, behind the word mountains, far f...",0
1,Separated they live in Bookmarksgrove right at...,1
2,A small river named Duden flows by their place...,2
3,"It is a paradisematic country, in which roaste...",3
4,Even the all-powerful Pointing has no control ...,4
5,"The Big Oxmox advised her not to do so, becaus...",5
6,"She packed her seven versalia, put her initial...",6
7,When she reached the first hills of the Italic...,7
8,Pityful a rethoric question ran over her cheek...,8


In [3]:
# Flatten every document into one token stream, and repeat each document's
# target once per token so `tokens` and `targets` stay aligned index-by-index.

tokens, targets = [], []
for text, label in zip(arr, df['target']):
    words = word_tokenize(text)
    tokens.extend(words)
    targets.extend([label] * len(words))

# Show the size and the first 50 entries of each aligned sequence.
for seq in (tokens, targets):
    print(len(seq), "\n", seq[:50], "\n")

# Frequency of every distinct token (punctuation and stop words included).
counts = Counter(tokens)
print(len(counts), "\n", dict(counts))

223 
 ['Far', 'far', 'away', ',', 'behind', 'the', 'word', 'mountains', ',', 'far', 'from', 'the', 'countries', 'Vokalia', 'and', 'Consonantia', ',', 'there', 'live', 'the', 'blind', 'texts', '.', 'Separated', 'they', 'live', 'in', 'Bookmarksgrove', 'right', 'at', 'the', 'coast', 'of', 'the', 'Semantics', ',', 'a', 'large', 'language', 'ocean', '.', 'A', 'small', 'river', 'named', 'Duden', 'flows', 'by', 'their', 'place'] 

223 
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2] 

147 
 {'Far': 1, 'far': 3, 'away': 1, ',': 13, 'behind': 1, 'the': 19, 'word': 1, 'mountains': 1, 'from': 1, 'countries': 1, 'Vokalia': 1, 'and': 5, 'Consonantia': 1, 'there': 2, 'live': 2, 'blind': 3, 'texts': 2, '.': 8, 'Separated': 1, 'they': 1, 'in': 2, 'Bookmarksgrove': 2, 'right': 1, 'at': 1, 'coast': 1, 'of': 10, 'Semantics': 1, 'a': 5, 'large': 1, 'language': 1, 'ocean': 1, 'A': 1, 'small': 2, 'river':

In [4]:
# Same token/target flattening as before, but drop stop words first.
# Membership is an exact-match test on the raw token, so capitalised forms
# such as 'A' or 'It' are not filtered in this cell.

# NLTK's English stop-word list plus a few punctuation tokens.
eng_stopwords = set(stopwords.words('english')) | {".", ",", "?", "!"}
print("lenth of stopwords: ", len(eng_stopwords), "\n")

tokens, targets = [], []
for text, label in zip(arr, df['target']):
    kept = [word for word in word_tokenize(text) if word not in eng_stopwords]
    tokens.extend(kept)
    # one target entry per surviving token keeps both lists aligned
    targets.extend([label] * len(kept))

# Show the size and the first 50 entries of each aligned sequence.
for seq in (tokens, targets):
    print(len(seq), "\n", seq[:50], "\n")

# Frequency of every distinct token that survived filtering.
counts = Counter(tokens)
print(len(counts), "\n", dict(counts))

lenth of stopwords:  183 

114 
 ['Far', 'far', 'away', 'behind', 'word', 'mountains', 'far', 'countries', 'Vokalia', 'Consonantia', 'live', 'blind', 'texts', 'Separated', 'live', 'Bookmarksgrove', 'right', 'coast', 'Semantics', 'large', 'language', 'ocean', 'A', 'small', 'river', 'named', 'Duden', 'flows', 'place', 'supplies', 'necessary', 'regelialia', 'It', 'paradisematic', 'country', 'roasted', 'parts', 'sentences', 'fly', 'mouth', 'Even', 'all-powerful', 'Pointing', 'control', 'blind', 'texts', 'almost', 'unorthographic', 'life', 'One'] 

114 
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4] 

106 
 {'Far': 1, 'far': 3, 'away': 1, 'behind': 1, 'word': 1, 'mountains': 1, 'countries': 1, 'Vokalia': 1, 'Consonantia': 1, 'live': 2, 'blind': 3, 'texts': 2, 'Separated': 1, 'Bookmarksgrove': 2, 'right': 1, 'coast': 1, 'Semantics': 1, 'large': 1, 'language': 1, 'ocean': 1, 'A': 1, 'small

In [5]:
# Lowercase each document before tokenizing, then drop stop words, so that
# capitalised variants are matched against the stop-word set as well.

# NLTK's English stop-word list plus a few punctuation tokens.
eng_stopwords = set(stopwords.words('english')) | {".", ",", "?", "!"}
print("lenth of stopwords: ", len(eng_stopwords), "\n")

tokens, targets = [], []
for text, label in zip(arr, df['target']):
    words = word_tokenize(text.lower())
    kept = [word for word in words if word not in eng_stopwords]
    tokens.extend(kept)
    # one target entry per surviving token keeps both lists aligned
    targets.extend([label] * len(kept))

# Show the size and the first 50 entries of each aligned sequence.
for seq in (tokens, targets):
    print(len(seq), "\n", seq[:50], "\n")

# Frequency of every distinct lowercased token that survived filtering.
counts = Counter(tokens)
print(len(counts), "\n", dict(counts))

lenth of stopwords:  183 

109 
 ['far', 'far', 'away', 'behind', 'word', 'mountains', 'far', 'countries', 'vokalia', 'consonantia', 'live', 'blind', 'texts', 'separated', 'live', 'bookmarksgrove', 'right', 'coast', 'semantics', 'large', 'language', 'ocean', 'small', 'river', 'named', 'duden', 'flows', 'place', 'supplies', 'necessary', 'regelialia', 'paradisematic', 'country', 'roasted', 'parts', 'sentences', 'fly', 'mouth', 'even', 'all-powerful', 'pointing', 'control', 'blind', 'texts', 'almost', 'unorthographic', 'life', 'one', 'day', 'however'] 

109 
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4] 

95 
 {'far': 4, 'away': 1, 'behind': 1, 'word': 1, 'mountains': 2, 'countries': 1, 'vokalia': 1, 'consonantia': 1, 'live': 2, 'blind': 4, 'texts': 2, 'separated': 1, 'bookmarksgrove': 2, 'right': 1, 'coast': 1, 'semantics': 1, 'large': 1, 'language': 1, 'ocean': 1, 'small': 2, 'river