In [1]:
from nltk.corpus import brown

In [2]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [3]:
data = brown.sents(categories='editorial')[:100]
print(type(data), len(data))
print(data)
print(len(data))

<class 'nltk.collections.LazySubsequence'> 100
[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]
100


In [5]:
text = "It was a very pleasant day, the weather was cool and there were showers. I went to market to buy some fruits."

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [7]:
sents = sent_tokenize(text)
sents

['It was a very pleasant day, the weather was cool and there were showers.',
 'I went to market to buy some fruits.']

In [8]:
word_list = word_tokenize(sents[0].lower())

In [9]:
word_list

['it',
 'was',
 'a',
 'very',
 'pleasant',
 'day',
 ',',
 'the',
 'weather',
 'was',
 'cool',
 'and',
 'there',
 'were',
 'showers',
 '.']

In [10]:
from nltk.corpus import stopwords

In [11]:
sw = set(stopwords.words('english'))

In [12]:
print(sw,len(sw))

{'during', 'until', 'the', 'he', 'once', 'who', 'ma', 'but', 'out', 'won', 'being', 'why', 'will', "didn't", 'if', 'weren', "she's", 'theirs', 'before', 'after', 'aren', 'doesn', 'do', 'our', 'with', 'own', 'don', 'wouldn', 'i', 'while', 'in', 'yours', 'had', 'hers', 'mustn', 'that', 'all', "won't", 'off', 'under', "hadn't", "weren't", "you'd", 'there', 'm', 'these', 'mightn', 've', 'you', 'are', "shan't", 'didn', 'your', 'having', 'few', 'my', 'themselves', 'o', 'no', "mustn't", 'through', "wouldn't", 'myself', 'against', 'yourselves', 'it', 'such', 'me', 'some', 'both', 'hasn', 'any', 'couldn', 'shouldn', 'more', "that'll", 'yourself', 'have', 'did', 'can', 'll', "you'll", 'her', 'which', "isn't", "hasn't", 'were', 'them', 'then', 'just', 'has', 'here', 'each', 'most', 'above', "you've", "it's", 'so', "you're", 'is', 'should', 'haven', 'than', 'further', 'ourselves', 'how', 'not', 'when', 'over', 'of', 'needn', 'for', 'was', 'up', 'am', 'an', "needn't", 'shan', 'ain', 'his', 'whom', 

In [13]:
def filter_words(word_list):
    useful_words = [w for w in word_list if w not in sw]
    return useful_words

In [14]:
useful_words = filter_words(word_list)
useful_words

['pleasant', 'day', ',', 'weather', 'cool', 'showers', '.']

In [15]:
from nltk.tokenize import RegexpTokenizer

In [16]:
tokenizer = RegexpTokenizer("[a-zA-Z0-9]+")

In [17]:
sents = "send the 50 documents to abc, def, ghi."
print(tokenizer.tokenize(sents))

['send', 'the', '50', 'documents', 'to', 'abc', 'def', 'ghi']


In [18]:
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [19]:
ps = PorterStemmer()

In [20]:
ps.stem("jumped")
ps.stem("jumping")

'jump'

In [21]:
ps.stem("lovely")

'love'

In [22]:
ps.stem("awesome")
ls = LancasterStemmer()
ls.stem("awesome")

print(ls.stem("teenager"))
print(ps.stem("teenager"))

teen
teenag


In [23]:
ss = SnowballStemmer('spanish')
ss.stem('buano')

'buan'

# bag of words

In [24]:
corpus = [
    'Indian team will win today',
    'Lockdown expected to end by May 2019',
    'Colleges ans schools are closed due to coronavirus pandemic',
    'There is nothing to talk other than corona virus'
]

In [25]:
print(corpus)

['Indian team will win today', 'Lockdown expected to end by May 2019', 'Colleges ans schools are closed due to coronavirus pandemic', 'There is nothing to talk other than corona virus']


In [27]:
def myTokenizer(sentence):
    words = tokenizer.tokenize(sentence.lower())
    return filter_words(words)

list_words = myTokenizer(corpus[0])
print(len(list_words))

4


In [28]:
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
cv = CountVectorizer(tokenizer = myTokenizer,ngram_range = (1,1) )

In [30]:
vectorized_corpus = cv.fit_transform(corpus)

In [31]:
vc = vectorized_corpus.toarray()

In [32]:
print(vc)
print(cv.vocabulary_)

[[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1]
 [1 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0]
 [0 1 1 1 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0]]
{'indian': 9, 'team': 16, 'win': 19, 'today': 17, 'lockdown': 10, 'expected': 8, 'end': 7, 'may': 11, '2019': 0, 'colleges': 3, 'ans': 1, 'schools': 14, 'closed': 2, 'due': 6, 'coronavirus': 5, 'pandemic': 13, 'nothing': 12, 'talk': 15, 'corona': 4, 'virus': 18}


In [33]:
cv.inverse_transform(vc)

[array(['indian', 'team', 'today', 'win'], dtype='<U11'),
 array(['2019', 'end', 'expected', 'lockdown', 'may'], dtype='<U11'),
 array(['ans', 'closed', 'colleges', 'coronavirus', 'due', 'pandemic',
        'schools'], dtype='<U11'),
 array(['corona', 'nothing', 'talk', 'virus'], dtype='<U11')]

# TF-IDF

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
tfidf_vectorizer = TfidfVectorizer(tokenizer = myTokenizer, ngram_range = (1,2))

In [36]:
vectorized_corpus = tfidf_vectorizer.fit_transform(corpus).toarray()
print(vectorized_corpus)
print(tfidf_vectorizer.vocabulary_)

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.37796447
  0.37796447 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.37796447 0.37796447 0.37796447 0.         0.37796447 0.37796447]
 [0.33333333 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.33333333 0.33333333 0.33333333 0.33333333 0.
  0.         0.33333333 0.33333333 0.33333333 0.33333333 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.        ]
 [0.         0.2773501  0.2773501  0.2773501  0.2773501  0.2773501
  0.2773501  0.         0.         0.2773501  0.2773501  0.2773501
  0.2773501  0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
