# Natural Language Processing

In [15]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np

In [8]:
# Load the 20 Newsgroups dataset using the loader's default arguments;
# the next cell uses it to list the available category names.
newsgroups=fetch_20newsgroups()

In [9]:
# Display the category (newsgroup) names of the loaded dataset.
newsgroups.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [10]:
# Restrict the corpus to four newsgroups so the examples stay small.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Parts of each post passed to the loader's `remove` option.
remove = (
    'headers',
    'footers',
    'quotes',
)

# Training split, limited to the categories above.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=remove,
)

In [11]:
# Print the first three training posts. The dashed separator is appended
# directly after each post's text, followed by a newline.
print("----------------------------\n".join(newsgroups_train.data[:3]))

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych----------------------------


Seems to be, barring evidence to the contrary, that Koresh was simply
another deranged fanatic who thought it neccessary to take a whole bunch of
folks with him, children and all, to satisfy his delusional mania. Jim
Jones, circa 1993.


Nope - fruitcakes like Koresh have been demonstrating such evil corruption
for centuries.----------------------------

 >In

In [16]:
# Look up the category name of each of the first three posts by indexing
# an array of target names with their target values.
print(np.array(newsgroups_train.target_names)[newsgroups_train.target[:3]])

['comp.graphics' 'talk.religion.misc' 'sci.space']


# Tokenizer

## Simple Split function

In [17]:
# Naive tokenization: split the second post on whitespace and show the first
# 10 pieces. Punctuation stays attached to words (e.g. 'be,' in the output).
newsgroups_train.data[1].split()[:10]

['Seems',
 'to',
 'be,',
 'barring',
 'evidence',
 'to',
 'the',
 'contrary,',
 'that',
 'Koresh']

## NLTK

In [18]:
import nltk

In [19]:
# Tokenize the same post with NLTK and show the first 10 tokens; here the
# commas come out as separate tokens.
nltk.word_tokenize(newsgroups_train.data[1])[:10]

['Seems', 'to', 'be', ',', 'barring', 'evidence', 'to', 'the', 'contrary', ',']

In [20]:
# A tweet-style string containing an @-mention, a hashtag and an emoticon,
# used to show how word_tokenize splits such text.
mytweet = "@john lol that was #awesome :)"
nltk.word_tokenize(mytweet)

['@', 'john', 'lol', 'that', 'was', '#', 'awesome', ':', ')']

In [21]:
#Although this behavior might be desirable in some cases, we would most likely prefer @ and john to be tokenized together as @john,
#and # and awesome to be tokenized together as #awesome.
#This is because we’d expect word usage in the context of hashtags or at-mentions to differ from usage in plain text.
#Moreover, we would prefer : and ) to be tokenized together as :), since :) is certainly more informative (e.g. for sentiment analysis) than the sum of its parts.

# For custom tokenizations we use Regular Expressions which we will visit later

## Spacy

In [22]:
# pip install spacy
# ! python -m spacy download en_core_web_sm

In [23]:
import spacy
# Load the "en_core_web_sm" spaCy pipeline; the model must be installed first
# (see the download command in the previous cell).
nlp = spacy.load("en_core_web_sm")

In [24]:
# Run the second post through the spaCy pipeline and collect the text of
# every token it produces; print the first 10.
doc = nlp(newsgroups_train.data[1])
tokens = [tok.text for tok in doc]
print(tokens[:10])

['\n\n', 'Seems', 'to', 'be', ',', 'barring', 'evidence', 'to', 'the', 'contrary']


# Sentence Tokenization

## NLTK

In [25]:
from nltk.tokenize import sent_tokenize

In [26]:
# Show the raw text of the second post (newlines included) for reference.
newsgroups_train.data[1]

'\n\nSeems to be, barring evidence to the contrary, that Koresh was simply\nanother deranged fanatic who thought it neccessary to take a whole bunch of\nfolks with him, children and all, to satisfy his delusional mania. Jim\nJones, circa 1993.\n\n\nNope - fruitcakes like Koresh have been demonstrating such evil corruption\nfor centuries.'

In [30]:
# Split the second post into sentences with NLTK's sent_tokenize.
sent_tokenize(newsgroups_train.data[1])

['\n\nSeems to be, barring evidence to the contrary, that Koresh was simply\nanother deranged fanatic who thought it neccessary to take a whole bunch of\nfolks with him, children and all, to satisfy his delusional mania.',
 'Jim\nJones, circa 1993.',
 'Nope - fruitcakes like Koresh have been demonstrating such evil corruption\nfor centuries.']

## Spacy

In [28]:
# Split the second post into sentences by iterating over the spans in
# doc.sents and keeping each span's text.
doc = nlp(newsgroups_train.data[1])
sentence = [span.text for span in doc.sents]
sentence

['\n\nSeems to be, barring evidence to the contrary, that Koresh was simply\nanother deranged fanatic who thought it neccessary to take a whole bunch of\nfolks with him, children and all, to satisfy his delusional mania.',
 'Jim\nJones, circa 1993.\n\n\n',
 'Nope - fruitcakes like Koresh have been demonstrating such evil corruption\nfor centuries.']

# POS Tagging

## NLTK

In [34]:
# Fetch the tagger model used by nltk.pos_tag. The call must actually run:
# the recorded output of this cell is its download log, and on a fresh
# environment the tagging cell below fails without this resource.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [32]:
from nltk import pos_tag

In [37]:
# POS-tag the whitespace-split words of the second post.
# NOTE(review): .split() leaves punctuation attached to words ('be,', 'him,',
# 'mania.'), so those tokens are tagged as a whole; tokenizing with
# nltk.word_tokenize first would give the tagger cleaner input.
tokens_tag = pos_tag(newsgroups_train.data[1].split())
print(tokens_tag)

[('Seems', 'NNS'), ('to', 'TO'), ('be,', 'VB'), ('barring', 'VBG'), ('evidence', 'NN'), ('to', 'TO'), ('the', 'DT'), ('contrary,', 'NN'), ('that', 'IN'), ('Koresh', 'NNP'), ('was', 'VBD'), ('simply', 'RB'), ('another', 'DT'), ('deranged', 'VBN'), ('fanatic', 'NN'), ('who', 'WP'), ('thought', 'VBD'), ('it', 'PRP'), ('neccessary', 'JJ'), ('to', 'TO'), ('take', 'VB'), ('a', 'DT'), ('whole', 'JJ'), ('bunch', 'NN'), ('of', 'IN'), ('folks', 'NNS'), ('with', 'IN'), ('him,', 'JJ'), ('children', 'NNS'), ('and', 'CC'), ('all,', 'NN'), ('to', 'TO'), ('satisfy', 'VB'), ('his', 'PRP$'), ('delusional', 'JJ'), ('mania.', 'NN'), ('Jim', 'NNP'), ('Jones,', 'NNP'), ('circa', 'VBD'), ('1993.', 'CD'), ('Nope', 'NNP'), ('-', ':'), ('fruitcakes', 'NNS'), ('like', 'IN'), ('Koresh', 'NNP'), ('have', 'VBP'), ('been', 'VBN'), ('demonstrating', 'VBG'), ('such', 'JJ'), ('evil', 'JJ'), ('corruption', 'NN'), ('for', 'IN'), ('centuries.', 'NN')]


## Spacy

In [39]:
# Pair every spaCy token's text with its part-of-speech label (token.pos_).
doc = nlp(newsgroups_train.data[1])
pos_tags = [(tok.text, tok.pos_) for tok in doc]
pos_tags

[('\n\n', 'SPACE'),
 ('Seems', 'VERB'),
 ('to', 'PART'),
 ('be', 'AUX'),
 (',', 'PUNCT'),
 ('barring', 'VERB'),
 ('evidence', 'NOUN'),
 ('to', 'ADP'),
 ('the', 'DET'),
 ('contrary', 'NOUN'),
 (',', 'PUNCT'),
 ('that', 'SCONJ'),
 ('Koresh', 'PROPN'),
 ('was', 'AUX'),
 ('simply', 'ADV'),
 ('\n', 'SPACE'),
 ('another', 'DET'),
 ('deranged', 'ADJ'),
 ('fanatic', 'NOUN'),
 ('who', 'PRON'),
 ('thought', 'VERB'),
 ('it', 'PRON'),
 ('neccessary', 'ADJ'),
 ('to', 'PART'),
 ('take', 'VERB'),
 ('a', 'DET'),
 ('whole', 'ADJ'),
 ('bunch', 'NOUN'),
 ('of', 'ADP'),
 ('\n', 'SPACE'),
 ('folks', 'NOUN'),
 ('with', 'ADP'),
 ('him', 'PRON'),
 (',', 'PUNCT'),
 ('children', 'NOUN'),
 ('and', 'CCONJ'),
 ('all', 'DET'),
 (',', 'PUNCT'),
 ('to', 'PART'),
 ('satisfy', 'VERB'),
 ('his', 'DET'),
 ('delusional', 'ADJ'),
 ('mania', 'NOUN'),
 ('.', 'PUNCT'),
 ('Jim', 'PROPN'),
 ('\n', 'SPACE'),
 ('Jones', 'PROPN'),
 (',', 'PUNCT'),
 ('circa', 'NOUN'),
 ('1993', 'NUM'),
 ('.', 'PUNCT'),
 ('\n\n\n', 'SPACE'),
 ('Nope

# Stemmer

In [41]:
from nltk.stem import PorterStemmer
# Porter stemmer instance, used by the stemming cell below.
ps = PorterStemmer()

In [42]:
# Stem the first 10 NLTK tokens of the second post and keep
# (original token, stem) pairs for side-by-side comparison.
stemmed = [
    (word, ps.stem(word))
    for word in nltk.word_tokenize(newsgroups_train.data[1])[:10]
]
stemmed

[('Seems', 'seem'),
 ('to', 'to'),
 ('be', 'be'),
 (',', ','),
 ('barring', 'bar'),
 ('evidence', 'evid'),
 ('to', 'to'),
 ('the', 'the'),
 ('contrary', 'contrari'),
 (',', ',')]

# Lemmatizer

## NLTK

In [46]:
from nltk.stem import WordNetLemmatizer
# WordNet-based lemmatizer instance, used by the lemmatization cell below.
wordnet_lemmatizer = WordNetLemmatizer()

In [48]:
# Fetch the WordNet corpus used by WordNetLemmatizer. The call must actually
# run: the recorded output of this cell is its download log, and on a fresh
# environment the lemmatization cell below fails without this resource.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [49]:
# Lemmatize the first 10 NLTK tokens of the second post and keep
# (original token, lemma) pairs.
# NOTE(review): lemmatize() is called without a part of speech, which is why
# verb forms such as 'Seems' and 'barring' come back unchanged in the output.
lemmatized_nltk = [
    (word, wordnet_lemmatizer.lemmatize(word))
    for word in nltk.word_tokenize(newsgroups_train.data[1])[:10]
]
lemmatized_nltk

[('Seems', 'Seems'),
 ('to', 'to'),
 ('be', 'be'),
 (',', ','),
 ('barring', 'barring'),
 ('evidence', 'evidence'),
 ('to', 'to'),
 ('the', 'the'),
 ('contrary', 'contrary'),
 (',', ',')]

## Spacy

In [45]:
# Pair every spaCy token's text with its lemma (token.lemma_).
# NOTE(review): [:50] takes the first 50 *characters* of the post, not the
# first 50 tokens; it cuts the word "that" after its first letter, which is
# where the trailing ('t', 't') entry in the output comes from.
doc = nlp(newsgroups_train.data[1][:50])
lemmatized = [(tok.text, tok.lemma_) for tok in doc]
lemmatized

[('\n\n', '\n\n'),
 ('Seems', 'seem'),
 ('to', 'to'),
 ('be', 'be'),
 (',', ','),
 ('barring', 'bar'),
 ('evidence', 'evidence'),
 ('to', 'to'),
 ('the', 'the'),
 ('contrary', 'contrary'),
 (',', ','),
 ('t', 't')]

# Stop Words

In [50]:
from spacy.lang.en.stop_words import STOP_WORDS

In [53]:
# Collect the token texts that the stop-word filter in the next cell works on.
# NOTE(review): [:50] slices characters, not tokens, so the last token is the
# truncated fragment 't' (from "that").
doc = nlp(newsgroups_train.data[1][:50])
token_list = [tok.text for tok in doc]

In [55]:
# Keep only the tokens whose vocabulary entry is not flagged as a stop word.
filtered_sentence = [
    word
    for word in token_list
    if not nlp.vocab[word].is_stop
]
filtered_sentence

['\n\n', ',', 'barring', 'evidence', 'contrary', ',', 't']

# References

https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html#sklearn.datasets.fetch_20newsgroups

https://drive.google.com/file/d/1pWesiV90JdgqxpfkwRfuaf_PSM53-4hZ/view

https://github.com/hb20007/hands-on-nltk-tutorial/blob/master/1-2-Text-Analysis-Using-nltk.text.ipynb

https://github.com/hb20007/hands-on-nltk-tutorial/blob/master/3-4-Parts-of-Speech-and-Meaning.ipynb

https://spacy.io/usage/spacy-101

https://www.nltk.org/