In [349]:
# libraries
import os
import nltk
import nltk.corpus

In [350]:
# Resolve the "corpora" resource through NLTK's data path, then list what is in it.
corpora_dir = nltk.data.find("corpora")
print(os.listdir(corpora_dir))

In [351]:
# File ids of the texts available through NLTK's Gutenberg corpus reader.
nltk.corpus.gutenberg.fileids()

In [352]:
# Word-level view of the Hamlet text from the Gutenberg corpus; the bare name
# on the last line displays it as the cell output.
hamlet = nltk.corpus.gutenberg.words('shakespeare-hamlet.txt')
hamlet

In [353]:
# Show the first 500 tokens on a single line, each one followed by one space.
print("".join(f"{token} " for token in hamlet[:500]), end="")

In [354]:
# Sample paragraph about artificial intelligence used by the tokenisation cells.
# Adjacent string literals are concatenated at compile time, so the value is
# still one single-line string; each literal below holds one sentence.
AI = (
    "Artificial intelligence (AI) refers to the simulation of human intelligence in machines that are programmed to think like humans and mimic their actions. "
    "The term may also be applied to any machine that exhibits traits associated with a human mind such as learning and problem-solving. "
    "The ideal characteristic of artificial intelligence is its ability to rationalize and take actions that have the best chance of achieving a specific goal. "
    "A subset of artificial intelligence is machine learning, which refers to the concept that computer programs can automatically learn from and adapt to new data without being assisted by humans. "
    "Deep learning techniques enable this automatic learning through the absorption of huge amounts of unstructured data such as text, images, or video."
)

In [355]:
# Confirm that the paragraph is held as a plain Python string.
type(AI)

In [356]:
from nltk.tokenize import word_tokenize

In [357]:
# Split the paragraph into word and punctuation tokens and display them.
AI_tokens = word_tokenize(AI)
AI_tokens

In [358]:
# Number of tokens produced by word_tokenize (punctuation tokens included).
len(AI_tokens)

In [359]:
from nltk.probability import FreqDist
# Start with an empty frequency distribution; a later cell fills it with counts.
fdist = FreqDist()

In [360]:
# Count lower-cased tokens so that e.g. "The" and "the" share one entry.
# The distribution is built in a single step and `fdist` is rebound here,
# instead of incrementing an instance created in another cell: re-running this
# cell therefore no longer double-counts every token.
fdist = FreqDist(token.lower() for token in AI_tokens)
fdist

In [361]:
# The ten most frequent lower-cased tokens as (token, count) pairs.
fdist_top10 = fdist.most_common(10)
fdist_top10

In [362]:
# How many paragraphs does the text contain?
# blankline_tokenize splits on blank lines; the AI string above is a single
# line, so one paragraph is expected here.
from nltk.tokenize import blankline_tokenize
AI_blank = blankline_tokenize(AI)
len(AI_blank)

In [363]:
from nltk.util import bigrams, trigrams, ngrams

In [364]:
# Example sentence for the n-gram cells, tokenised into words and punctuation.
string = "The best and most beautiful things in the world cannot be seen or even touched, they must be felt with the heart"
quotes_tokens = nltk.word_tokenize(string)
quotes_tokens

In [365]:
# All pairs of adjacent tokens; list() materialises the result for display.
quotes_bigrams = list(nltk.bigrams(quotes_tokens))
quotes_bigrams

In [366]:
# All runs of three adjacent tokens.
quotes_trigrams = list(nltk.trigrams(quotes_tokens))
quotes_trigrams

In [367]:
# General n-grams: the second argument is the window size (4 here; 5, 6, ... work the same way).
quotes_ngrams = list(nltk.ngrams(quotes_tokens, 4)) # we can check 4, 5 etc tokens too with the parameter
quotes_ngrams

In [368]:
# Stemming
## Normalizes words into their base (root) form

In [369]:
from nltk.stem import PorterStemmer
# Porter stemmer instance reused by the stemming cells that follow.
pst = PorterStemmer()

In [370]:
# Stem a single word with the Porter stemmer.
pst.stem("having")

In [371]:
# Inflected forms of "give", printed as "<word>:<Porter stem>".
words_to_stem = ["give", "giving", "given", "gave"]
for word in words_to_stem:
    print(f"{word}:{pst.stem(word)}")

In [372]:
# LancasterStemmer is more aggressive than PorterStemmer
from nltk.stem import LancasterStemmer
lst = LancasterStemmer()
# Same word list as before, printed as "<word>:<Lancaster stem>".
for word in words_to_stem:
    print(f"{word}:{lst.stem(word)}")

In [373]:
# Lemmatization
## Groups together the different inflected forms of a word under one base form, called the lemma
## Somewhat similar to stemming, as it maps several words onto one common root
## The output of lemmatization is a proper word
## For example, a lemmatizer should map gone, going and went to go

In [374]:
from nltk.stem import wordnet
from nltk.stem import WordNetLemmatizer
# WordNet-based lemmatizer instance reused by the cells that follow.
word_lem = WordNetLemmatizer()

In [375]:
# Lemmatize a single word; no part of speech is passed, so the default is used.
word_lem.lemmatize("corpora")

In [376]:
# WordNetLemmatizer.lemmatize treats its input as a noun unless told otherwise,
# which leaves verb forms such as "giving" or "gave" unchanged. The words in
# this list are all verb forms, so pass pos="v" to reduce them to their lemma.
for words in words_to_stem:
    print(words + ":" + word_lem.lemmatize(words, pos="v"))

In [377]:
from nltk.corpus import stopwords

In [378]:
# NLTK's English stop-word list.
stopwords.words("english")

In [379]:
# Number of entries in the English stop-word list.
len(stopwords.words("english"))

In [380]:
# Re-display the ten most common tokens computed earlier for reference.
fdist_top10

In [381]:
import re
# Matches one character that is a hyphen, period, question mark, exclamation
# mark, comma, colon, semicolon, parenthesis, pipe or ASCII digit.
punctuation = re.compile(r'[-.?!,:;()|0-9]')

In [382]:
# Remove every character matched by `punctuation` from each token, then keep
# only the tokens that still contain at least one character.
stripped_tokens = (punctuation.sub("", token) for token in AI_tokens)
post_punctuation = [token for token in stripped_tokens if len(token) > 0]

In [383]:
# Tokens that remain after punctuation and digits were stripped.
post_punctuation

In [384]:
# Token count after stripping punctuation and digits.
len(post_punctuation)

In [385]:
# POS: Parts of Speech
# Example sentence, tokenised for the tagging cell that follows.
sent = "Timothy is a natural when it comes to drawing"
sent_tokens = word_tokenize(sent)

In [386]:
# Tag the whole sentence in one call: the tagger takes neighbouring tokens into
# account, so tagging one-word lists in isolation discards that context and
# gives less reliable tags. Each (token, tag) pair is still printed as a
# one-element list, matching the previous output format.
for tagged_token in nltk.pos_tag(sent_tokens):
    print([tagged_token])

In [387]:
# Second tagging example. The sentence is tagged as a whole so the tagger can
# use the surrounding tokens as context (tagging one-word lists in isolation
# loses it); each (token, tag) pair is printed as a one-element list as before.
sent2 = "John is eating a delicious cake"
sent2_tokens = word_tokenize(sent2)
for tagged_token in nltk.pos_tag(sent2_tokens):
    print([tagged_token])

In [388]:
# What is Named Entity Recognition?
# NER
from nltk import ne_chunk

In [389]:
# Example sentence for the named-entity chunking cells.
NE_sent = "The US President stays in the WHITE HOUSE"

In [390]:
# ne_chunk is given POS-tagged tokens below, so tokenise and tag the sentence first.
NE_tokens = word_tokenize(NE_sent)
NE_tags = nltk.pos_tag(NE_tokens)

In [391]:
# Chunk the tagged tokens into a tree with named-entity subtrees and print it
# in its bracketed text form.
NE_NER = ne_chunk(NE_tags)
print(NE_NER)

In [392]:
# Syntax: Principles, rules, process
### A syntax tree is a tree representation of the syntactic structure of sentences or strings

In [393]:
# Chunking
### Picking up individual pieces of information and grouping them into bigger pieces

In [394]:
# Example sentence for chunking; note that `new_tokens` holds (token, POS tag)
# pairs, not bare tokens.
new = "The big cat ate the little mouse who was after fresh cheese"
new_tokens = nltk.pos_tag(word_tokenize(new))
new_tokens

In [395]:
# Chunk grammar with one rule: an NP is an optional determiner (DT), any number
# of adjectives (JJ), then a single NN-tagged noun.
grammar_np = r"NP: {<DT>?<JJ>*<NN>}"

In [396]:
# Build a regular-expression chunk parser from the NP grammar.
chunk_parser = nltk.RegexpParser(grammar_np)

In [397]:
# Apply the grammar to the tagged sentence; the result is displayed as the cell output.
chunk_result = chunk_parser.parse(new_tokens)
chunk_result

In [398]:
import pandas as pd
import numpy as np

In [399]:
from sklearn.feature_extraction.text import CountVectorizer

In [400]:
# List the locally available corpora again (same call as near the top of the
# notebook) before loading movie_reviews.
print(os.listdir(nltk.data.find("corpora")))

In [401]:
from nltk.corpus import movie_reviews 

In [402]:
# Category labels defined by the movie_reviews corpus.
print(movie_reviews.categories())

In [403]:
# Print the number of positive-review files, a spacer line, then the file ids.
pos_file_ids = movie_reviews.fileids("pos")
print(len(pos_file_ids), " ", pos_file_ids, sep="\n")

In [404]:
# File ids of all negative reviews and how many there are.
# (The count previously referenced the misspelled name `neg_rew`, which raised
# a NameError.)
neg_rev = movie_reviews.fileids("neg")
len(neg_rev)

In [405]:
# Peek at the tokens of a single positive review.
rev = nltk.corpus.movie_reviews.words("pos/cv000_29590.txt")
rev

In [406]:
# The code below can also be used instead of CountVectorizer in case of any issues with it.

In [407]:
# Accumulator for the review texts; later cells append one string per review.
rev_list = []

In [408]:
# Rebuild every negative review as one string and collect them in `rev_list`.
# The list is (re)initialised here so that re-running this cell does not append
# the negative reviews a second time.
rev_list = []
for neg_fileid in neg_rev:
    # Use a dedicated loop name: the loop variable used to be reassigned to the
    # token view inside the loop body.
    rev_text_neg = nltk.corpus.movie_reviews.words(neg_fileid)
    review_one_string = " ".join(rev_text_neg)
    # Re-attach punctuation that the join separated from the preceding word.
    # (A former replace("\'", "'") step was dropped: both arguments are the
    # same one-character string, so it never changed anything.)
    review_one_string = review_one_string.replace(" ,", ",")
    review_one_string = review_one_string.replace(" .", ".")
    review_one_string = review_one_string.replace(" '", "'")
    rev_list.append(review_one_string)

In [409]:
# Number of review strings collected so far.
len(rev_list)

In [410]:
# File ids of all positive reviews.
pos_rev = movie_reviews.fileids("pos")

In [411]:
# Rebuild every positive review as one string and append it to `rev_list`.
for rev_pos in pos_rev:
    rev_text_pos = nltk.corpus.movie_reviews.words(rev_pos)
    # Join the tokens of THIS positive review. The join used to read
    # `rev_text_neg`, so every "positive" document was a copy of the last
    # negative review.
    review_one_string = " ".join(rev_text_pos)
    # Re-attach punctuation that the join separated from the preceding word.
    # (A former replace("\'", "'") step was dropped: both arguments are the
    # same one-character string, so it never changed anything.)
    review_one_string = review_one_string.replace(" ,", ",")
    review_one_string = review_one_string.replace(" .", ".")
    review_one_string = review_one_string.replace(" '", "'")
    rev_list.append(review_one_string)

In [412]:
len(rev_list) # total number of reviews collected: negative ones first, then positive ones

In [413]:
# Class labels: 0 for each negative review, 1 for each positive review.
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
neg_targets = np.zeros((1000, ), dtype = int)
pos_targets = np.ones((1000, ), dtype = int)

In [414]:
# Labels in the same order as the reviews: all negative labels, then all positive ones.
target_list = [*neg_targets, *pos_targets]


In [415]:
# Should equal len(rev_list): one label per review.
len(target_list)

In [416]:
# Target vector as a pandas Series.
y = pd.Series(target_list)

In [417]:
# Confirm the target is a pandas Series.
type(y)

In [418]:
# First few labels; these come from the negative block, which was appended first.
y.head()

In [419]:
from sklearn.feature_extraction.text import CountVectorizer

In [420]:
# Bag-of-words vectorizer: lower-cases the text, drops scikit-learn's built-in
# English stop words, and ignores terms found in fewer than 2 documents (min_df = 2).
count_vect = CountVectorizer(lowercase = True, stop_words = "english", min_df = 2)

In [421]:
# Learn the vocabulary from all reviews and build the document-term count matrix.
X_count_vect = count_vect.fit_transform(rev_list)

In [422]:
# (number of documents, vocabulary size)
X_count_vect.shape

In [423]:
# Vocabulary terms in column order.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when it exists and fall back only on older
# releases. The array is converted to a list so X_names keeps its former type.
if hasattr(count_vect, "get_feature_names_out"):
    X_names = count_vect.get_feature_names_out().tolist()
else:
    X_names = count_vect.get_feature_names()
X_names

In [424]:
# Convert the sparse count matrix into a dense DataFrame with one column per term.
# The name X_count_vect is rebound, so the conversion is guarded: when this
# cell is re-run the object is already a DataFrame (which has no `toarray`)
# and is left untouched instead of raising AttributeError.
if hasattr(X_count_vect, "toarray"):
    X_count_vect = pd.DataFrame(X_count_vect.toarray(), columns = X_names)

In [425]:
# The shape should be unchanged by the conversion to a DataFrame.
X_count_vect.shape

In [426]:
# Preview the first rows of the document-term table.
X_count_vect.head()

In [427]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [428]:
# Hold out 25% of the reviews for evaluation; the fixed random_state makes the
# split reproducible.
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(
    X_count_vect,
    y,
    test_size=0.25,
    random_state=5,
)

In [429]:
# Size of the training split (rows, features).
X_train_cv.shape

In [430]:
# Size of the held-out test split (rows, features).
X_test_cv.shape

In [431]:
from sklearn.naive_bayes import GaussianNB

In [432]:
# Gaussian Naive Bayes: fit on the training split, then predict the test split.
gnb = GaussianNB()
gnb.fit(X_train_cv, y_train_cv)
y_pred_gnb = gnb.predict(X_test_cv)

In [433]:
from sklearn.naive_bayes import MultinomialNB

In [434]:
# Multinomial Naive Bayes classifier with default settings.
clf_cv = MultinomialNB()

In [435]:
# Fit the classifier on the training split.
clf_cv.fit(X_train_cv, y_train_cv)

In [436]:
# Predict labels for the held-out reviews and check the type of the result.
y_pred_cv = clf_cv.predict(X_test_cv)
type(y_pred_cv)

In [437]:
# Fraction of test reviews whose predicted label matches the true label.
print(metrics.accuracy_score(y_test_cv, y_pred_cv))

In [438]:
# Confusion matrix of the MultinomialNB predictions: rows are true labels,
# columns are predicted labels.
score_clf_cv = confusion_matrix(y_test_cv, y_pred_cv)
score_clf_cv

# Reference
https://www.youtube.com/watch?v=05ONoGfmKvA