<a href="https://colab.research.google.com/github/MrSimple07/MachineLearning_Practice/blob/main/RandomNLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Topic identification

import nltk
import requests
from nltk.corpus import stopwords
from gensim import corpora, models
from pprint import pprint
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

With LDA

In [None]:
#Random text
document = 'It is a very broad definition that can be summed in the following way. The word technology is derived from ‘techne’ and logia, two Greek words.Techne means science. It also stands for the philosophical meaning of using scientific knowledge. Logia means the art of using skills and different  techniques to accomplish a particular motive. After combining both the Greek words, the perfect definition is derived as — The art of using specific  scientific skills and knowledge to accomplish a motive of making an approach better and easier.'

stop_words= set(stopwords.words('english'))
tokens =[word for word in nltk.word_tokenize(document.lower()) if word.isalpha() and word not in stop_words]


dictionary = corpora.Dictionary([tokens])
corpus = [dictionary.doc2bow(tokens)]


#LDA model - Latent Dirichlet Allocation
lda_model = models.LdaModel(corpus, num_topics =1, id2word = dictionary, passes =10)
pprint(lda_model.print_topics())

[(0,
  '0.047*"using" + 0.035*"knowledge" + 0.035*"means" + 0.035*"skills" + '
  '0.035*"scientific" + 0.035*"motive" + 0.035*"logia" + 0.035*"accomplish" + '
  '0.035*"definition" + 0.035*"greek"')]


NLP with spaCy

- Using for basic text processing and pattern matching
- Building machine learning models with text
- Representing text with word embeddings

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

doc = nlp('The quick brown foxes are jumping over the lazy dogs. They have been running through the fields and forests, exploring the beauty of nature.')

In [None]:
for token in doc:
  print(token)

The
quick
brown
foxes
are
jumping
over
the
lazy
dogs
.
They
have
been
running
through
the
fields
and
forests
,
exploring
the
beauty
of
nature
.


In [None]:
#Lemmatizing and stop words

for token in doc:
  print(f"{str(token)}\t\t{token.lemma_}\t\t{token.is_stop}")

The		the		True
quick		quick		False
brown		brown		False
foxes		fox		False
are		be		True
jumping		jump		False
over		over		True
the		the		True
lazy		lazy		False
dogs		dog		False
.		.		False
They		they		True
have		have		True
been		be		True
running		run		False
through		through		True
the		the		True
fields		field		False
and		and		True
forests		forest		False
,		,		False
exploring		explore		False
the		the		True
beauty		beauty		False
of		of		True
nature		nature		False
.		.		False


Pattern Matching

In [None]:
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab, attr ='LOWER')

In [None]:
terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google Pixel']
patterns = [nlp(text) for text in terms]
matcher.add("TerminologyList", patterns)

In [None]:
text_doc = nlp("Glowing review overall, and some really interesting side-by-side "
               "photography tests pitting the iPhone 11 Pro against the "
               "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3.")
matches = matcher(text_doc)
print(matches)

[(3766102292120407359, 17, 19), (3766102292120407359, 22, 24), (3766102292120407359, 30, 32), (3766102292120407359, 33, 35)]


In [None]:
match_id, start, end = matches[0]
print(nlp.vocab.strings[match_id], text_doc[start:end])

TerminologyList iPhone 11


In [None]:
#Part of Speech (POS) tagging

for token in doc:
  print(token.text, token.pos_)

The DET
quick ADJ
brown ADJ
foxes NOUN
are AUX
jumping VERB
over ADP
the DET
lazy ADJ
dogs NOUN
. PUNCT
They PRON
have AUX
been AUX
running VERB
through ADP
the DET
fields NOUN
and CCONJ
forests NOUN
, PUNCT
exploring VERB
the DET
beauty NOUN
of ADP
nature NOUN
. PUNCT


In [None]:
#NER
for ent in text_doc.ents:
  print(ent.text, ent.label_)

10 Plus and last year DATE
iPhone XS ORG
Google Pixel ORG
3 CARDINAL


In [None]:
#Word Embeddings

for token in doc:
  print(token.text, token.vector[:5])

The [ 0.85183746 -0.29544368  0.6358199   1.5570921  -0.6759108 ]
quick [ 0.8100312 -1.2707639  1.0589166  1.1848149  0.2042971]
brown [ 0.59353954 -1.1236539   0.06441484  0.52956516  0.09440499]
foxes [-0.48174232  0.93088746 -0.5199428   0.48028803  1.6327056 ]
are [-0.9376364  -0.8994892  -0.07146559 -0.15708345 -0.4286011 ]
jumping [-0.04427019  0.78260726  0.01712537  0.27085647 -1.3030365 ]
over [ 0.08426314  0.90991944  0.04415579 -1.2731304  -0.38922563]
the [ 1.0078084   1.1961887   0.66080004  1.0293958  -0.02119049]
lazy [-0.3079457  -1.1043328   0.40270567  0.05805285 -0.73500675]
dogs [-1.020669    1.4565394   0.8399615   0.56590575  0.24345604]
. [-0.0751619  -0.62901324 -0.07042383 -1.621114   -0.543389  ]
They [-1.6231596  -0.48582357 -0.6938442   0.03227412 -0.39192843]
have [-0.32260102 -0.44645384  0.35678056 -0.99264956  0.360287  ]
been [-0.21974844 -0.17340861  0.6379559   0.29546565 -1.5078602 ]
running [-0.47258344  0.65876794  0.8013135   0.5232028  -0.6259217

NLTK

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

BoW and using it for Classification of the text

In [4]:
texts = ["I love this product", "This is a bad product", "I dislike this", "This is the best!"]
labels = [1, 0, 0, 1]

In [15]:
tokens = [' '.join(word_tokenize(text)) for text in texts]

#BoW vectorizer
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(tokens)

X_train, X_test, y_train, y_test = train_test_split(bow, labels, test_size = 0.3, random_state =23)
print(bow)

  (0, 4)	1
  (0, 7)	1
  (0, 5)	1
  (1, 7)	1
  (1, 5)	1
  (1, 3)	1
  (1, 0)	1
  (2, 7)	1
  (2, 2)	1
  (3, 7)	1
  (3, 3)	1
  (3, 6)	1
  (3, 1)	1


In [16]:
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)
print(accuracy_score(y_test, predictions))

0.5


BoW for sentiment analysis

In [22]:
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews
import random

In [18]:
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [23]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [24]:
texts = [''.join(doc) for doc, _ in documents]
labels = [1 if category =='pos' else 0 for _, category in documents]

In [26]:
#BoW

vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(texts)

X_train, X_test, y_train, y_test = train_test_split(bow, labels, test_size =0.3, random_state=23)

classifier = MultinomialNB()
classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)
print(accuracy_score(y_test, predictions))

0.645
