# Lesson 1: Tokenization

In [None]:
import spacy
print(spacy.__name__, spacy.__version__)

spacy 3.7.5


In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
s = "Noah doesn't like to run when it rains."

doc = nlp(s)

In [None]:
print([t.text for t in doc])

['Noah', 'does', "n't", 'like', 'to', 'run', 'when', 'it', 'rains', '.']


In [None]:
print(doc[0])
print(doc[0:3])

Noah
Noah doesn't


In [None]:
print(type(doc))
print(type(doc[0]))
print(type(doc[0:3]))

<class 'spacy.tokens.doc.Doc'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.span.Span'>


In [None]:
# Inspect a single Token: its text, its language code, and its length.
print(doc[3].text)
print(doc[3].lang_)
# Use the built-in len() rather than calling the __len__ dunder directly.
print(len(doc[3]))

like
en
4


In [None]:
print([[t.text, t.i] for t in doc])

[['Noah', 0], ['does', 1], ["n't", 2], ['like', 3], ['to', 4], ['run', 5], ['when', 6], ['it', 7], ['rains', 8], ['.', 9]]


In [None]:
s = "Hello! It's me Adam. We have met before."
doc = nlp(s)
sents = list(doc.sents)
print(sents)

[Hello!, It's me Adam., We have met before.]


# Lesson 2: Preprocessing

case folding

In [None]:
s = "The train to London leaves at 10am on Tuseday"
doc = nlp(s)

print("Before preprocessing with case folding:")
print([t.text for t in doc])

print("After preprocessing with case folding:")
print([t.lower_ for t in doc])

print("Case folding but skip the first word:")
print([t.lower_ if not t.is_sent_start else t.text for t in doc])

Before preprocessing with case folding:
['The', 'train', 'to', 'London', 'leaves', 'at', '10', 'am', 'on', 'Tuseday']
After preprocessing with case folding:
['the', 'train', 'to', 'london', 'leaves', 'at', '10', 'am', 'on', 'tuseday']
Case folding but skip the first word:
['The', 'train', 'to', 'london', 'leaves', 'at', '10', 'am', 'on', 'tuseday']


stop word removal

In [None]:
print("All the stop word in EN:")
print(nlp.Defaults.stop_words)
print(len(nlp.Defaults.stop_words))

All the stop word in EN:
{'here', 'much', 'all', '‘ll', 'at', 'indeed', 'latter', 'towards', 'yourself', 'thence', 'meanwhile', 'hereupon', 'go', 'have', 'part', 'against', 'six', 'used', 'or', 'might', 'see', 'not', 'mine', 'besides', 'none', 'between', "'ll", 'done', 'more', '‘ve', 'wherever', 'fifty', 'both', 'whereas', 'you', 'over', 'hers', 'her', 'moreover', 'nor', 'bottom', 'next', 'these', "'re", 'about', 'too', 'whereafter', 'whoever', 'as', 'what', 'top', 'together', 'whom', 'thereafter', 'also', "'d", 'sixty', 'whereby', "'ve", 'namely', 'forty', 'can', 'nine', 'could', 'how', 'everywhere', 'move', 'without', 'quite', "n't", 'ever', 'only', 'several', 'made', 'he', 'after', 'becoming', 'due', 'am', 'now', 'thru', 'whence', 'themselves', '’d', 'my', '‘d', 'regarding', 'been', 'no', 'become', 'yourselves', 'even', 'every', 'get', 'of', 'give', 'amount', 'will', 'his', 'often', 'serious', 'down', 'therein', 'really', 'among', 'full', 'myself', 'per', 'ourselves', 'very', 'every

In [None]:
print("Before using stop word removal:")
print([t.text for t in doc])

print("After using stop word removal:")
print([t.text for t in doc if not t.is_stop])

Before using stop word removal:
['The', 'train', 'to', 'London', 'leaves', 'at', '10', 'am', 'on', 'Tuseday']
After using stop word removal:
['train', 'London', 'leaves', '10', 'Tuseday']


In [None]:
print("Customize stop word list:")
# The stop-word set is reached through `nlp.Defaults` (plural), the same
# attribute used when printing the list earlier. Uncomment to modify the set.
#nlp.Defaults.stop_words.add("ergo")
#nlp.Defaults.stop_words.remove("whatever")

lemmatization

In [None]:
s = "She is the fastest swimmer."

doc = nlp(s)

In [None]:
print([(t.text,t.lemma_) for t in doc])

[('She', 'she'), ('is', 'be'), ('the', 'the'), ('fastest', 'fast'), ('swimmer', 'swimmer'), ('.', '.')]


part of speech tagging

In [None]:
s = "I love pizza."

doc = nlp(s)

print([(t.text, t.pos_) for t in doc])

[('I', 'PRON'), ('love', 'VERB'), ('pizza', 'NOUN'), ('.', 'PUNCT')]


In [None]:
print([[t.pos_, spacy.explain(t.pos_)] for t in doc])

[['PRON', 'pronoun'], ['VERB', 'verb'], ['NOUN', 'noun'], ['PUNCT', 'punctuation']]


Named Entity Recognition (NER)

In [None]:
s = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(s)

In [None]:
print([[t.text, t.ent_type_] for t in doc])

[['Apple', 'ORG'], ['is', ''], ['looking', ''], ['at', ''], ['buying', ''], ['U.K.', 'GPE'], ['startup', ''], ['for', ''], ['$', 'MONEY'], ['1', 'MONEY'], ['billion', 'MONEY']]


In [None]:
# Keep only tokens that are part of a named entity. `ent_type_` is a string
# (empty for tokens outside any entity), so test its truthiness; comparing it
# with the integer 0 is always True and filters nothing.
print([[t.text, t.ent_type_] for t in doc if t.ent_type_])

[['Apple', 'ORG'], ['is', ''], ['looking', ''], ['at', ''], ['buying', ''], ['U.K.', 'GPE'], ['startup', ''], ['for', ''], ['$', 'MONEY'], ['1', 'MONEY'], ['billion', 'MONEY']]


In [None]:
print('ORG:', spacy.explain('ORG'))
print('GPE:', spacy.explain('GPE'))
print('MONEY:', spacy.explain('MONEY'))

ORG: Companies, agencies, institutions, etc.
GPE: Countries, cities, states
MONEY: Monetary values, including unit


In [None]:
print([[t.text, t.label_] for t in doc.ents])

[['Apple', 'ORG'], ['U.K.', 'GPE'], ['$1 billion', 'MONEY']]


In [None]:
from spacy import displacy

displacy.render(doc, style = 'ent')

# Lesson 3: BOW and similarity

### Bag of Words (BOW)

BOW using sklearn

In [None]:
import spacy
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
corpus = [
    "Inflation surges around the world.",
    "The Omicron coronavirus variant spreads.",
    "World population exceeds 8 billion.",
    "AI predicts protein structures."
]

In [None]:
vectorizer = CountVectorizer()

bow = vectorizer.fit_transform(corpus)

In [None]:
vectorizer.vocabulary_

{'inflation': 5,
 'surges': 12,
 'around': 1,
 'the': 13,
 'world': 15,
 'omicron': 6,
 'coronavirus': 3,
 'variant': 14,
 'spreads': 10,
 'population': 7,
 'exceeds': 4,
 'billion': 2,
 'ai': 0,
 'predicts': 8,
 'protein': 9,
 'structures': 11}

In [None]:
bow

<4x16 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [None]:
print(bow)

  (0, 5)	1
  (0, 12)	1
  (0, 1)	1
  (0, 13)	1
  (0, 15)	1
  (1, 13)	1
  (1, 6)	1
  (1, 3)	1
  (1, 14)	1
  (1, 10)	1
  (2, 15)	1
  (2, 7)	1
  (2, 4)	1
  (2, 2)	1
  (3, 0)	1
  (3, 8)	1
  (3, 9)	1
  (3, 11)	1


In [None]:
bow.toarray()

array([[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1],
       [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0]])

custom tokenizer with spacy

In [None]:
def custom_tokenizer(doc):
  """Tokenize raw text with the spaCy pipeline and return token lemmas.

  Tokens flagged as stop words or punctuation are dropped; every remaining
  token contributes its `lemma_` string, in document order.
  """
  lemmas = []
  for token in nlp(doc):
    if token.is_stop or token.is_punct:
      continue
    lemmas.append(token.lemma_)
  return lemmas

In [None]:
# A custom tokenizer replaces the default regex tokenization, so pass
# token_pattern=None; otherwise scikit-learn warns that `token_pattern`
# will not be used. binary=True sets all non-zero counts to 1.
vectorizer = CountVectorizer(tokenizer = custom_tokenizer, token_pattern = None, binary = True)

bow = vectorizer.fit_transform(corpus)



In [None]:
vectorizer.vocabulary_

{'inflation': 5,
 'surge': 12,
 'world': 14,
 'omicron': 6,
 'coronavirus': 3,
 'variant': 13,
 'spread': 10,
 'population': 7,
 'exceed': 4,
 '8': 0,
 'billion': 2,
 'ai': 1,
 'predict': 8,
 'protein': 9,
 'structure': 11}

In [None]:
print(bow[:, 0:4])

  (1, 3)	1
  (2, 0)	1
  (2, 2)	1
  (3, 1)	1


document similarity

In [None]:
def cosine_sim(a,b):
  """Return the cosine similarity between two 1-D vectors.

  Args:
    a: First vector (array-like).
    b: Second vector (array-like), same length as `a`.

  Returns:
    dot(a, b) / (||a|| * ||b||), or 0.0 when either vector has zero norm.
  """
  denominator = np.linalg.norm(a) * np.linalg.norm(b)
  # An all-zero vector (e.g. a document with no retained tokens) would cause a
  # division by zero, yielding NaN and a RuntimeWarning; report no similarity.
  if denominator == 0:
    return 0.0
  return np.dot(a, b) / denominator


In [None]:
print(corpus[1])
print(bow[1])
print("=> ", bow[1].toarray().squeeze())

The Omicron coronavirus variant spreads.
  (0, 6)	1
  (0, 3)	1
  (0, 13)	1
  (0, 10)	1
=>  [0 0 0 1 0 0 1 0 0 0 1 0 0 1 0]


In [None]:
print(corpus[1])
print(corpus[3])
print(f'Similarity score: {cosine_sim(bow[1].toarray().squeeze(),bow[3].toarray().squeeze()):.3f}')

The Omicron coronavirus variant spreads.
AI predicts protein structures.
Similarity score: 0.000


In [None]:
print(corpus[0])
print(corpus[2])
print(f'Similarity score: {cosine_sim(bow[0].toarray().squeeze(),bow[2].toarray().squeeze()):.3f}')

Inflation surges around the world.
World population exceeds 8 billion.
Similarity score: 0.258


In [None]:
print(cosine_similarity(bow))

[[1.         0.         0.25819889 0.        ]
 [0.         1.         0.         0.        ]
 [0.25819889 0.         1.         0.        ]
 [0.         0.         0.         1.        ]]


n-grams

In [None]:
# Build a vocabulary of unigrams and bigrams from the lemmatized tokens.
# token_pattern=None avoids scikit-learn's warning that `token_pattern` will
# not be used when a custom tokenizer is supplied. lowercase=False passes the
# text to the tokenizer with its original casing.
vectorizer = CountVectorizer(tokenizer = custom_tokenizer, token_pattern = None, lowercase = False, binary = True, ngram_range = (1,2))

unibigrams = vectorizer.fit_transform(corpus)

print(f'Size of vocabulary: {len(vectorizer.get_feature_names_out())}')

print(vectorizer.vocabulary_)

# List every n-gram in the vocabulary, one per line.
for i in vectorizer.vocabulary_:
  print(i)

Size of vocabulary: 27
{'inflation': 11, 'surge': 21, 'world': 25, 'inflation surge': 12, 'surge world': 22, 'Omicron': 4, 'coronavirus': 7, 'variant': 23, 'spread': 19, 'Omicron coronavirus': 5, 'coronavirus variant': 8, 'variant spread': 24, 'population': 13, 'exceed': 9, '8': 0, 'billion': 6, 'world population': 26, 'population exceed': 14, 'exceed 8': 10, '8 billion': 1, 'AI': 2, 'predict': 15, 'protein': 17, 'structure': 20, 'AI predict': 3, 'predict protein': 16, 'protein structure': 18}
inflation
surge
world
inflation surge
surge world
Omicron
coronavirus
variant
spread
Omicron coronavirus
coronavirus variant
variant spread
population
exceed
8
billion
world population
population exceed
exceed 8
8 billion
AI
predict
protein
structure
AI predict
predict protein
protein structure




# TF-IDF

## TF-IDF with sklearn

In [None]:
import spacy
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
corpus = fetch_20newsgroups(categories = ['sci.space'], remove = ('headers','footers','quotes'))

print(len(corpus.data))
print(corpus.data[0])

593

Any lunar satellite needs fuel to do regular orbit corrections, and when
its fuel runs out it will crash within months.  The orbits of the Apollo
motherships changed noticeably during lunar missions lasting only a few
days.  It is *possible* that there are stable orbits here and there --
the Moon's gravitational field is poorly mapped -- but we know of none.

Perturbations from Sun and Earth are relatively minor issues at low
altitudes.  The big problem is that the Moon's own gravitational field
is quite lumpy due to the irregular distribution of mass within the Moon.


### Preprocessing
Apply lemmatization, and remove punctuation, whitespace, and non-alphabetic tokens.

In [None]:
nlp = spacy.load('en_core_web_sm')

unwanted_pipes = ["ner", 'parser']

def custom_tokenizer(doc):
  """Tokenize raw text with spaCy and return the lemmas of alphabetic tokens.

  The components named in `unwanted_pipes` are disabled while the text is
  processed. Punctuation, whitespace, and non-alphabetic tokens are dropped.
  """
  # select_pipes(disable=...) is the spaCy v3 replacement for the deprecated
  # disable_pipes(); the components are restored when the block exits.
  with nlp.select_pipes(disable=unwanted_pipes):
    return [t.lemma_ for t in nlp(doc) if not t.is_punct and not t.is_space and t.is_alpha]

In [None]:
# token_pattern=None avoids scikit-learn's warning that `token_pattern` will
# not be used when a custom tokenizer is supplied.
vectorizer = TfidfVectorizer(tokenizer = custom_tokenizer, token_pattern = None)

features = vectorizer.fit_transform(corpus.data)



In [None]:
print(features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 56690 stored elements and shape (593, 9463)>
  Coords	Values
  (0, 424)	0.07006735123597327
  (0, 4943)	0.17755697785104502
  (0, 7310)	0.08827255510573831
  (0, 5573)	0.07462737620371114
  (0, 3317)	0.1987888389166129
  (0, 8517)	0.06551158102003457
  (0, 2378)	0.04343054547334542
  (0, 6912)	0.13559878838138195
  (0, 5908)	0.21554277358564625
  (0, 1847)	0.13559878838138195
  (0, 370)	0.1054358136369086
  (0, 9237)	0.0715855496878138
  (0, 4402)	0.07522156165875085
  (0, 7244)	0.0978911139378133
  (0, 5963)	0.0643662961391887
  (0, 4393)	0.07654434326236456
  (0, 9274)	0.059872496633831214
  (0, 1902)	0.13559878838138195
  (0, 9311)	0.1929427392927135
  (0, 5402)	0.10099174099290609
  (0, 8393)	0.20401777246040834
  (0, 5817)	0.09912761029075574
  (0, 449)	0.10452131953855516
  (0, 5429)	0.17101697764367227
  (0, 1348)	0.09035933266335426
  :	:
  (592, 5577)	0.0608786291592942
  (592, 3709)	0.05774845667440134
  (592, 1214

## Document Search

In [None]:
query = ["Mars"]

query_tfidf = vectorizer.transform(query)

print(query_tfidf)

  (0, 5026)	1.0


In [None]:
cosine_similarities = cosine_similarity(features, query_tfidf).flatten()

In [None]:
def top_k(arr, k):
  """Return the indices of the k largest values of `arr`, largest first.

  Args:
    arr: 1-D array-like of scores.
    k: Number of indices to return. If k exceeds len(arr), all indices are
      returned.

  Returns:
    A NumPy array of at most k indices into `arr`, ordered by descending value.

  Raises:
    ValueError: If k is negative (a negative k previously produced a
      meaningless slice instead of an error).
  """
  if k < 0:
    raise ValueError(f"k must be non-negative, got {k}")
  # argsort sorts ascending: reverse to get descending order, then keep the
  # first k positions.
  return np.argsort(arr)[::-1][:k]

top_related_indices = top_k(cosine_similarities, 5)
print(top_related_indices)

[468 583 410  79 343]


In [None]:
print(cosine_similarities[top_related_indices])

[0.32658502 0.1810773  0.15383114 0.14742523 0.14398152]


In [None]:
print(corpus.data[top_related_indices[0]])

What is the deal with life on Mars?  I save the "face" and heard 
associated theories. (which sound thin to me)

Are we going back to Mars to look at this face agian?
Does anyone buy all the life theories?



In [None]:
print(corpus.data[top_related_indices[1]])


A practical suggestion, to be sure, but one could *also* peek into
news.lists, where Brian Reid has posted "USENET Readership report for
Mar 93." Another posting called "USENET READERSHIP SUMMARY REPORT FOR
MAR 93" gives the methodology and caveats of Reid's survey.  (These
postings failed to appear for a while-- I wonder why?-- but they are
now back.)

Reid, alas, gives us no measure of the "power/influence" of readers...
Sorry, Mark.

I suspect Mark, dangling out there on Fidonet, may not get news.lists
so I've mailed him copies of these reports.

The bottom line?

        +-- Estimated total number of people who read the group, worldwide.
        |     +-- Actual number of readers in sampled population
        |     |     +-- Propagation: how many sites receive this group at all
        |     |     |      +-- Recent traffic (messages per month)
        |     |     |      |      +-- Recent traffic (kilobytes per month)
        |     |     |      |      |      +-- Crossposting percen

In [None]:
# Slicing demo: with a step of -1, a stop index one past the start of the list
# (-(len + 1)) walks the whole list backwards, so every element is returned.
a = list(range(1, 7))
b = a[:-(len(a) + 1):-1]
print(b)

[6, 5, 4, 3, 2, 1]
