In [10]:
!pip install spacy
!pip -m spacy download en_core_web_md




Usage:   
  pip <command> [options]

no such option: -m


In [7]:
!python -m spacy download en_core_web_md 

Collecting en-core-web-md==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.3.0/en_core_web_md-3.3.0-py3-none-any.whl (33.5 MB)
     --------------------------------------- 33.5/33.5 MB 10.7 MB/s eta 0:00:00
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.3.0
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


2022-06-26 00:13:14.385351: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-06-26 00:13:14.386156: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Bag of words

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
class Category:
        BOOKS = "BOOKS"
        CLOTHES = "CLOTHES"

train_x = [
        "I love the book", 
        "this is a great book", 
        "the fit is great", 
        "I love the shoes"]

train_y = [
        Category.BOOKS, 
        Category.BOOKS, 
        Category.CLOTHES, 
        Category.CLOTHES]

In [4]:
vectorizer = CountVectorizer(ngram_range=(1,2))
vectors = vectorizer.fit_transform(train_x)
print(vectorizer.get_feature_names())
print(vectors.toarray())

['book', 'fit', 'fit is', 'great', 'great book', 'is', 'is great', 'love', 'love the', 'shoes', 'the', 'the book', 'the fit', 'the shoes', 'this', 'this is']
[[1 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0]
 [1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1]
 [0 1 1 1 0 1 1 0 0 0 1 0 1 0 0 0]
 [0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 0]]




## Build a model

In [5]:
from sklearn import svm
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(vectors, train_y)

### Train the model

In [6]:
test_x = vectorizer.transform(['I like the book'])
clf_svm.predict(test_x) 

array(['BOOKS'], dtype='<U7')

## Word vectors

In [7]:
import spacy
nlp = spacy.load("en_core_web_md")

In [8]:
print(train_x)

['I love the book', 'this is a great book', 'the fit is great', 'I love the shoes']


In [9]:
docs = [nlp(text) for text in train_x]
train_x_wv = [x.vector for x in docs]
print(docs)

[I love the book, this is a great book, the fit is great, I love the shoes]


In [10]:
print(docs[0].vector)

[-7.33089983e-01 -5.24749886e-03 -2.35488251e-01  1.59274936e-02
  9.66347754e-02  1.56278491e-01  1.38615012e-01 -1.82292491e-01
  8.84527490e-02  1.54077005e+00 -2.41762251e-01 -8.96672532e-02
  1.74057245e-01  3.10127772e-02  4.62116897e-02 -5.05267493e-02
 -1.48660004e-01  1.03792381e+00 -1.71565011e-01 -6.28000051e-02
  1.03982493e-01  1.28997505e-01  1.35554761e-01 -2.06535250e-01
 -2.21828252e-01 -1.54980987e-01 -2.25717485e-01 -2.63060927e-01
  2.91349851e-02  9.59425047e-02 -2.11517513e-02  3.45300019e-01
 -1.88805014e-01  1.19102523e-02  1.82815492e-01  1.35538995e-01
 -1.14783749e-01  2.49261260e-01 -1.00740008e-01  6.52624816e-02
 -1.29889250e-01  1.79949999e-02 -1.20909005e-01 -2.06174999e-02
  1.49652511e-01  1.26080498e-01  4.98107485e-02  1.36212513e-01
 -6.19465038e-02  1.98888257e-01 -1.23281501e-01  9.30762440e-02
 -8.31630006e-02 -1.11451503e-02  3.28723229e-02 -1.49444744e-01
 -3.78984734e-02 -1.56752497e-01 -1.67660996e-01 -1.64857253e-01
 -1.43127844e-01 -1.18127

In [11]:
clf_svm_wv = svm.SVC(kernel='linear')
clf_svm_wv.fit(train_x_wv, train_y)

In [12]:
test_x = ["a paper"]
test_docs = [nlp(text) for text in test_x]
test_x_w_vectors = [x.vector for x in test_docs]
clf_svm_wv.predict(test_x_w_vectors)

array(['BOOKS'], dtype='<U7')

## Regular expression

In [13]:
import re
regex = re.compile(r"^ab[^\s]*cd$")
test_phrase = ["abcd", "azeo"]
matches = []
for phrase in test_phrase:
    if re.match(regex, phrase):
        matches.append(phrase)

print(matches)

['abcd']


In [14]:
regex = re.compile(r"\bread\b|write")
phrases = ["I like reading!", "oh my god", "Please read carefully"]
for phrase in phrases:
    if re.search(regex, phrase):
        print(phrase)

Please read carefully


## Stemming and lemmatization

In [15]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
phrase = "reading the books."

words = word_tokenize(phrase)
print(words)

stemmed_words = []

for word in words:
    stemmed_words.append(stemmer.stem(word))

" ".join(stemmed_words)

['reading', 'the', 'books', '.']


'read the book .'

In [17]:
from nltk.stem import WordNetLemmatizer
lemitizer = WordNetLemmatizer()

phrase = "reading the books."
words = word_tokenize(phrase)

lemitized_words = []
for word in words: 
    lemitized_words.append(lemitizer.lemmatize(word))


" ".join(lemitized_words)

'reading the book .'

## Stopword removal 

In [18]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stopwords = stopwords.words("english")
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [19]:
phrase = "This is an example of stopwords yeah"
words = word_tokenize(phrase)

stripped_phrase = []
for word in words:
    if word not in stopwords:
        stripped_phrase.append(word)

" ".join(stripped_phrase)

'This example stopwords yeah'

## Various other techniques (spell correction, sentiment, pos tagging)

In [20]:
from textblob import TextBlob
phrase = "this is an Exmple and it's bad"
tb_phrase = TextBlob(phrase)
tb_phrase.correct()
tb_phrase.sentiment

Sentiment(polarity=-0.6999999999999998, subjectivity=0.6666666666666666)

## Transformers architecture

In [31]:
nlp = spacy.load("en_trf_bertbaseuncased_lg")
doc = nlp("Here some text to encode")

class Category:
        BOOKS = "BOOKS"
        CLOTHES = "CLOTHES"

train_x = [
        "I love the book", 
        "this is a great book", 
        "the fit is great", 
        "I love the shoes"]

train_y = [
        Category.BOOKS, 
        Category.BOOKS, 
        Category.CLOTHES, 
        Category.CLOTHES]

OSError: [E050] Can't find model 'en_trf_bertbaseuncased_lg'. It doesn't seem to be a Python package or a valid path to a data directory.

In [25]:
!python -m spacy download en_trf_bertbaseuncased_lg


✘ No compatible package found for 'en_trf_bertbaseuncased_lg' (spaCy v3.3.1)



2022-06-26 02:00:31.319762: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-06-26 02:00:31.320324: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [32]:
!pip install spacy-transformers
!python -m spacy download en

⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the full
pipeline package name 'en_core_web_sm' instead.
Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
     --------------------------------------- 12.8/12.8 MB 10.6 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


2022-06-26 02:04:07.037769: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-06-26 02:04:07.039031: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
