In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
train_x = ["i love the book", "this is a great book", "the fit is great", "i love the shoes"]

In [3]:
vectorizers = CountVectorizer(binary=True, ngram_range=(1,2))
train_x_vectors = vectorizers.fit_transform(train_x)

print(vectorizers.get_feature_names_out())

print(train_x_vectors.toarray())

['book' 'fit' 'fit is' 'great' 'great book' 'is' 'is great' 'love'
 'love the' 'shoes' 'the' 'the book' 'the fit' 'the shoes' 'this'
 'this is']
[[1 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0]
 [1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1]
 [0 1 1 1 0 1 1 0 0 0 1 0 1 0 0 0]
 [0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 0]]


In [4]:
class Category:
    BOOKS = "BOOKS"
    CLOTHING = "CLOTHING"

train_y = [Category.BOOKS, Category.BOOKS, Category.CLOTHING, Category.CLOTHING]

In [5]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

SVC(kernel='linear')

In [6]:
test_x = vectorizers.transform(["i love the story"])

clf_svm.predict(test_x)

array(['CLOTHING'], dtype='<U8')

## Word Vectors

In [7]:
import spacy

In [8]:
nlp = spacy.load("en_core_web_md")

In [9]:
docs = [nlp(text) for text in train_x]

train_x_word_vectors = [x.vector for x in docs]

In [10]:
clf_svm_wv = svm.SVC(kernel='linear')

clf_svm_wv.fit(train_x_word_vectors, train_y)

SVC(kernel='linear')

In [11]:
test_x = ["i love the story"]
test_docs = [nlp(text) for text in test_x]
test_x_word_vectors = [x.vector for x in test_docs]

clf_svm_wv.predict(test_x_word_vectors)

array(['BOOKS'], dtype='<U8')

In [12]:
import re 

regexp = re.compile(r"^ab[^\s]*cd$")

phrases = ["abcd", "xxx", "abxxxcd", "ab cd"]

matches = []

for phrase in phrases:
    if re.match(regexp, phrase):
        matches.append(phrase)
        
print(matches)

['abcd', 'abxxxcd']


In [13]:
regexp = re.compile(r"ab[^\s]*cd")

phrases = ["abcd", "xxx", "aaa abxxxcd ccc", "ab cd"]

matches = []

for phrase in phrases:
    if re.search(regexp, phrase):
        matches.append(phrase)
        
print(matches)

['abcd', 'aaa abxxxcd ccc']


In [14]:
regexp = re.compile(r"read|story|book")

phrases = ["I like that story", "i like that book", "this hat is nice"]

matches = []

for phrase in phrases:
    if re.search(regexp, phrase):
        matches.append(phrase)

print(matches)

['I like that story', 'i like that book']


## Stemming/Lemmatization

In [15]:
import nltk

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\86183\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\86183\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\86183\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\86183\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [16]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

phrase = "reading the books."
words = word_tokenize(phrase)

stemmed_words = []
for word in words:
    stemmed_words.append(stemmer.stem(word))

" ".join(stemmed_words)

'read the book .'

In [17]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

phrase = "reading the books."
words = word_tokenize(phrase)

lemmatized_words = []
for word in words:
    lemmatized_words.append(lemmatizer.lemmatize(word, pos='v'))

" ".join(lemmatized_words)

'read the book .'

## Stopword Removal

In [18]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

phrase = "Here is an example sentence demonstrating the removal of stopwords"

words = word_tokenize(phrase)

stripped_phrase = []

for word in words:
    if word not in stop_words:
        stripped_phrase.append(word)
        
" ".join(stripped_phrase)

'Here example sentence demonstrating removal stopwords'

In [19]:
!pip install textblob

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [20]:
!python -m textblob.download_corpora

Finished.


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\86183\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\86183\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\86183\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\86183\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\86183\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\86183\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!

In [21]:
from textblob import TextBlob

phrase = "this ise an examplee"

tb_phrase = TextBlob(phrase)

print(tb_phrase.correct())

print(tb_phrase.tags)

print(tb_phrase.sentiment)

this is an example
[('this', 'DT'), ('ise', 'NN'), ('an', 'DT'), ('examplee', 'NN')]
Sentiment(polarity=0.0, subjectivity=0.0)


## Transformer Architecture

In [31]:
!pip install spacy-transformers

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [32]:
!python -m spacy download en_trf_bertbaseuncased_lg


[x] No compatible package found for 'en_trf_bertbaseuncased_lg' (spaCy v3.4.3)



2023-01-12 10:57:22.042011: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2023-01-12 10:57:22.042049: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-12 10:57:24.601993: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2023-01-12 10:57:24.602718: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cublas64_11.dll'; dlerror: cublas64_11.dll not found
2023-01-12 10:57:24.603413: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cublasLt64_11.dll'; dlerror: cublasLt64_11.dll not found
2023-01-12 10:57:24.604105: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cu

In [33]:
import spacy
import torch

nlp = spacy.load("en_trf_bertbaseuncased_lg")
doc = nlp("Here is some texts to encode")

OSError: [E050] Can't find model 'en_trf_bertbaseuncased_lg'. It doesn't seem to be a Python package or a valid path to a data directory.