## What is natural language processing?

In [74]:
# Two inflected forms of one verb; compared as plain strings they are not equal.
x, y = 'was', 'is'
x == y

False

## Lemmatization of words

In [75]:
import nltk

# Fetch the 'wordnet' and 'omw-1.4' NLTK data packages into the local
# nltk_data directory; the log shows existing packages are left as they are.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/gusw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/gusw/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [76]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize both words as verbs (pos='v') in one pass.
lemma_was, lemma_is = (lemmatizer.lemmatize(word, pos='v') for word in (x, y))

# A single print call with a newline separator emits the same two lines.
print(f"{lemma_was=}", f"{lemma_is=}", sep="\n")
lemma_was == lemma_is

lemma_was='be'
lemma_is='be'


True

In [77]:
# Plural form, lemmatized as a noun.
lemma1 = lemmatizer.lemmatize('vegetables', pos='n')
# Lemmatized as a verb, even though that part of speech is incorrect here.
lemma2 = lemmatizer.lemmatize('vegetable', pos='v')

print(f"{lemma1=}", f"{lemma2=}", sep="\n")

lemma1='vegetable'
lemma2='vegetable'


## Lemmatization of sentences

In [78]:
import nltk

# NOTE(review): 'wordnet' is also downloaded in an earlier cell; this repeat
# looks redundant unless this section is meant to run on its own — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/gusw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [79]:
from nltk.stem import WordNetLemmatizer

# NOTE(review): rebinds `lemmatizer`, which an earlier cell creates the same way.
lemmatizer = WordNetLemmatizer()

# Example sentence that the following cells tokenize and tag.
sentence = 'Vegetables are types of plants.'

### Tokenizing sentences

In [80]:
# Fetch the 'punkt' NLTK data package — presumably needed by the tokenizer
# calls that follow; verify.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/gusw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [81]:
# Lower-case the sentence, then split it into word tokens; the output shows
# the final full stop comes back as its own token.
sentence_tokens = nltk.word_tokenize(sentence.lower())
sentence_tokens

['vegetables', 'are', 'types', 'of', 'plants', '.']

In [82]:
# Fetch the 'averaged_perceptron_tagger' NLTK data package — presumably the
# model behind nltk.pos_tag used next; verify.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/gusw/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [83]:
# Tag every token with its part of speech; the result is a list of
# (token, tag) pairs such as ('are', 'VBP').
pos_tags = nltk.pos_tag(sentence_tokens)
pos_tags

[('vegetables', 'NNS'),
 ('are', 'VBP'),
 ('types', 'NNS'),
 ('of', 'IN'),
 ('plants', 'NNS'),
 ('.', '.')]

In [84]:
import nltk 
from nltk.stem import WordNetLemmatizer

# Module-level lemmatizer that lemma_me() below looks up when it is called.
lemmatizer = WordNetLemmatizer()

def lemma_me(sent, keep_adjectives=False):
    """Tokenize a sentence and return the lemmas of its content words.

    The sentence is lower-cased, split with ``nltk.word_tokenize`` and tagged
    with ``nltk.pos_tag``. The lower-cased first letter of each tag selects the
    ``pos`` passed to the module-level ``lemmatizer``; tokens whose tag does
    not map to a part of speech are dropped from the result.

    Args:
        sent: The sentence to process, as a string.
        keep_adjectives: The tags produced in this notebook (``NNS``, ``VBP``,
            ``IN``, ...) follow the Penn Treebank style, where adjective tags
            start with ``J`` rather than ``A``. The ``'a'`` entry therefore
            never matches and adjectives are silently dropped. Pass ``True``
            to map ``J*`` tags to the ``'a'`` part of speech and keep them.
            Defaults to ``False`` so existing results stay unchanged.

    Returns:
        A list of lemma strings in sentence order.
    """
    # Lower-cased first tag letter -> part of speech handed to the lemmatizer.
    tag_to_pos = {'n': 'n', 'v': 'v', 'a': 'a', 'r': 'r'}
    if keep_adjectives:
        tag_to_pos['j'] = 'a'

    # pos_tag already yields (token, tag) pairs, so there is no need to zip
    # its result with the token list again.
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent.lower()))

    sentence_lemmas = []
    for token, tag in tagged_tokens:
        pos = tag_to_pos.get(tag[0].lower())
        if pos is not None:
            sentence_lemmas.append(lemmatizer.lemmatize(token, pos=pos))

    return sentence_lemmas

In [85]:
# Lemmas of the plural phrasing; 'of' and the full stop are filtered out.
l1 = lemma_me('Vegetables are types of plants.')
l1

['vegetable', 'be', 'type', 'plant']

In [86]:
# Lemmas of the singular phrasing of the same statement.
l2 = lemma_me('A vegetable is a type of plant')
l2 

['vegetable', 'be', 'type', 'plant']

In [87]:
# Both phrasings reduce to the same lemma list, so they compare equal.
l1 == l2

True

## Find the most similar sentence

In [88]:
# Text to search for an answer, and the question to answer from it.
text = 'Originally, vegetables were collected from the wild by hunter-gatherers. Vegetables are all plants. Vegetables can be eaten either raw or cooked.'
question = 'What are vegetables?' 

In [89]:
# Split the text into sentences and place the question last, so the candidate
# sentences and the query end up in one list.
sentence_tokens = [*nltk.sent_tokenize(text), question]
sentence_tokens

['Originally, vegetables were collected from the wild by hunter-gatherers.',
 'Vegetables are all plants.',
 'Vegetables can be eaten either raw or cooked.',
 'What are vegetables?']

In [90]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Plug lemma_me in as the tokenizer so the vectorizer's features are lemmas.
tv = TfidfVectorizer(tokenizer=lemma_me)
tv

In [91]:
# Fit on the sentences (question included) and vectorize them in one step:
# one row per sentence, one column per lemma (4x8 in the output).
tf = tv.fit_transform(sentence_tokens)
tf

<4x8 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [92]:
# Convert the sparse matrix to a dense array so each weight can be inspected.
tf.toarray()

array([[0.27717414, 0.53114624, 0.        , 0.        , 0.53114624,
        0.53114624, 0.        , 0.27717414],
       [0.41988018, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.8046125 , 0.41988018],
       [0.32713399, 0.        , 0.62688384, 0.62688384, 0.        ,
        0.        , 0.        , 0.32713399],
       [0.70710678, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678]])

In [93]:
import pandas

# Label each column with its lemma. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it.
df = pandas.DataFrame(tf.toarray(), columns=tv.get_feature_names_out())
df



Unnamed: 0,be,collect,cook,eat,hunter-gatherer,originally,plant,vegetable
0,0.277174,0.531146,0.0,0.0,0.531146,0.531146,0.0,0.277174
1,0.41988,0.0,0.0,0.0,0.0,0.0,0.804612,0.41988
2,0.327134,0.0,0.626884,0.626884,0.0,0.0,0.0,0.327134
3,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.707107


In [94]:
from sklearn.metrics.pairwise import cosine_similarity

# Compare the last row (the question) against every row, itself included,
# giving one similarity score per sentence.
values = cosine_similarity(tf[-1], tf)
values

array([[0.39198343, 0.59380024, 0.46263733, 1.        ]])

In [95]:
values_flat = values.flatten()
print(f"{values_flat=}")

# The last score is the question compared with itself. Taking the
# second-largest score overall (argsort()[-2]) can return the question's own
# index when another sentence ties with it, so search only the scores before
# the last one for the best match.
index = values_flat[:-1].argmax()
print(f"{index=}")

coefficient = values_flat[index]
print(f"{coefficient=}")


values_flat=array([0.39198343, 0.59380024, 0.46263733, 1.        ])
index=1
coefficient=0.593800244493221


In [96]:
# Only print the best-matching sentence when it is similar enough to the question.
# NOTE(review): 0.3 is an unexplained cut-off — consider a named constant.
if coefficient > 0.3:
    print(sentence_tokens[index])

Vegetables are all plants.


## Sentiment Analysis

In [97]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [98]:
# Fetch the 'vader_lexicon' NLTK data package — presumably the lexicon that
# SentimentIntensityAnalyzer relies on; verify.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/gusw/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [99]:
# Create the sentiment analyzer used by the two example cells that follow.
analyzer = SentimentIntensityAnalyzer()
analyzer

<nltk.sentiment.vader.SentimentIntensityAnalyzer at 0x7fb35bece170>

In [100]:
# Positive example: the output shows 'neg' at 0.0 and a 'compound' score above 0.
text1 = "What a beautiful day! How amazing it is!"
analyzer.polarity_scores(text1) 

{'neg': 0.0, 'neu': 0.376, 'pos': 0.624, 'compound': 0.8513}

In [101]:
# Negative example: the output shows 'pos' at 0.0 and a 'compound' score below 0.
text2 = "This is a major piece of crap."
analyzer.polarity_scores(text2) 

{'neg': 0.342, 'neu': 0.658, 'pos': 0.0, 'compound': -0.3818}

In [102]:
from nltk.sentiment import SentimentIntensityAnalyzer

def has_positive_sentiment(text: str, analyzer: "SentimentIntensityAnalyzer | None" = None) -> bool:
    """Report whether ``text`` carries positive rather than negative sentiment.

    The decision is based on the ``'compound'`` entry of
    ``analyzer.polarity_scores(text)``.

    Args:
        text: The text to classify.
        analyzer: Optional object exposing ``polarity_scores(text)``. Pass an
            existing ``SentimentIntensityAnalyzer`` to avoid building a new one
            on every call. When omitted, a new one is constructed.

    Returns:
        ``True`` when the compound score is above 0, ``False`` when it is
        below 0.

    Raises:
        ValueError: When the compound score is exactly 0 or absent from the
            scores, so the text cannot be classified either way.
    """
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    compound_value = analyzer.polarity_scores(text).get('compound', 0)
    if compound_value > 0:
        return True
    if compound_value < 0:
        return False

    # ValueError is an Exception subclass, so callers that catch Exception
    # keep working; the message now states what actually happened.
    raise ValueError(f'Could not determine sentiment (compound score is 0) for text: {text}')

In [103]:
# Fetch the 'twitter_samples' corpus that the next cell reads tweets from.
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/gusw/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [104]:
# Load the tweet strings from the twitter_samples corpus.
# NOTE(review): despite the name, nothing here randomizes the selection.
random_texts = nltk.corpus.twitter_samples.strings()

In [105]:
# Classify one sample tweet, picked by a fixed index.
# Note: has_positive_sentiment raises instead of returning when the compound
# score is exactly 0, so a neutral tweet would stop this cell with an error.
tweet1 = random_texts[1045]
tweet1_is_positive = has_positive_sentiment(tweet1)
print(f"{tweet1=}")
print(f"{tweet1_is_positive=}")


tweet1='My phone is so shit, it always runs out of memory :( ...2 many nudes'
tweet1_is_positive=False
