In [2]:
corpus = [
    "Natural Language Processing (NLP) is a subfield of Artificial Intelligence.",
    "It helps machines understand and generate human language.",
    "Common tasks include text classification, sentiment analysis, and machine translation.",
    "Deep learning models like RNNs, LSTMs, and Transformers are widely used in NLP.",
    "Tokenization is the process of splitting text into words or sentences.",
    "Stemming reduces words to their root form, while lemmatization uses vocabulary rules.",
    "Word embeddings like Word2Vec, GloVe, and FastText represent words as vectors.",
    "Transformers such as BERT and GPT have revolutionized NLP tasks.",
    "Named Entity Recognition (NER) identifies people, places, and organizations in text.",
    "Part-of-Speech (POS) tagging labels words as nouns, verbs, adjectives, etc.",
    "Sentiment analysis detects emotions or opinions expressed in text data.",
    "Text summarization generates concise versions of long documents.",
    "Machine translation converts text from one language to another automatically.",
    "Question answering systems provide direct answers to user queries.",
    "Chatbots use NLP to interact with humans in a conversational manner.",
    "Speech recognition converts spoken language into text.",
    "Language models predict the next word in a sequence of text.",
    "Topic modeling discovers hidden themes within large text collections.",
    "NLP is widely used in search engines, recommendation systems, and voice assistants.",
    "Ethical challenges in NLP include bias, fairness, and misinformation detection."
]


In [3]:
user_query = "What are the applications of Natural Language Processing in real life?"

In [4]:
from collections import Counter
import math

In [5]:
uq = user_query.lower().split(" ")
uq

['what',
 'are',
 'the',
 'applications',
 'of',
 'natural',
 'language',
 'processing',
 'in',
 'real',
 'life?']

In [6]:
document = "The various applications are sentiment analysis and much more are there"

In [7]:
doc = document.lower().split(" ")

In [8]:
uq_tokens = Counter(uq)
doc_tokens = Counter(doc)
doc_tokens

Counter({'are': 2,
         'the': 1,
         'various': 1,
         'applications': 1,
         'sentiment': 1,
         'analysis': 1,
         'and': 1,
         'much': 1,
         'more': 1,
         'there': 1})

In [9]:
uq_tokens

Counter({'what': 1,
         'are': 1,
         'the': 1,
         'applications': 1,
         'of': 1,
         'natural': 1,
         'language': 1,
         'processing': 1,
         'in': 1,
         'real': 1,
         'life?': 1})

In [10]:
for tokens in uq_tokens.keys() & doc_tokens.keys():
    print(tokens)

the
applications
are


In [11]:
mylist = []
for tokens in uq_tokens.keys() & doc_tokens.keys():
    mylist.append(uq_tokens[tokens]*doc_tokens[tokens])
mylist

[1, 1, 2]

In [12]:
dot_product = sum(mylist)

In [13]:
for tokens in uq_tokens.keys():
    print(uq_tokens[tokens])

1
1
1
1
1
1
1
1
1
1
1


In [14]:
query_magnitude = math.sqrt(sum(math.pow(uq_tokens[tokens],2) for tokens in uq_tokens.keys()))
query_magnitude

3.3166247903554

In [15]:
doc_magnitude = math.sqrt(sum(math.pow(doc_tokens[tokens],2) for tokens in doc_tokens.keys()))
doc_magnitude

3.605551275463989

In [16]:
similarity = dot_product/(query_magnitude*doc_magnitude)
similarity

0.3344968040028363

In [17]:
user_query = "generate human language"

In [18]:
def cosineSimilarity(query,document):
    query =  query.lower().split(" ")
    document =  document.lower().split(" ")
    query_new = Counter(query)
    document_new = Counter(document)
    mylist = []
    for tokens in query_new.keys() & document_new.keys():
        mylist.append(query_new[tokens]*document_new[tokens])
    dotProduct = sum(mylist)
    qmagnitude = math.sqrt(sum(math.pow(query_new[tokens],2) for tokens in query_new.keys()))
    dmagnitude = math.sqrt(sum(math.pow(document_new[tokens],2) for tokens in document_new.keys()))
    similarity = dot_product/(qmagnitude*dmagnitude)
    return similarity
cosineSimilarity(user_query,document)

0.6405126152203486

In [19]:
def forAll(query,corpus):
    simCorpus = []
    for doc in corpus:
        sim = cosineSimilarity(query,doc)
        simCorpus.append(sim)
    return simCorpus,corpus[simCorpus.index(max(simCorpus))]
simCorp,abc = forAll(user_query,corpus)
print(simCorp)
print(abc)

[0.7302967433402214, 0.8164965809277259, 0.7302967433402214, 0.6405126152203486, 0.6963106238227914, 0.6666666666666667, 0.6963106238227914, 0.7302967433402214, 0.6963106238227914, 0.7302967433402214, 0.7302967433402214, 0.8164965809277259, 0.7302967433402214, 0.769800358919501, 0.6963106238227914, 0.8728715609439696, 0.6963106238227914, 0.769800358919501, 0.6666666666666667, 0.7302967433402214]
Speech recognition converts spoken language into text.


In [20]:
print(corpus)

['Natural Language Processing (NLP) is a subfield of Artificial Intelligence.', 'It helps machines understand and generate human language.', 'Common tasks include text classification, sentiment analysis, and machine translation.', 'Deep learning models like RNNs, LSTMs, and Transformers are widely used in NLP.', 'Tokenization is the process of splitting text into words or sentences.', 'Stemming reduces words to their root form, while lemmatization uses vocabulary rules.', 'Word embeddings like Word2Vec, GloVe, and FastText represent words as vectors.', 'Transformers such as BERT and GPT have revolutionized NLP tasks.', 'Named Entity Recognition (NER) identifies people, places, and organizations in text.', 'Part-of-Speech (POS) tagging labels words as nouns, verbs, adjectives, etc.', 'Sentiment analysis detects emotions or opinions expressed in text data.', 'Text summarization generates concise versions of long documents.', 'Machine translation converts text from one language to another

In [21]:
import requests
import json

In [29]:
full_response = []
prompt = """
You are a bot that makes recommendations for activities. You answer in very short sentences about two lines and do not include extra information.
This is the recommended activity: {document}
The user input is: {user_query}
Compile a recommendation to the user based on the recommended activity and the user input.
"""

url = 'http://localhost:11434/api/generate'


data = {
    "model": "gemma3:4b",
    "prompt": prompt.format(user_query=user_query, document=document)
}

headers = {'Content-Type': 'application/json'}

response = requests.post(url, data=json.dumps(data), headers=headers, stream=True)


try:
    for line in response.iter_lines():
        # filter out keep-alive new lines
        if line:
            decoded_line = json.loads(line.decode('utf-8'))
            # print(decoded_line['response'])  # uncomment to results, token by token
            full_response.append(decoded_line['response'])
finally:
    response.close()
    
    
print(''.join(full_response))
# print(full_response)

Explore sentiment analysis applications. 
It offers diverse language processing tools.
