# Dependency Parsing

In [1]:
import spacy

nlp = spacy.load("en_core_web_sm")

sentence = " NLP and CV are two modern examples of AI"

doc = nlp(sentence)

# Each word dependency label and the word it depends on

for token in doc:
    print(f"{token.text}, {token.dep_}, {token.head.text}")

 , dep, NLP
NLP, nsubj, are
and, cc, NLP
CV, conj, NLP
are, ROOT, are
two, nummod, examples
modern, amod, examples
examples, attr, are
of, prep, examples
AI, pobj, of


In [2]:
# print POS tags

for token in doc:

    print(f"Token : {token.text}, POS : {token.pos_}, DEP : {token.dep_}")


Token :  , POS : SPACE, DEP : dep
Token : NLP, POS : PROPN, DEP : nsubj
Token : and, POS : CCONJ, DEP : cc
Token : CV, POS : PROPN, DEP : conj
Token : are, POS : AUX, DEP : ROOT
Token : two, POS : NUM, DEP : nummod
Token : modern, POS : ADJ, DEP : amod
Token : examples, POS : NOUN, DEP : attr
Token : of, POS : ADP, DEP : prep
Token : AI, POS : PROPN, DEP : pobj


In [3]:
# Print NE

for ent in doc.ents:
    print(f"{ent.text}, {ent.label_}")

NLP, ORG
CV, GPE
two, CARDINAL
AI, GPE


In [5]:
# # Visualize parse tree colab

# from spacy import displacy

# displacy.render(doc, style="dep")

In [7]:
# # VS code

# html = displacy.render(doc, style="dep", page=True)

# with open("parse_tree.html", "w") as f:
#     f.write(html)

In [8]:
from IPython.display import display, HTML
from spacy import displacy

html = displacy.render(doc, style="dep", jupyter=False)
display(HTML(html))

In [9]:
print("\nSubject-verb pairs : ")

for token in doc:

    if token.dep_ in ("nsubj","nsubjpass"):

        print(f"Subject : {token.head.text} ----> Verb : {token.text}")


Subject-verb pairs : 
Subject : are ----> Verb : NLP


In [10]:
nouns = [token.text for token in doc if token.pos_ == "NOUN"]

verbs = [token.text for token in doc if token.pos_ == "VERB"]

print(f"Nouns : {nouns}")
print(f"Verbs : {verbs}")

Nouns : ['examples']
Verbs : []


# Deep Semantic Parsing

In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "suriya7/t5-base-text-to-sql"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def translate_to_sql_select(english_query):

    input_text = "translate English to SQL: ", english_query

    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    outputs = model.generate(input_ids)

    sql_query = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return sql_query


english_query = "Show all employees with salary above $5000"

sql_query = translate_to_sql_select(english_query)

print(f"English Query : {english_query}")
print(f"SQL Query : {sql_query}")

tokenizer_config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

English Query : Show all employees with salary above $5000
SQL Query : SELECT SUM(employee) FROM table_name_94 W


In [12]:
english_query = "Print all student names who have PASSED the exam"

sql_query = translate_to_sql_select(english_query)

print(f"English Query : {english_query}")
print(f"SQL Query : {sql_query}")

English Query : Print all student names who have PASSED the exam
SQL Query : SELECT student_name FROM student


# Information Extraction

In [13]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "Barack Hussein Obama was born on August 4, 1961 is an American politician who was the 44th president of the United States from 2009 to 2017."

doc = nlp(text)

print("Named Entities : ")

for ent in doc.ents:
    print(f"{ent.text}, {ent.label_}")

Named Entities : 
Barack Hussein Obama, PERSON
August 4, 1961, DATE
American, NORP
44th, ORDINAL
the United States, GPE
2009, DATE


In [14]:
print("\nNoun Chunks : ")

for chunk in doc.noun_chunks:
    print(f"{chunk.text}, {chunk.root.text}, {chunk.root.dep_}")


Noun Chunks : 
Barack Hussein Obama, Obama, nsubjpass
August, August, pobj
an American politician, politician, attr
who, who, nsubj
the 44th president, president, attr
the United States, States, pobj


In [15]:
print("\nVerbs and their Dependencies : ")

for token in doc:
    if token.pos_ == "VERB":

        subject = [child for child in token.children if child.dep_ == "nsubj"]

        obj = [child for child in token.children if child.dep_ in ("dobj","pobj")]

        print(f"Verb : {token.text}, Subject : {subject}, Object : {obj}")


Verbs and their Dependencies : 
Verb : born, Subject : [], Object : []


In [16]:
print("\nCustom extraction : Dates and Location : ")

for ent in doc.ents:
    if ent.label_ in ("DATE","GPE"):
        print(f"{ent.text}, {ent.label_}")


Custom extraction : Dates and Location : 
August 4, 1961, DATE
the United States, GPE
2009, DATE


In [17]:
# extractive text summerization using TF-IDF

import numpy as np
import pandas as pd
import textwrap
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download("stopwords")
nltk.download("punkt")
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kisha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [25]:
df = pd.read_csv("bbc-text.csv")

df

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


In [30]:
doc = df[df.category == "business"]["text"].sample(frac=0.2, random_state=42)

doc.head()

2098    irish company hit by iraqi report shares in ir...
1972    bat spit drug firm goes to market a german fir...
2070    electronics firms eye plasma deal consumer ele...
1930    circuit city gets takeover offer circuit city ...
1589    weak dollar hits reuters revenues at media gro...
Name: text, dtype: object

In [31]:
def wrap(x):

    return textwrap.fill(x, replace_whitespace=False, fix_sentence_endings=True)

print(wrap(doc.iloc[0]))

irish company hit by iraqi report shares in irish oil company petrel
resources have lost more than 50% of their value on a report that the
firm has failed to win a contract in iraq.  reuters news agency
reported that iraq s oil ministry has awarded the first post-war
oilfield contracts to a canadian and a turkish company.  by 1700 gmt
petrel s shares fell from 97p ($1.87) to 44p ($0.85). petrel said that
it has not received any information from iraqi authorities to confirm
or deny the report.  iraq is seeking to award contracts for three
projects  valued at $500m (£258.5m). turkey s everasia is reported by
reuters to have won a contract to develop the khurmala dome field in
the north of the country.  a canadian company  named iog  is reported
to have won the contract to run the himrin field.  ironhorse oil and
gas has denied to reuters that it is the company in question.  these
two projects aim to develop khurmala field to produce 100 000 barrels
per day and raise the output of himrin.

In [32]:
text = doc.iloc[0]

if "\n" in text:

    content = text.split("\n",1)[1]

else:

    content = text

sents = sent_tokenize(content)

featurizer = TfidfVectorizer(stop_words=stopwords.words("english"), norm="l1")

X = featurizer.fit_transform(sents)

In [33]:
def get_sentence_score(tfidf_row):

    x = tfidf_row[tfidf_row != 0]

    return x.mean()

scores = np.zeros(len(sents))

for i in range(len(sents)):

    score = get_sentence_score(X[i,:])

    scores[i] = score

scores

array([0.05882353, 0.0625    , 0.11111111, 0.11111111, 0.1       ,
       0.09090909, 0.11111111, 0.14285714, 0.06666667, 0.11111111,
       0.08333333, 0.08333333, 0.07692308, 0.125     ])

In [34]:
sort_idx = np.argsort(-scores)

print('Generated Summary : ')

for i in sort_idx[:10]:

    print(wrap("%.2f: %s" % (scores[i], sents[i])))

Generated Summary : 
0.14: ironhorse oil and gas has denied to reuters that it is the
company in question.
0.12: oil officials hope to double iraq s output by the end of the
decade.
0.11: the winners of the contract are to build new flow lines and
build gas separation stations.
0.11: petrel said that it has not received any information from iraqi
authorities to confirm or deny the report.
0.11: a canadian company  named iog  is reported to have won the
contract to run the himrin field.
0.11: by 1700 gmt  petrel s shares fell from 97p ($1.87) to 44p
($0.85).
0.10: iraq is seeking to award contracts for three projects  valued at
$500m (£258.5m).
0.09: turkey s everasia is reported by reuters to have won a contract
to develop the khurmala dome field in the north of the country.
0.08: if iraq s cabinet approves the oil ministry s choice of
companies  then this will be the first deal that iraq has signed with
a foreign oil company.
0.08: the contract to develop the suba-luhais field has not

In [36]:
# using lang chain

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.llms import Ollama
from langchain.chains import LLMChain

In [40]:
df = pd.read_csv("bbc-text.csv")

doc = df[df.category == "business"]["text"].sample(frac=0.2, random_state=42)

def wrap(x):

    return textwrap.fill(x, replace_whitespace=False, fix_sentence_endings=True)

text = doc.iloc[0]

if "\n" in text:

    content = text.split("\n",1)[1]

else:

    content = text

print("\nOriginal Content")
print(wrap(content))


Original Content
irish company hit by iraqi report shares in irish oil company petrel
resources have lost more than 50% of their value on a report that the
firm has failed to win a contract in iraq.  reuters news agency
reported that iraq s oil ministry has awarded the first post-war
oilfield contracts to a canadian and a turkish company.  by 1700 gmt
petrel s shares fell from 97p ($1.87) to 44p ($0.85). petrel said that
it has not received any information from iraqi authorities to confirm
or deny the report.  iraq is seeking to award contracts for three
projects  valued at $500m (£258.5m). turkey s everasia is reported by
reuters to have won a contract to develop the khurmala dome field in
the north of the country.  a canadian company  named iog  is reported
to have won the contract to run the himrin field.  ironhorse oil and
gas has denied to reuters that it is the company in question.  these
two projects aim to develop khurmala field to produce 100 000 barrels
per day and raise the

In [43]:
llm = Ollama(model='llama3.2')

prompt = PromptTemplate(
    input_variables=["article"],
    template=('Summerize the following sentence in 3-4 sentences:\n\n'
              '"{article}"\n\n'
              'Summary : '
))

In [44]:
chain = LLMChain(llm=llm, prompt=prompt, output_parser=StrOutputParser())

summary = chain.invoke({"article":content})

print("\nGenerated Summary : ")

final_summary = summary['text']

print(wrap(final_summary))


Generated Summary : 
Here is a summary of the sentence in 3-4 sentences:

Irish company
Petrel Resources has seen its shares lose over 50% of their value
after it was reported that the Iraqi government had awarded oilfield
contracts to Canadian and Turkish companies, bypassing local firms.
The news came as a shock to Petrel, which had not received any
information from the Iraqi authorities confirming or denying the
reports.  The two winning companies are Everasia (Turkey) and IOG
(Canada), who will develop the Khurmala Dome field in Iraq.  This move
is part of Iraq's efforts to boost its oil production capacity, aiming
to double output by the end of the decade.


In [45]:
!pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Collecting chardet (from breadability>=0.1.20->sumy)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting lxml>=2.0 (from breadability>=0.1.20->sumy)
  Downloading lxml-5.4.0-cp311-cp311-win_amd64.whl.metadata (3.6 kB)
Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
Downloading lxml-5.4.0-cp311-cp311-win_amd64.whl (3.8 MB)
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   -- -----

  DEPRECATION: Building 'docopt' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'docopt'. Discussion can be found at https://github.com/pypa/pip/issues/6334
  DEPRECATION: Building 'breadability' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'breadability'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [46]:
# Extractive summarization : 3rd technique - lex summarization

import pandas as pd
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

In [49]:
from xml.etree.ElementTree import parse
df = pd.read_csv("bbc-text.csv")

summarizer = LexRankSummarizer()

for idx, text in enumerate(df['text'].dropna()):

    print(f"Summary for Document{idx +1} : ")

    try:

        parser = PlaintextParser.from_string(str(text), Tokenizer("english"))

        summary = summarizer(parser.document, sentences_count=2)

        for sent in summary:

            print(str(sentence))

    except Exception as e:

        print(f"Error summarizing document {idx + 1}: {e}")

Summary for Document1 : 
 NLP and CV are two modern examples of AI
 NLP and CV are two modern examples of AI
Summary for Document2 : 
 NLP and CV are two modern examples of AI
 NLP and CV are two modern examples of AI
Summary for Document3 : 
 NLP and CV are two modern examples of AI
 NLP and CV are two modern examples of AI
Summary for Document4 : 
 NLP and CV are two modern examples of AI
 NLP and CV are two modern examples of AI
Summary for Document5 : 
 NLP and CV are two modern examples of AI
 NLP and CV are two modern examples of AI
Summary for Document6 : 
 NLP and CV are two modern examples of AI
 NLP and CV are two modern examples of AI
Summary for Document7 : 
 NLP and CV are two modern examples of AI
 NLP and CV are two modern examples of AI
Summary for Document8 : 
 NLP and CV are two modern examples of AI
 NLP and CV are two modern examples of AI
Summary for Document9 : 
 NLP and CV are two modern examples of AI
 NLP and CV are two modern examples of AI
Summary for Documen

In [50]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [52]:
!pip install tf-keras


Collecting tf-keras
  Downloading tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Downloading tf_keras-2.19.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------------ --------------------------- 0.5/1.7 MB 3.4 MB/s eta 0:00:01
   ------------------ --------------------- 0.8/1.7 MB 2.4 MB/s eta 0:00:01
   ------------------------------ --------- 1.3/1.7 MB 2.0 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 2.3 MB/s eta 0:00:00
Installing collected packages: tf-keras
Successfully installed tf-keras-2.19.0


In [55]:
# Abstractive summerization using transformer

from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

df = pd.read_csv("bbc-text.csv")

for idx, text in enumerate(df['text'].dropna()):

    print(f"Abstractive summary for Document{idx +1} : ")

    try:

        text = str(text)

        if len(text) > 1024:

            text = text[:1024]

        summary = summarizer(text, max_length=100, min_length=30, do_sample=False)

        print(summary[0]['summary_text'])

    except Exception as e:

        print(f"Error summarizing document {idx + 1}: {e}")

Device set to use cpu


Abstractive summary for Document1 : 
The way people watch tv will be radically different in five years time. This is according to an expert panel which gathered at the annual consumer electronics show in las vegas.
Abstractive summary for Document2 : 
Former worldcom boss never made accounting decisions, witness says. David myers made the comments under questioning by defence lawyers. Prosecutors claim losses were hidden to protect the firm s shares.
Abstractive summary for Document3 : 
 leicester and saracens are believed to head the list of rugby union clubs interested in signing farrell if he decides to move to the 15-man game. tigers boss john wells believes he would better off playing in the backs  at least initially.
Abstractive summary for Document4 : 
 premiership side newcastle united face a trip to ryman premier league leaders yeading in the fa cup third round. conference side exeter city  will travel to old trafford to meet holders manchester united in january. arsenal were 

KeyboardInterrupt: 

In [None]:
# Pragmatics-aware (context aware) chat-bot

from transformers import pipeline

classifier = pipeline("text-classification", model = "distilbert-base-uncased-finetuned-sst-2-english")

def chatbot(user_input, context):

    result = classifier(user_input)

    print(result)

    intent = result[0]['label']

    if intent == "POSITIVE":

        if 'cold' in user_input.lower():

            return "It seems you like the cold! Would you like a blanket ?"

        else:

            return "I'm glad to hear that! How can I assist you?"

    elif intent == "NEGATIVE":

        if 'cold' in user_input.lower():

            return "I'm sorry to hear that! Would you like me to adjust the temperature ?"

        else:

            return "I am hear to help if somemthing is bothering you"
    else:

        return "Thanks for sharing. Could you tell me more"


context = {}

user_inputs = [

    "It's so cold in here!",

    "I love this weather!",

    "I'm not feeling great about today.",

    "Do you have a hot drink?",

    "Oh, you do not have cold drink?",

    "It is neither too hot nor too cold today",

    "It is just too hot!!!",

    "I do not like hot tempered people!"]

for user_input in user_inputs:

    response = chatbot(user_input, context)

    print(f"User : {user_input}")

    print(f"Bot : {response}")

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


[{'label': 'NEGATIVE', 'score': 0.9992741942405701}]
User : It's so cold in here!
Bot : I'm sorry to hear that! Would you like me to adjust the temperature ?
[{'label': 'POSITIVE', 'score': 0.9998793601989746}]
User : I love this weather!
Bot : I'm glad to hear that! How can I assist you?
[{'label': 'NEGATIVE', 'score': 0.999728262424469}]
User : I'm not feeling great about today.
Bot : I am hear to help if somemthing is bothering you
[{'label': 'POSITIVE', 'score': 0.5948962569236755}]
User : Do you have a hot drink?
Bot : I'm glad to hear that! How can I assist you?
[{'label': 'POSITIVE', 'score': 0.9986951947212219}]
User : Oh, you do not have cold drink?
Bot : It seems you like the cold! Would you like a blanket ?
[{'label': 'POSITIVE', 'score': 0.6591268181800842}]
User : It is neither too hot nor too cold today
Bot : It seems you like the cold! Would you like a blanket ?
[{'label': 'NEGATIVE', 'score': 0.9524922966957092}]
User : It is just too hot!!!
Bot : I am hear to help if s