In [1]:
import spacy

In [2]:
file_path = 'scraped_articles.txt'  

with open(file_path, 'r', encoding='utf-8') as file:
    articles = file.read()
print(articles)

File 
Kathmandu, January 26
Santoshi Shrestha has won a gold medal with a national record in the Bangabandhu Sheikh Mujib Dhaka Marathon held in Dhaka, Bangladesh. She completed the prescribed distance in 2 hours 46 minutes 22 seconds and won gold.
She has previously broken the record of 2 hours 48 minutes 02 seconds held by Nepal’s Pushpa Bhandari and set a new record. This competition is the debut marathon of Shrestha, who has been running in middle distance before.
Shrestha is the gold winner of the 13th South Asian Games (SAG) in athletics 10,000 meters race held in Kathmandu four years ago.
Bhandari came second in the Bangabandhu Sheikh Mujib Dhaka Marathon. She completed it in 2 hours 49 minutes 34 seconds. She is the gold winner of the last edition of the Dhaka Marathon.
Similarly, Khadag Bahadur Khadka came third in the men’s competition. He took 2 hours 18 minutes 32 seconds to complete.
On the women’s side, Bindradhanke Shrestha finished fifth. She took 3 hours and 4 minutes.

In [4]:
nlp = spacy.load("en_core_web_sm")

In [5]:
doc = nlp(articles)
sentences = [sent.text for sent in doc.sents]

In [6]:
for sentence in sentences:
    print(sentence)

File 
Kathmandu, January 26
Santoshi Shrestha has won a gold medal with a national record in the Bangabandhu Sheikh Mujib Dhaka Marathon held in Dhaka, Bangladesh.
She completed the prescribed distance in 2 hours 46 minutes 22 seconds and won gold.

She has previously broken the record of 2 hours 48 minutes 02 seconds held by Nepal’s Pushpa Bhandari and set a new record.
This competition is the debut marathon of Shrestha, who has been running in middle distance before.

Shrestha is the gold winner of the 13th South Asian Games (SAG) in athletics 10,000 meters race held in Kathmandu four years ago.

Bhandari came second in the Bangabandhu Sheikh Mujib Dhaka Marathon.
She completed it in 2 hours 49 minutes 34 seconds.
She is the gold winner of the last edition of the Dhaka Marathon.

Similarly, Khadag Bahadur Khadka came third in the men’s competition.
He took 2 hours 18 minutes 32 seconds to complete.

On the women’s side, Bindradhanke Shrestha finished fifth.
She took 3 hours and 4 min

In [9]:
processed_data = []
for tokenized_sentence in sentences:
    doc = nlp(tokenized_sentence)
    subjects = []
    objects = []
    relationships = []

    for token in doc:
        if token.dep_ in ("nsubj", "nsubjpass"):
            subjects.append((token.text, token.head.text, token.dep_))
        elif token.dep_ in ("dobj", "attr"):
            objects.append((token.text, token.head.text, token.dep_))
        elif token.dep_ == "ROOT" and token.pos_ == "VERB":
            relationships.append(token.text)

    processed_data.append({
        "sentence": tokenized_sentence,
        "subjects": subjects,
        "objects": objects,
        "relationships": relationships
    })

In [10]:
for data in processed_data:
    print("Sentence:", data["sentence"])
    
    if "subjects" in data:
        print("Subjects:", data["subjects"])
    else:
        print("No subjects found.")

    if "objects" in data:
        print("Objects:", data["objects"])
    else:
        print("No objects found.")

    if "relationships" in data:
        print("Relationships:", data["relationships"])
    else:
        print("No relationships found.")
    
    print()

Sentence: File 
Kathmandu, January 26
Santoshi Shrestha has won a gold medal with a national record in the Bangabandhu Sheikh Mujib Dhaka Marathon held in Dhaka, Bangladesh.
Subjects: [('Kathmandu', 'won', 'nsubj'), ('Shrestha', 'won', 'nsubj')]
Objects: [('medal', 'won', 'dobj')]
Relationships: ['won']

Sentence: She completed the prescribed distance in 2 hours 46 minutes 22 seconds and won gold.

Subjects: [('She', 'completed', 'nsubj')]
Objects: [('distance', 'completed', 'dobj'), ('gold', 'won', 'dobj')]
Relationships: ['completed']

Sentence: She has previously broken the record of 2 hours 48 minutes 02 seconds held by Nepal’s Pushpa Bhandari and set a new record.
Subjects: [('She', 'broken', 'nsubj')]
Objects: [('record', 'broken', 'dobj'), ('record', 'set', 'dobj')]
Relationships: ['broken']

Sentence: This competition is the debut marathon of Shrestha, who has been running in middle distance before.

Subjects: [('competition', 'is', 'nsubj'), ('who', 'running', 'nsubj')]
Object

In [26]:
import networkx as nx

G = nx.DiGraph()

for data in processed_data:
    subjects = data["subjects"]
    objects = data["objects"]
    relationships = data["relationships"]

    for subject in subjects:
        G.add_node(subject[0], label=subject[0], type='subject')

    for obj in objects:
        G.add_node(obj[0], label=obj[0], type='object')

    for relationship in relationships:
        for subject in subjects:
            for obj in objects:
                G.add_edge(subject[0], obj[0], label=relationship)


In [14]:
nx.write_gexf(G, "graph.gexf")

In [31]:
question = "Who won a gold medal in the Bangabandhu Sheikh Mujib Dhaka Marathon?"

# Tokenize and process the question
question_doc = nlp(question)

question_subjects = [token.text.lower() for token in question_doc if token.dep_ in ("nsubj", "nsubjpass")]

# Extract relevant information from the question
if question_subjects:
    question_subject = question_subjects[0]
    
    matching_nodes = [node for node, data in G.nodes(data=True) if question_subject in node.lower()]
    
    if matching_nodes:
        for node in matching_nodes:
            out_edges = G.out_edges(node, data=True)
            for _, target, data in out_edges:
                print("Answer:", (node, target, data['label']))
    else:
        print(f"No matching nodes found in the graph for subject: {question_subject}")
else:
    print("Unable to extract subject from the question.")

Answer: ('who', 'what', 'wants')
Answer: ('who', 'scene', 'wants')
Answer: ('who', 'game', 'made')
Answer: ('who', 'it', 'decided')
Answer: ('who', 'many', 'decided')
Answer: ('who', 'dreams', 'looks')
Answer: ('who', 'team', 'looks')
Answer: ('who', 'goal', 'made')
Answer: ('who', 'saves', 'made')
Answer: ('who', 'Dhangadhi', 'made')
Answer: ('who', 'pace', 'made')
Answer: ('who', 'players', 'adds')
Answer: ('who', 'website', 'visit')
Answer: ('who', 'champions', 'prove')
Answer: ('who', 'test', 'prove')
Answer: ('who', 'finish', 'prove')
Answer: ('who', 'playoffs', 'clinched')
Answer: ('who', 'spot', 'clinched')
Answer: ('who', 'FC', 'beat')
Answer: ('who', 'run', 'beat')
Answer: ('who', 'table', 'blew')
Answer: ('who', 'top', 'determine')
Answer: ('who', 'curriculum', 'reverberates')
Answer: ('who', 'thinking', 'reverberates')
Answer: ('who', 'glow', 'enjoying')
Answer: ('who', 'time', 'enjoying')
Answer: ('who', 'experts', 'says')
Answer: ('who', 'cadres', 'says')
Answer: ('who', '