In [1]:
import requests

def annotate_text_with_dbpedia(text):
    spotlight_url = "http://api.dbpedia-spotlight.org/en/annotate"
    headers = {'Accept': 'application/json'}
    params = {'text': text, 'confidence': 0.5}
    response = requests.get(spotlight_url, headers=headers, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        return None

# Example usage
text = "Barack Obama was the president of the United States."
annotations = annotate_text_with_dbpedia(text)
print(annotations)

{'@text': 'Barack Obama was the president of the United States.', '@confidence': '0.5', '@support': '0', '@types': '', '@sparql': '', '@policy': 'whitelist', 'Resources': [{'@URI': 'http://dbpedia.org/resource/Barack_Obama', '@support': '27440', '@types': 'Http://xmlns.com/foaf/0.1/Person,Wikidata:Q82955,Wikidata:Q5,Wikidata:Q24229398,Wikidata:Q215627,DUL:NaturalPerson,DUL:Agent,Schema:Person,DBpedia:Person,DBpedia:Agent,DBpedia:Politician', '@surfaceForm': 'Barack Obama', '@offset': '0', '@similarityScore': '0.9999750765811097', '@percentageOfSecondRank': '1.2731717656137947E-5'}, {'@URI': 'http://dbpedia.org/resource/United_States', '@support': '553243', '@types': 'Wikidata:Q6256,Schema:Place,Schema:Country,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Country', '@surfaceForm': 'United States', '@offset': '38', '@similarityScore': '0.9999326809092367', '@percentageOfSecondRank': '4.953675667499674E-5'}]}


In [2]:
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def calculate_similarity(text1, text2):
    doc1 = nlp(text1)
    doc2 = nlp(text2)
    return doc1.similarity(doc2)

# Example usage
similarity = calculate_similarity("fast car", "speedy vehicle")
print(similarity)


0.7017164834136657


  return doc1.similarity(doc2)


In [3]:
# Filter Relations & Match Node Name to Entity

import pandas as pd
from textblob import TextBlob
import re

In [4]:
input_path = "../data_intermediate"
output_path = "../data_intermediate"

raw_relations = pd.read_csv(input_path + "/raw_relations.csv", index_col = 0)
entities = pd.read_csv(input_path + "/entity_list.csv", index_col = 0)

In [5]:
# Create entity list
entity_set = set(list(entities['ent_name']))

# Add fixed key words as entities
fixed_set = ["lgbt", 'dei', 'diversity']
for ent in fixed_set:
    entity_set.add(ent)

In [6]:
# First transformation: extract nouns from subject, object
def extract_nouns(text):
    blob = TextBlob(text)
    return blob.noun_phrases

raw_relations['node1_t1'] = raw_relations.apply(lambda x: extract_nouns(x['subject']), axis = 1)
raw_relations['node2_t1'] = raw_relations.apply(lambda x: extract_nouns(x['object']), axis = 1)
raw_relations[:10]

Unnamed: 0,news_id,subject,relation,object,node1_t1,node2_t1
0,0,Pope Francis,Calls,Deplorable,[pope francis],[]
1,0,Catholic leader ’s stance,is,likely,[catholic leader ’ s stance],[]
2,0,many members,have relied on,practice,[],[]
3,0,members,have,their children,[],[]
4,0,leader ’s stance,roil,LGBT community,[leader ’ s stance],[lgbt]
5,0,Catholic leader,’s,stance on surrogacy,[catholic leader],[]
6,0,members,have,their own children,[],[own children]
7,0,leader ’s stance,is,likely,[leader ’ s stance],[]
8,0,members,have relied on,practice,[],[]
9,0,many members,have,their own children,[],[own children]


In [7]:
import requests

def get_dbpedia_resource(text):
    url = "http://api.dbpedia-spotlight.org/en/annotate"
    headers = {'Accept': 'application/json'}
    params = {'text': text, 'confidence': 0.5}  # For now we use confidence level > 0.5
    try:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code == 200:
            data = response.json()
            resources = data.get('Resources', [])
            if resources:
                return resources[0]['@URI']  # Return the URI of the first resource
    except Exception as e:
        print(f"Error querying DBpedia Spotlight: {e}")
    return None

In [8]:
# Second transformation: match node name to entity list
def match_node_entity(node_name, node_nouns):
    """
    Case1: Node name is in the entity list. 
        Use the node name. For instance, if "LGBT Community" is in the entity list, use it directly.
    
    Case2: Node name is not in the entity list, but the noun list created from the previous step is not empty.
        Select the first noun from the noun list and find relevant entity for the selected noun.

        Case2-a: if a relevant entity is found, use the relevant entity as new node name. For instance, if the noun list is [france], relevant node is "France".

        Case2-b: if no relevant entity is found, use the select noun as new node name. For instance, if the noun list is [leader's stance], no relevant node is found. Therefore, "leader's stance" is used as the new node.
    
    Case3: If node name is not in the entity list and noun list is empty
        Return None
    """
    if node_name in entity_set:
        return node_name
    if len(node_nouns) > 0:
        node_name = node_nouns[0]
        return find_relevant_entity(node_name)
    return None

In [9]:
def find_relevant_entity(node_name):
    """
    Find the most relevant entity for the node name.
    Relevance is determined by the number of matched words.
    """
    entity_list = list(entity_set)
    if node_name in entity_set:
        return node_name
    match_perc = 0
    best_match = None
    node_words = node_name.lower().split()
    for ent in entity_list:
        ent_words = ent.lower().split()
        match_cnt = 0
        for w1 in node_words:
            for w2 in ent_words:
                if w1 == w2:
                    match_cnt += 1
        if (match_cnt/len(ent_words) >= 0.5) and (match_cnt/len(node_words) >= 0.5) and (match_cnt/len(ent_words) > match_perc):
            match_perc = match_cnt/len(ent_words)
            best_match = ent
    
    if best_match is not None:
        return best_match
    node_name = re.sub(r"’ s", '', node_name)
    node_name = re.sub(r"'s", '', node_name)
    return node_name

raw_relations['node1_t2'] = raw_relations.apply(lambda x: match_node_entity(x['subject'], x['node1_t1']), axis = 1)
raw_relations['node2_t2'] = raw_relations.apply(lambda x: match_node_entity(x['object'], x['node2_t1']), axis = 1)

# Select relations if both nodes are not None
filtered_relations = raw_relations[(raw_relations['node1_t2'].notnull()) & (raw_relations['node2_t2'].notnull())]
filtered_relations.head()

Unnamed: 0,news_id,subject,relation,object,node1_t1,node2_t1,node1_t2,node2_t2
4,0,leader ’s stance,roil,LGBT community,[leader ’ s stance],[lgbt],leader stance,LGBT community
10,0,Catholic leader ’s stance,roil,LGBT community,[catholic leader ’ s stance],[lgbt],catholic leader stance,LGBT community
12,1,France,Has,Historic New Prime Minister,[france],"[historic, prime]",France,historic
13,1,France,Has,Historic Prime Minister,[france],[historic prime],France,historic prime
14,1,France,Has,New Prime Minister,[france],[prime],France,prime


In [22]:
# Third transformation: deal with nodes with similar names within one news. For instance, "leader's stance" and "catholic leader's stance"

node_set = set(filtered_relations['node1_t2']).union(set(filtered_relations['node2_t2']))
node_list = sorted(list(node_set), key = lambda x: len(x))

def similar_node(node1, node2):
    """
    Determine if two nodes are similar.
    """
    node1_wl = node1.lower().split()
    node2_wl = node2.lower().split()
    matched_words = len(set(node1_wl) & set(node2_wl))
    word_match_ratio = matched_words / max(len(node1_wl), len(node2_wl))
    return word_match_ratio >= 0.5

final_node_list = list()
transformation_map = {}

for node in node_list:
    if node in entity_set:
        final_node_list.append(node)

for node in node_list:
    node_flag = False
    if node in entity_set:
        continue
    for fn in final_node_list:
        if similar_node(node, fn):
            transformation_map[node] = fn
            node_flag = True
        if node_flag:
            break
    if not node_flag: 
        final_node_list.append(node)

def transform_node(node_name):
    if node_name in transformation_map:
        return transformation_map[node_name].title()
    return node_name.title()

filtered_relations['node1_t3'] = filtered_relations.apply(lambda x: transform_node(x['node1_t2']), axis = 1)
filtered_relations['node2_t3'] = filtered_relations.apply(lambda x: transform_node(x['node2_t2']), axis = 1)
filtered_relations[:50]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_relations['node1_t3'] = filtered_relations.apply(lambda x: transform_node(x['node1_t2']), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_relations['node2_t3'] = filtered_relations.apply(lambda x: transform_node(x['node2_t2']), axis = 1)


Unnamed: 0,news_id,subject,relation,object,node1_t1,node2_t1,node1_t2,node2_t2,node1_t3,node2_t3
4,0,leader ’s stance,roil,LGBT community,[leader ’ s stance],[lgbt],leader stance,LGBT community,Leader Stance,Lgbt Community
10,0,Catholic leader ’s stance,roil,LGBT community,[catholic leader ’ s stance],[lgbt],catholic leader stance,LGBT community,Leader Stance,Lgbt Community
12,1,France,Has,Historic New Prime Minister,[france],"[historic, prime]",France,historic,France,Historic
13,1,France,Has,Historic Prime Minister,[france],[historic prime],France,historic prime,France,Prime
14,1,France,Has,New Prime Minister,[france],[prime],France,prime,France,Prime
15,1,France,Has,Prime Minister,[france],[prime],France,prime,France,Prime
18,2,Ohio House votes,override,sports ban,"[ohio, house votes]",[sports ban],Ohio,sports ban,Ohio,Sports Ban
19,2,Ohio House votes,override,governor 's veto,"[ohio, house votes]",[governor 's veto],Ohio,governor veto,Ohio,Governor Veto
20,2,Ohio House votes,override sports ban on,trans care,"[ohio, house votes]",[trans care],Ohio,trans care,Ohio,Trans
21,2,Ohio House,override,Gov. Mike DeWine ’s veto,[ohio],"[gov, mike dewine, ’ s veto]",Ohio,gov,Ohio,Gov


In [16]:
import spacy
nlp = spacy.load("en_core_web_sm")

def semantic_similarity(text1, text2):
    # Ensure inputs are strings
    doc1 = nlp(str(text1))
    doc2 = nlp(str(text2))
    return doc1.similarity(doc2)

In [24]:
# Third transformation: deal with nodes with similar names within one news. For instance, "leader's stance" and "catholic leader's stance"

node_set = set(filtered_relations['node1_t2']).union(set(filtered_relations['node2_t2']))
node_list = sorted(list(node_set), key = lambda x: len(x))

def similar_node(node1, node2):
    """
    Determine if two nodes are similar, using both word match and semantic similarity.
    """
    # Word match calculation
    node1_wl = node1.lower().split()
    node2_wl = node2.lower().split()
    matched_words = len(set(node1_wl) & set(node2_wl))
    word_match_ratio = matched_words / max(len(node1_wl), len(node2_wl))
    
    # Semantic similarity calculation
    semantic_similarity_score = semantic_similarity(node1, node2)

    # Compare results
    word_match_result = word_match_ratio >= 0.5
    semantic_similarity_result = semantic_similarity_score >= 0.8
    
    # Return True if either method indicates similarity
    return word_match_result or semantic_similarity_result

final_node_list = list()
transformation_map = {}

for node in node_list:
    if node in entity_set:
        final_node_list.append(node)

for node in node_list:
    node_flag = False
    if node in entity_set:
        continue
    for fn in final_node_list:
        if similar_node(node, fn):
            transformation_map[node] = fn
            node_flag = True
        if node_flag:
            break
    if not node_flag: 
        final_node_list.append(node)

def transform_node(node_name):
    if node_name in transformation_map:
        return transformation_map[node_name].title()
    return node_name.title()

filtered_relations['node1_t3'] = filtered_relations.apply(lambda x: transform_node(x['node1_t2']), axis = 1)
filtered_relations['node2_t3'] = filtered_relations.apply(lambda x: transform_node(x['node2_t2']), axis = 1)
filtered_relations[:50]

  return doc1.similarity(doc2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_relations['node1_t3'] = filtered_relations.apply(lambda x: transform_node(x['node1_t2']), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_relations['node2_t3'] = filtered_relations.apply(lambda x: transform_node(x['node2_t2']), axis = 1)


Unnamed: 0,news_id,subject,relation,object,node1_t1,node2_t1,node1_t2,node2_t2,node1_t3,node2_t3
4,0,leader ’s stance,roil,LGBT community,[leader ’ s stance],[lgbt],leader stance,LGBT community,Governor Veto,Lgbt Community
10,0,Catholic leader ’s stance,roil,LGBT community,[catholic leader ’ s stance],[lgbt],catholic leader stance,LGBT community,Catholic Leader Stance,Lgbt Community
12,1,France,Has,Historic New Prime Minister,[france],"[historic, prime]",France,historic,France,Historic
13,1,France,Has,Historic Prime Minister,[france],[historic prime],France,historic prime,France,Prime
14,1,France,Has,New Prime Minister,[france],[prime],France,prime,France,Prime
15,1,France,Has,Prime Minister,[france],[prime],France,prime,France,Prime
18,2,Ohio House votes,override,sports ban,"[ohio, house votes]",[sports ban],Ohio,sports ban,Ohio,Sports Ban
19,2,Ohio House votes,override,governor 's veto,"[ohio, house votes]",[governor 's veto],Ohio,governor veto,Ohio,Governor Veto
20,2,Ohio House votes,override sports ban on,trans care,"[ohio, house votes]",[trans care],Ohio,trans care,Ohio,Trans
21,2,Ohio House,override,Gov. Mike DeWine ’s veto,[ohio],"[gov, mike dewine, ’ s veto]",Ohio,gov,Ohio,Gov


In [11]:
# Forth transformation: drop duplicate relations within one news

transformed_relations = filtered_relations.drop_duplicates(subset = ['news_id', 'node1_t3', 'node2_t3'], keep ='first')
transformed_relations.head()
# transformed_relations.to_csv(output_path + "/transformed_relations.csv")

Unnamed: 0,news_id,subject,relation,object,node1_t1,node2_t1,node1_t2,node2_t2,node1_t3,node2_t3
4,0,leader ’s stance,roil,LGBT community,[leader ’ s stance],[lgbt],leader stance,LGBT community,Leader Stance,Lgbt Community
12,1,France,Has,Historic New Prime Minister,[france],"[historic, prime]",France,historic,France,Historic
13,1,France,Has,Historic Prime Minister,[france],[historic prime],France,historic prime,France,Prime
18,2,Ohio House votes,override,sports ban,"[ohio, house votes]",[sports ban],Ohio,sports ban,Ohio,Sports Ban
19,2,Ohio House votes,override,governor 's veto,"[ohio, house votes]",[governor 's veto],Ohio,governor veto,Ohio,Governor Veto
