In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from konlpy.tag import Okt
import re
import os
import networkx as nx

In [None]:
# 1. Read the article metadata table (tab-separated, UTF-8 encoded)
tsv_file_path = "C:\\Users\\WINDOWS11\\Desktop\\kpop_agenda\\Step1\\articles_metadata.tsv"  # backslashes are escaped, so this is a literal Windows path
df = pd.read_csv(
    tsv_file_path,
    encoding='utf-8',
    sep='\t',
)

In [None]:
# 2. Read every article and run it through KoNLPy's Okt tagger (noun extraction)
article_ids = list()        # article IDs, appended in the order the articles are processed
articles_ner_info = dict()  # article ID -> extraction result dict for that article
okt = Okt()                 # single shared tagger instance, used by extract_ner_information

def extract_ner_information(article_content, min_entity_length=2):
    """
    Tokenize an article with KoNLPy Okt and collect its nouns.

    Despite the "NER" / "person" naming, no real named-entity recognition is
    performed: every noun with at least ``min_entity_length`` characters is
    kept as a candidate entity.

    Args:
        article_content (str): Raw article text.
        min_entity_length (int): Minimum number of characters a noun needs in
            order to be added to ``person_entities``. The default of 2 keeps
            nouns that are longer than one character.

    Returns:
        dict: with the keys
            "original_text"        - the input text, unchanged
            "person_entities"      - set of nouns meeting the length threshold
            "cleaned_article_text" - all nouns (of any length), in tagger
                                     order, joined by single spaces
    """
    # Blank input contains no nouns, so skip the tagger call entirely.
    if not article_content.strip():
        return {
            "original_text": article_content,
            "person_entities": set(),
            "cleaned_article_text": "",
        }

    # norm/stem are passed through to Okt.pos exactly as before.
    pos_tags = okt.pos(article_content, norm=True, stem=True)

    # Only tokens tagged "Noun" are kept; every other part of speech is dropped.
    nouns = [token for token, tag in pos_tags if tag == "Noun"]
    person_entities = {token for token in nouns if len(token) >= min_entity_length}

    return {
        "original_text": article_content,  # kept for later inspection
        "person_entities": person_entities,
        "cleaned_article_text": " ".join(nouns),  # noun-only text used for embedding
    }

# Article paths in the TSV are resolved relative to the folder that holds the TSV.
base_dir = os.path.dirname(tsv_file_path)

for article_id, file_path in zip(df['ID'], df['file_path']):
    full_file_path = os.path.join(base_dir, file_path)
    try:
        # Only the file read is guarded; extraction errors should surface, not be
        # mistaken for a missing file.
        with open(full_file_path, 'r', encoding='utf-8') as f:
            article_content = f.read()
    except FileNotFoundError:
        print(f"Warning: File not found: {full_file_path}")
        # Placeholder with the same keys a successful extraction produces, so
        # downstream code can read any key without special-casing missing files.
        ner_info = {"original_text": "", "person_entities": set(), "cleaned_article_text": ""}
    else:
        ner_info = extract_ner_information(article_content)

    # Every row gets an entry (even when its file is missing) so that
    # article_ids stays aligned with the rows of df.
    articles_ner_info[article_id] = ner_info
    article_ids.append(article_id)

In [None]:
# 3. Sentence embeddings, computed from each article's noun-only text
# NOTE(review): 'all-mpnet-base-v2' appears to be an English-trained checkpoint, while the
# articles are tagged with a Korean tagger (Okt) - confirm whether a multilingual model is
# a better fit. Changing it would shift the similarity scale and require re-tuning the threshold.
model = SentenceTransformer('all-mpnet-base-v2')

# One text per entry of article_ids, in the same order, so embedding row k belongs to article_ids[k].
cleaned_articles_for_embedding = [
    articles_ner_info[key]['cleaned_article_text']
    for key in article_ids
]
embeddings = model.encode(cleaned_articles_for_embedding)

In [None]:
# 4. Pairwise cosine similarity between all article embeddings.
# similarity_matrix[i, j] is the score for the articles at positions i and j of article_ids.
similarity_matrix = cosine_similarity(embeddings)

In [None]:
def calculate_ner_similarity(article_id1, article_id2, embedding_similarity, diff_penalty_weight=0.93):
    """
    Lower an embedding similarity score when two articles mention different entities.

    The entity sets are read from the module-level ``articles_ner_info`` dict.
    When both articles have at least one entity, the penalty is

        (1 - shared / min(len(set1), len(set2))) * diff_penalty_weight

    so it is 0 when the smaller set is fully contained in the larger one and
    ``diff_penalty_weight`` when the sets are disjoint. If either set is empty
    no penalty is applied.

    Args:
        article_id1: Key of the first article in ``articles_ner_info``.
        article_id2: Key of the second article in ``articles_ner_info``.
        embedding_similarity (float): Similarity score to refine.
        diff_penalty_weight (float): Maximum penalty, applied for fully
            disjoint entity sets. Defaults to 0.93, the previously hard-coded value.

    Returns:
        The penalized similarity, clamped so it is never below 0.

    Raises:
        KeyError: If either article ID is missing from ``articles_ner_info``.
    """
    entities1 = articles_ner_info[article_id1]["person_entities"]
    entities2 = articles_ner_info[article_id2]["person_entities"]

    person_entity_diff_penalty = 0
    if entities1 and entities2:
        person_entity_overlap = len(entities1.intersection(entities2))
        # Normalizing by the smaller set means a short article whose entities all
        # appear in a longer one is not penalized.
        smaller_set_size = min(len(entities1), len(entities2))
        person_entity_diff_ratio = 1 - (person_entity_overlap / smaller_set_size)
        person_entity_diff_penalty = person_entity_diff_ratio * diff_penalty_weight

    refined_similarity = embedding_similarity - person_entity_diff_penalty

    return max(0, refined_similarity)

In [None]:
# 7. Graph-Based Clustering with NER-aware Similarity
# Articles are nodes; an edge joins two articles whose refined similarity exceeds
# the threshold. Each connected component then becomes one group of duplicates.
threshold = 0.509  # Adjust threshold - might need to be lower as we are refining similarity
graph = nx.Graph()

# Articles with no cleaned text (e.g. missing files) all embed the same empty string
# and have no entities to penalize, so they would look like perfect duplicates of
# one another. They are kept as isolated nodes instead of being linked.
has_text = [bool(articles_ner_info[article_id]["cleaned_article_text"]) for article_id in article_ids]

for i in range(len(article_ids)):
    id1 = article_ids[i]
    graph.add_node(id1)  # every article is a node, even if it ends up with no edges
    if not has_text[i]:
        continue
    for j in range(i + 1, len(article_ids)):
        if not has_text[j]:
            continue
        id2 = article_ids[j]
        embedding_similarity = similarity_matrix[i, j]  # row/column order follows article_ids
        refined_similarity = calculate_ner_similarity(id1, id2, embedding_similarity)

        if refined_similarity > threshold:
            graph.add_edge(id1, id2, weight=refined_similarity)  # similarity kept as edge weight

connected_components = list(nx.connected_components(graph))

# Number the components 1, 2, 3, ... and give every article its component's number.
unique_ids = {}
next_unique_id = 1

for component in connected_components:
    for article_id in component:
        unique_ids[article_id] = next_unique_id
    next_unique_id += 1


In [None]:
# 8. Attach each article's cluster number to the metadata table
# IDs that have no entry in unique_ids come out as NaN from .map().
cluster_id_per_row = df['ID'].map(unique_ids)
df['unique_article_ID'] = cluster_id_per_row

In [None]:
# 9. Write the annotated table to a new tab-separated file (the input TSV is left as is)
output_tsv_file = "C:\\Users\\WINDOWS11\\Desktop\\kpop_agenda\\Step1\\articles_metadata_deduplicated.tsv"  # destination; edit to save elsewhere
df.to_csv(
    output_tsv_file,
    index=False,  # do not write the DataFrame index as an extra column
    sep='\t',
    encoding='utf-8',
)

print(f"Deduplicated TSV file saved to: {output_tsv_file}")