# Note

A JDK (Java Development Kit) must be installed to run this notebook, because `konlpy` depends on Java.

In [6]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from konlpy.tag import Okt
import re
import os
import networkx as nx

In [12]:
# Input locations. Both inputs live under the same Step1 folder, so the shared
# prefix is spelled out once and the two paths are derived from it.
STEP1_DIR = "C:/Users/WINDOWS 11/Desktop/kpop_agenda/Step1"
csv_path = STEP1_DIR + "/articles_metadata.csv"  # metadata table read into `df`
articles_dir = STEP1_DIR + "/Articles"  # folder searched for the article text files

In [13]:
# Load the article metadata table. Later cells read its 'ID' and 'file_path'
# columns, so the CSV is expected to provide both.
df = pd.read_csv(csv_path, encoding='utf-8')

In [16]:
# Single KoNLPy Okt tagger instance shared by extract_ner_information().
okt = Okt()

# articles_ner_info: article ID -> extraction result dict.
# article_ids: article IDs in the order they were processed.
articles_ner_info, article_ids = {}, []

def extract_ner_information(article_content):
    """Reduce an article to its nouns using the module-level ``okt`` tagger.

    The text is tagged with ``okt.pos(..., norm=True, stem=True)`` and only
    tokens tagged ``"Noun"`` are kept.

    Args:
        article_content: Raw article text.

    Returns:
        A dict with two keys:
          * ``"person_entities"``: set of the kept nouns that are longer than
            one character. Despite the key name, no person-specific filtering
            is applied; every such noun is included.
          * ``"cleaned_article_text"``: all kept nouns (including
            single-character ones) joined by single spaces, in original order.
    """
    nouns = [
        word
        for word, pos in okt.pos(article_content, norm=True, stem=True)
        if pos == "Noun"
    ]
    return {
        "person_entities": {word for word in nouns if len(word) > 1},
        "cleaned_article_text": " ".join(nouns),
    }

print(f"Looking for files in: {articles_dir}")

# Build one extraction record per metadata row. Every row ends up in
# articles_ner_info / article_ids, even when its text file is missing.
for _, record in df.iterrows():
    article_id = record['ID']

    # Only the file name from the CSV is used; it is resolved inside articles_dir.
    file_name = os.path.basename(str(record['file_path']))
    full_file_path = os.path.join(articles_dir, file_name)

    try:
        with open(full_file_path, 'r', encoding='utf-8') as handle:
            record_info = extract_ner_information(handle.read())
    except FileNotFoundError:
        # Missing file: report it and fall back to an empty placeholder record.
        print(f"FAILED to find: {full_file_path}")
        record_info = {"person_entities": set(), "cleaned_article_text": ""}

    articles_ner_info[article_id] = record_info
    article_ids.append(article_id)

print("Processing complete.")

Looking for files in: C:/Users/WINDOWS 11/Desktop/kpop_agenda/Step1/Articles
FAILED to find: C:/Users/WINDOWS 11/Desktop/kpop_agenda/Step1/Articles\20240625_article_5.txt
FAILED to find: C:/Users/WINDOWS 11/Desktop/kpop_agenda/Step1/Articles\20240729_article_5.txt
FAILED to find: C:/Users/WINDOWS 11/Desktop/kpop_agenda/Step1/Articles\20240803_article_10.txt
FAILED to find: C:/Users/WINDOWS 11/Desktop/kpop_agenda/Step1/Articles\20241015_article_10.txt
FAILED to find: C:/Users/WINDOWS 11/Desktop/kpop_agenda/Step1/Articles\20241223_article_4.txt
Processing complete.


In [17]:
# Sentence Embeddings (using cleaned noun-based text)
# NOTE(review): the texts are tokenised with KoNLPy's Okt, so they are presumably
# Korean, while 'all-mpnet-base-v2' is documented as an English model. Consider a
# multilingual model; the clustering threshold would need re-tuning. TODO confirm.
model = SentenceTransformer('all-mpnet-base-v2')

# One cleaned text per article, in article_ids order, so the k-th embedding
# belongs to article_ids[k].
cleaned_articles_for_embedding = [
    articles_ner_info[key]['cleaned_article_text']
    for key in article_ids
]
embeddings = model.encode(cleaned_articles_for_embedding)

In [18]:
# Pairwise cosine similarity between all article embeddings: entry [i, j]
# compares the i-th and j-th rows of `embeddings`.
similarity_matrix = cosine_similarity(embeddings) 

In [19]:
def calculate_ner_similarity(article_id1, article_id2, embedding_similarity, penalty_weight=0.93):
    """Discount an embedding similarity by how little two articles' entities overlap.

    Both articles are looked up in the module-level ``articles_ner_info``. When
    both have a non-empty ``"person_entities"`` set, the penalty is::

        (1 - |shared| / min(|set1|, |set2|)) * penalty_weight

    so a full overlap of the smaller set costs nothing and no overlap costs the
    whole ``penalty_weight``. If either set is empty, no penalty is applied.

    Args:
        article_id1: Key of the first article in ``articles_ner_info``.
        article_id2: Key of the second article in ``articles_ner_info``.
        embedding_similarity: Similarity score of the two articles' embeddings.
        penalty_weight: Maximum penalty, subtracted when the entity sets are
            disjoint. Defaults to 0.93, the value previously hard-coded here.

    Returns:
        ``embedding_similarity`` minus the penalty, floored at 0.

    Raises:
        KeyError: If either ID is not present in ``articles_ner_info``.
    """
    entities1 = articles_ner_info[article_id1]["person_entities"]
    entities2 = articles_ner_info[article_id2]["person_entities"]

    person_entity_diff_penalty = 0
    if entities1 and entities2:
        shared_count = len(entities1.intersection(entities2))
        # Normalise by the smaller set so a short article that is fully
        # contained in a longer one is not penalised.
        smaller_size = min(len(entities1), len(entities2))
        person_entity_diff_ratio = 1 - (shared_count / smaller_size)
        person_entity_diff_penalty = person_entity_diff_ratio * penalty_weight

    refined_similarity = embedding_similarity - person_entity_diff_penalty

    return max(0, refined_similarity)

In [20]:
# 7. Graph-Based Clustering with NER-aware Similarity
# Articles are nodes; an edge joins two articles whose refined similarity exceeds
# `threshold`. Each connected component then becomes one cluster.
threshold = 0.509  # Adjust threshold - might need to be lower as we are refining similarity
graph = nx.Graph()

# Articles whose file could not be read are stored with empty cleaned text and an
# empty entity set. Such placeholders all embed identically and receive no entity
# penalty, so comparing them would link every missing article into one false
# "duplicate" cluster. They are kept as nodes but never given an edge.
has_text = [
    bool(articles_ner_info[article_id]["cleaned_article_text"])
    for article_id in article_ids
]

for i in range(len(article_ids)):
    id1 = article_ids[i]
    graph.add_node(id1)  # every article gets a node, so singletons still get an ID
    if not has_text[i]:
        continue
    for j in range(i + 1, len(article_ids)):
        if not has_text[j]:
            continue
        id2 = article_ids[j]
        embedding_similarity = similarity_matrix[i, j]  # Get embedding similarity
        refined_similarity = calculate_ner_similarity(id1, id2, embedding_similarity)  # Calculate NER-aware similarity

        if refined_similarity > threshold:
            graph.add_edge(id1, id2, weight=refined_similarity)  # similarity kept as edge weight

connected_components = list(nx.connected_components(graph))

# Number the components 1, 2, 3, ... and give every member article that number.
unique_ids = {}
next_unique_id = 1

for component in connected_components:
    for article_id in component:
        unique_ids[article_id] = next_unique_id
    next_unique_id += 1


In [21]:
# 8. Add "unique_article_ID" column to DataFrame
# Each row's ID is looked up in unique_ids (article ID -> cluster number);
# an ID absent from that mapping would come out as NaN.
df['unique_article_ID'] = df['ID'].map(unique_ids) # vectorised lookup, no explicit loop

In [22]:
# 9. Save the updated CSV file
# The metadata table, now including the cluster column, is written without the
# DataFrame index. Edit the path below to change the destination.
output_csv_file = "C:/Users/WINDOWS 11/Desktop/kpop_agenda/Step1/articles_metadata_deduplicated.csv"
df.to_csv(path_or_buf=output_csv_file, index=False, encoding='utf-8')

print(f"Deduplicated CSV file saved to: {output_csv_file}")

Deduplicated CSV file saved to: C:/Users/WINDOWS 11/Desktop/kpop_agenda/Step1/articles_metadata_deduplicated.csv


In [23]:
# Report every cluster that contains more than one article, i.e. the groups of
# rows that share a unique_article_ID and are therefore treated as the same content.
clusters = df.groupby('unique_article_ID')

print("--- Similar Content Clusters ---")
found_duplicates = False

for cluster_id, group in clusters:
    # Single-article groups are not duplicates; skip them.
    if len(group) <= 1:
        continue

    found_duplicates = True
    print(f"\nCluster {cluster_id}:")
    # List each member's ID next to its file path for comparison.
    for member_id, member_path in zip(group['ID'], group['file_path']):
        print(f" - [ID: {member_id}] {member_path}")

if not found_duplicates:
    print("No duplicate clusters found with the current threshold.")

--- Similar Content Clusters ---

Cluster 7:
 - [ID: 7] C:/Users/WINDOWS 11/Desktop/kpop_agenda/20240101_article_7.txt
 - [ID: 20] C:/Users/WINDOWS 11/Desktop/kpop_agenda/20240102_article_10.txt

Cluster 11:
 - [ID: 11] C:/Users/WINDOWS 11/Desktop/kpop_agenda/20240102_article_1.txt
 - [ID: 14] C:/Users/WINDOWS 11/Desktop/kpop_agenda/20240102_article_4.txt
 - [ID: 35] C:/Users/WINDOWS 11/Desktop/kpop_agenda/20240104_article_5.txt
 - [ID: 37] C:/Users/WINDOWS 11/Desktop/kpop_agenda/20240104_article_7.txt
 - [ID: 41] C:/Users/WINDOWS 11/Desktop/kpop_agenda/20240105_article_1.txt
 - [ID: 43] C:/Users/WINDOWS 11/Desktop/kpop_agenda/20240105_article_3.txt
 - [ID: 2204] C:/Users/WINDOWS 11/Desktop/kpop_agenda/20240808_article_4.txt
 - [ID: 2207] C:/Users/WINDOWS 11/Desktop/kpop_agenda/20240808_article_7.txt
 - [ID: 2208] C:/Users/WINDOWS 11/Desktop/kpop_agenda/20240808_article_8.txt
 - [ID: 2221] C:/Users/WINDOWS 11/Desktop/kpop_agenda/20240810_article_1.txt

Cluster 15:
 - [ID: 16] C:/Users/