In [None]:
import os
import json
import pandas as pd
from glob import glob

def merge_json_to_csv(metadata_folder='/home/lenovo3/Desktop/Alvin/NUS_ISS/PaperMatch/datasets/metadata/', output_csv='/home/lenovo3/Desktop/Alvin/NUS_ISS/PaperMatch/datasets/metadata/metadata.csv'):
    """Merge every ``*.json`` file in a folder into a single CSV file.

    Each JSON file may hold either one object (one record) or a list of
    records. Files are processed in sorted path order so the row order of
    the resulting CSV is reproducible between runs.

    Args:
        metadata_folder: Folder that is searched (non-recursively) for
            ``*.json`` files.
        output_csv: Path of the CSV file to write.

    Returns:
        None. A message is printed when the CSV is written, when a file
        cannot be parsed, or when no usable records were found (in which
        case no CSV is written).
    """
    all_data = []

    # glob() makes no ordering guarantee; sort for a deterministic row order.
    json_files = sorted(glob(os.path.join(metadata_folder, '*.json')))

    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            # A single malformed or mis-encoded file must not abort the merge.
            print(f"Error reading {json_file}: {e}")
            continue

        if isinstance(data, list):
            all_data.extend(data)
        elif isinstance(data, dict):
            all_data.append(data)
        else:
            # Valid JSON, but a bare scalar cannot be turned into a record.
            print(f"Skipping {json_file}: expected a JSON object or list, got {type(data).__name__}")

    if not all_data:
        print("No valid data found in the JSON files.")
        return

    # Convert to DataFrame and save as CSV
    df = pd.DataFrame(all_data)
    df.to_csv(output_csv, index=False)
    print(f"Metadata saved to {output_csv}")

# Example usage: called with no arguments, so the function's default
# metadata_folder and output_csv paths are used.
merge_json_to_csv()

In [None]:
import os
import pandas as pd

def generate_cluster_csvs(metadata_csv='/home/lenovo3/Desktop/Alvin/NUS_ISS/PaperMatch/datasets/academic_metadata.csv', parent_folder='/home/lenovo3/Desktop/Alvin/NUS_ISS/PaperMatch/datasets/Academic_Clusters/Organised_Crime_and_Drug_Trafficking/'):
    """Write a per-cluster slice of the metadata CSV into each cluster folder.

    Every immediate sub-directory of ``parent_folder`` is treated as one
    cluster. The metadata rows whose ``filename`` value matches the name of
    an entry in that sub-directory are saved to
    ``<sub-directory>/cluster_metadata.csv``.

    Args:
        metadata_csv: Path of the metadata CSV; it must have a
            ``filename`` column.
        parent_folder: Folder whose sub-directories are the clusters.

    Raises:
        ValueError: If the metadata CSV has no ``filename`` column.
    """
    records = pd.read_csv(metadata_csv)

    # The filename column is the join key; without it nothing can be matched.
    if 'filename' not in records.columns:
        raise ValueError("'filename' column not found in the metadata CSV.")

    for entry_name in os.listdir(parent_folder):
        cluster_dir = os.path.join(parent_folder, entry_name)

        # Plain files sitting next to the cluster folders are ignored.
        if not os.path.isdir(cluster_dir):
            continue

        # Names of everything stored in this cluster folder.
        present_names = set(os.listdir(cluster_dir))

        # Keep only the metadata rows that belong to this cluster.
        belongs_here = records['filename'].isin(present_names)
        cluster_records = records.loc[belongs_here]

        # The slice is written alongside the cluster's own files.
        destination = os.path.join(cluster_dir, 'cluster_metadata.csv')
        cluster_records.to_csv(destination, index=False)
        print(f"Saved {len(cluster_records)} records to {destination}")

# Example usage: called with no arguments, so the function's default
# metadata_csv and parent_folder paths are used.
generate_cluster_csvs()


In [14]:
def load_similarity_dict_from_file(filepath):
    """Parse a text file of titled similarity blocks into a nested dict.

    The expected layout is a topic title on its own line, followed by a
    block that opens with a line starting with ``{`` and closes with a line
    starting with ``}``. Inside the block every line has the form
    ``'<key>': <float>,`` (quotes and trailing comma optional). Blank lines
    are ignored.

    Args:
        filepath: Path of the text file to read (decoded as UTF-8).

    Returns:
        dict mapping each topic title to a dict of ``{key: float}``. A title
        that is not followed by any valid entry maps to an empty dict.
        Lines that cannot be parsed, and entries in a block that has no
        title line before it, are reported with ``print`` and skipped.
    """
    topic_dict = {}
    current_topic = None
    inside_block = False

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # Detect topic title
            if not inside_block and not line.startswith('{'):
                current_topic = line
                topic_dict[current_topic] = {}
                continue

            # Start of similarity dictionary block
            if line.startswith('{'):
                inside_block = True
                continue

            # End of similarity dictionary block
            if line.startswith('}'):
                inside_block = False
                current_topic = None
                continue

            # Key-value pairs
            if inside_block and ':' in line:
                if current_topic is None:
                    # A block opened without a title line: there is no topic
                    # to attach the entry to, so report it instead of
                    # failing with a KeyError.
                    print(f"Skipping line with no topic title: {line}")
                    continue
                try:
                    # Split on the LAST colon: the value is a plain float,
                    # whereas the key itself may legitimately contain ':'.
                    key, value = line.rsplit(':', 1)
                    key = key.strip().strip("'\"")  # remove quotes
                    value = float(value.strip().rstrip(','))
                    topic_dict[current_topic][key] = value
                except ValueError:
                    print(f"Skipping invalid line: {line}")

    return topic_dict


In [21]:
import os
import matplotlib.pyplot as plt
import networkx as nx
import math
from matplotlib.lines import Line2D

def plot_similarity_graphs_from_dict(topic_dict, output_dir='output_graphs', top_n=20):
    """Save one similarity graph image per topic.

    For every topic the ``top_n`` highest-scoring pairs are drawn as a
    graph: the left-hand side of each pair becomes a green node (with
    ``cluster_`` renamed to ``Acad_``), the right-hand side a red node
    prefixed with ``News_``, and the similarity is shown on the edge.

    Args:
        topic_dict: Mapping of topic title to ``{"<left> -> <right>": score}``.
        output_dir: Folder the PNG files are written to; created if missing.
        top_n: Maximum number of highest-scoring pairs drawn per topic.

    Returns:
        None. Topics with no entries, or with no well-formed pair among the
        selected ones, produce no image. Pairs that do not contain exactly
        one ``' -> '`` separator are reported with ``print`` and skipped.
    """
    os.makedirs(output_dir, exist_ok=True)

    for topic, sim_dict in topic_dict.items():
        if not sim_dict:
            continue

        # Sort the items by similarity score in descending order
        sorted_items = sorted(sim_dict.items(), key=lambda x: x[1], reverse=True)

        # Select the top pairs, keeping only those that can be split into
        # exactly two endpoints.
        top_pairs = []
        for pair, similarity in sorted_items[:max(top_n, 0)]:
            endpoints = pair.split(' -> ')
            if len(endpoints) != 2:
                print(f"Skipping malformed pair in topic '{topic}': {pair!r}")
                continue
            top_pairs.append((pair, endpoints[0], endpoints[1], similarity))

        if not top_pairs:
            continue

        # Create graph
        G = nx.Graph()

        for _, key1, key2, similarity in top_pairs:
            key1_renamed = key1.replace('cluster_', 'Acad_')
            key2_renamed = f"News_{key2}"

            G.add_node(key1_renamed, color='green')
            G.add_node(key2_renamed, color='red')
            G.add_edge(key1_renamed, key2_renamed, weight=similarity)

        # Layout and drawing
        pos = nx.spring_layout(G, seed=42)
        node_colors = [G.nodes[node]['color'] for node in G.nodes]

        # Draw on an explicit subplot Axes. Without ax=, nx.draw adds a
        # full-figure Axes via add_axes, which tight_layout cannot manage.
        fig, ax = plt.subplots(figsize=(16, 12))
        try:
            nx.draw(G, pos, ax=ax, with_labels=True, node_color=node_colors,
                    edge_color='gray', node_size=800, font_size=8)

            # Draw edge labels with similarity scores
            edge_labels = nx.get_edge_attributes(G, 'weight')
            formatted_edge_labels = {k: f"{v:.2f}" for k, v in edge_labels.items()}
            nx.draw_networkx_edge_labels(G, pos, edge_labels=formatted_edge_labels,
                                         font_size=6, ax=ax)

            # Title for the graph
            ax.set_title(f"Semantic Similarity Graph: {topic}", fontsize=14, fontweight='bold')

            # Prepare the legend: two node-type entries, then one per pair
            legend_elements = [
                Line2D([0], [0], marker='o', color='w', label='Academic Cluster (Acad_i)',
                       markerfacecolor='green', markersize=10),
                Line2D([0], [0], marker='o', color='w', label='News Segment (News_i)',
                       markerfacecolor='red', markersize=10)
            ]
            legend_elements.extend(
                Line2D([0], [0], color='gray', label=f"{pair}: {similarity:.2f}", linestyle='-', linewidth=1)
                for pair, _, _, similarity in top_pairs
            )

            fig.tight_layout()

            # Anchor the legend just outside the top-right corner of the axes
            ax.legend(handles=legend_elements, fontsize=10, bbox_to_anchor=(1, 1),
                      loc='upper left', frameon=True)

            # Replace spaces and characters that are unsafe in file names
            # (e.g. '/' would otherwise be read as a directory separator).
            safe_topic = ''.join('_' if ch in ' /\\:*?"<>|' else ch for ch in topic)
            filename = os.path.join(output_dir, f"{safe_topic}.png")

            # bbox_inches='tight' keeps the outside legend inside the image
            fig.savefig(filename, bbox_inches='tight')
        finally:
            # Always release the figure, even if drawing or saving failed
            plt.close(fig)


In [22]:
# Step 1: Parse the similarity-score text file into a dict (stored in topic_dict)
filepath = '/home/lenovo3/Desktop/Alvin/NUS_ISS/PaperMatch/Graph_Network/sim_score_graph_network.txt'
topic_dict = load_similarity_dict_from_file(filepath)

# Step 2: Generate graphs from topic_dict, writing them to the output_dir given below
plot_similarity_graphs_from_dict(topic_dict, output_dir='/home/lenovo3/Desktop/Alvin/NUS_ISS/PaperMatch/Graph_Network/output_graphs')


  plt.tight_layout()
