In [None]:
# Task A: network construction
from itertools import combinations

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

In [None]:
# Input CSVs at three scales, keyed by scale label (adjust the paths as needed)
files = dict(
    large="data/datasets/PROJECT_CHAT.csv",        # Large dataset
    medium="data/datasets/ADMINISTRATORS.CSV",     # Medium dataset
    small="data/datasets/INTERWIKI_CONFLICT.csv",  # Small dataset
)

In [None]:
# Read one dataset CSV and standardize its column labels
def load_data(file_path):
    """Read a CSV into a DataFrame and relabel its columns positionally.

    The first row of the file is consumed as the header by ``pd.read_csv``;
    those labels are then replaced, in order, by ``thread_subject``,
    ``username`` and ``page_name``.

    Args:
        file_path: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        The loaded DataFrame with the three standardized column names.

    Raises:
        ValueError: If the file does not contain exactly three columns.
    """
    expected_columns = ["thread_subject", "username", "page_name"]
    table = pd.read_csv(file_path)
    # Positional relabel: the file's own header names are discarded.
    table.columns = expected_columns
    return table

In [None]:
# Function to build social network
def build_network(df):
    """Construct an undirected co-participation graph from discussion rows.

    Rows are grouped by (page_name, thread_subject). Within each thread, every
    pair of distinct users is connected by a single unweighted edge.

    Args:
        df: DataFrame with "page_name", "thread_subject" and "username"
            columns.

    Returns:
        An ``nx.Graph`` whose nodes are usernames. Only edges are added, so a
        user who never shares a thread with anyone else does not appear.
    """
    G = nx.Graph()

    # groupby skips rows whose page or thread key is missing (pandas default).
    usernames_by_thread = df.groupby(["page_name", "thread_subject"])["username"]

    for _, usernames in usernames_by_thread:
        # Drop missing usernames first: a NaN would otherwise become a bogus
        # node that links otherwise unrelated users.
        users = usernames.dropna().unique()
        # One edge per unordered pair of users active in the same thread.
        G.add_edges_from(combinations(users, 2))

    return G

In [None]:
# Function to analyze network properties
def analyze_network(G):
    """Print basic size and connectivity metrics for a graph.

    Prints the node count, edge count, density, and the size of the largest
    connected component. Nothing is returned.

    Args:
        G: An undirected networkx graph. An empty graph is accepted and
            reports a largest-component size of 0.
    """
    print(f"Number of nodes (users): {G.number_of_nodes()}")
    print(f"Number of edges (connections): {G.number_of_edges()}")
    print(f"Network density: {nx.density(G):.4f}")

    # default=() keeps max() from raising ValueError when the graph has no
    # nodes and therefore no connected components.
    largest_cc = max(nx.connected_components(G), key=len, default=())
    print(f"Size of the largest connected component: {len(largest_cc)}")

In [None]:
# Function to visualize a subset of the network
def plot_network(G, title="Wikidata Editor Network", max_nodes=200, seed=42):
    """Draw a spring-layout plot of the first ``max_nodes`` nodes of a graph.

    Args:
        G: The networkx graph to visualize.
        title: Title shown above the plot.
        max_nodes: Upper bound on how many nodes are drawn. Nodes are taken in
            the graph's own iteration order, not ranked by importance.
        seed: Seed passed to ``nx.spring_layout`` so that re-running the cell
            reproduces the same picture. Pass ``None`` for a random layout.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # Draw only a bounded slice of the graph to keep the figure readable.
    shown_nodes = list(G.nodes)[:max_nodes]
    sub_G = G.subgraph(shown_nodes)

    pos = nx.spring_layout(sub_G, seed=seed)
    # Labels are omitted for a cleaner visualization.
    nx.draw(sub_G, pos, ax=ax, with_labels=False, node_size=50)
    ax.set_title(title)
    plt.show()

In [None]:
# Process three different scale datasets: for each entry in `files`, load the
# CSV, build its graph, print its metrics, and plot it.
# Each graph is kept in `networks`, keyed by its scale label.
# Note: `df` and `G` stay bound to the last dataset processed after the loop.
networks = {}
for size, file in files.items():
    print(f"\nProcessing {size} scale dataset: {file}")
    df = load_data(file)
    G = build_network(df)
    analyze_network(G)
    networks[size] = G  # Store the graph under its scale label
    plot_network(G, title=f"{size.capitalize()} Network")