### Code needed to clean the CSV datasets

In [78]:
import os
import csv
import pandas as pd

# === Raw input files (as delivered) ===
meetings_in = "Dataset/Original/Montagna_Meetings_Edgelist.csv"
calls_in = "Dataset/Original/Montagna_Phone_Calls_Edgelist.csv"
roles_in = "Dataset/Original/Montagna_Roles.csv"

# === Cleaned output files ===
meetings_out = "Dataset/Cleaned/Montagna_Meetings_Clean.csv"
calls_out = "Dataset/Cleaned/Montagna_Phone_Calls_Clean.csv"
roles_out = "Dataset/Cleaned/Montagna_Roles_Clean.csv"

# The cleaned files are written into this folder; create it up front
# (a no-op when it already exists).
os.makedirs('Dataset/Cleaned', exist_ok=True)

In [79]:
def _parse_node_id(label):
    """Convert a node label such as ``"N12"`` (or plain ``"12"``) to the int ``12``.

    Only a leading ``N`` is removed. Raises ``ValueError`` when the remainder is
    not an integer.
    """
    label = label.strip()
    if label.startswith("N"):
        label = label[1:]
    return int(label)


def clean_edgelist(input_file, output_file, delimiter_in):
    """Rewrite a raw edge list as a header-less ``source,target,weight`` CSV.

    The first line of ``input_file`` is treated as a header and dropped. For
    every following row the first two fields are node labels (an optional
    leading ``N`` is removed) and the third field is the weight; all three are
    written as integers. Only the first three fields of a row are kept, so a
    trailing delimiter cannot add a fourth column to the output.

    Rows that have fewer than three fields, or whose fields cannot be converted
    to integers, are reported on stdout and skipped so that every row in the
    output file is fully numeric.

    Parameters
    ----------
    input_file : str
        Path of the raw edge list (UTF-8).
    output_file : str
        Path of the comma-separated file to write (overwritten if present).
    delimiter_in : str
        Single-character field delimiter used by ``input_file``.
    """
    with open(input_file, "r", encoding="utf-8") as infile, \
         open(output_file, "w", encoding="utf-8", newline='') as outfile:

        reader = csv.reader(infile, delimiter=delimiter_in)
        writer = csv.writer(outfile)

        # Skip the header; the default keeps an empty input file from
        # raising StopIteration.
        next(reader, None)

        for row in reader:
            # Blank lines come back from csv.reader as [] and land here too.
            if len(row) < 3:
                print(f"Skipping invalid row: {row}")
                continue
            try:
                # Convert all three fields together so a row is either written
                # fully converted or not written at all.
                cleaned = [
                    _parse_node_id(row[0]),
                    _parse_node_id(row[1]),
                    int(row[2]),
                ]
            except ValueError:
                print(f"Skipping invalid row: {row}")
                continue
            writer.writerow(cleaned)

In [80]:
# === CLEAN ROLES DATA ===
def clean_roles(input_file, output_file):
    """Rewrite the roles table without its header and without the ``N`` node prefix.

    The first line of ``input_file`` is treated as the header
    (Node,Role,Relationship,Request) and dropped. Every following row is
    written with exactly four columns: missing trailing columns are filled with
    empty strings and any columns beyond the fourth are discarded. A leading
    ``N`` is removed from the node column; the node is kept as text.

    Blank lines are reported on stdout and skipped.

    Parameters
    ----------
    input_file : str
        Path of the raw comma-separated roles file (UTF-8).
    output_file : str
        Path of the cleaned file to write (overwritten if present).
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8', newline='') as outfile:

        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        # Skip header (Node,Role,Relationship,Request); the default keeps an
        # empty input file from raising StopIteration.
        next(reader, None)

        for row in reader:
            # csv.reader yields [] for a blank line; indexing it would raise.
            if not row:
                print(f"Skipping invalid row: {row}")
                continue
            # Pad to four fields, then cut to four, so unpacking is always safe.
            node, role, relationship, request = (row + [''] * 4)[:4]
            if node.startswith('N'):
                node = node[1:]
            writer.writerow([node, role, relationship, request])

In [81]:
# === Report & clean duplicates and self-loops ===
def clean_graph_data(file_path, name):
    """Report and remove self-loops and duplicate undirected edges.

    Reads a header-less CSV whose three columns are source, target and weight,
    prints a short report (initial edge count, self-loops, rows that share an
    unordered node pair, final edge count) and returns the cleaned edge list.

    Edges are treated as undirected: ``(a, b)`` and ``(b, a)`` are the same
    edge. Self-loops are dropped and the weights of rows that share an
    unordered pair are summed.

    Parameters
    ----------
    file_path : str
        Path of the header-less ``source,target,weight`` CSV.
    name : str
        Label printed in the report banner.

    Returns
    -------
    pandas.DataFrame
        Columns ``Source``, ``Target``, ``Weight`` with one row per unordered
        pair, the smaller node id in ``Source``, ordered by (Source, Target).
        The frame is empty (but still has the three columns) when no edge
        survives the cleaning.

    Raises
    ------
    ValueError
        If a value cannot be converted to ``int``.
    """
    df = pd.read_csv(file_path, header=None, names=["Source", "Target", "Weight"])

    print(f"\n--- {name} ---")
    print(f"Initial edges: {len(df)}")

    # Convert to int (if read from CSV as strings)
    df = df.astype({"Source": int, "Target": int, "Weight": int})

    # === Find and print self-loops ===
    self_loops = df[df["Source"] == df["Target"]]
    print(f"Self-loops found: {len(self_loops)}")
    if not self_loops.empty:
        print(self_loops)

    # === Find and print duplicate edges (unordered pairs) ===
    # Row-wise min/max gives the unordered pair without a Python-level apply.
    endpoints = df[["Source", "Target"]]
    low = endpoints.min(axis=1)
    high = endpoints.max(axis=1)
    df["key"] = list(zip(low, high))
    # keep=False flags every row of a repeated pair, so this counts rows, not pairs.
    duplicate_rows = df[df.duplicated(subset="key", keep=False)]
    print(f"Duplicate edges found: {len(duplicate_rows)}")
    if not duplicate_rows.empty:
        print(duplicate_rows.sort_values(by="key"))

    # === Remove self-loops ===
    edges = pd.DataFrame({"Source": low, "Target": high, "Weight": df["Weight"]})
    edges = edges[edges["Source"] != edges["Target"]]

    # === Aggregate duplicate edges by summing weights ===
    # Grouping on the normalised endpoint columns (instead of splitting a tuple
    # key back into columns) also works when no edge is left.
    df_cleaned = (
        edges.groupby(["Source", "Target"], as_index=False)["Weight"]
        .sum()
        .reindex(columns=["Source", "Target", "Weight"])
    )

    print(f"Cleaned edges: {len(df_cleaned)}")
    return df_cleaned

### Cleaning process

In [82]:
# === Step 1: Clean raw CSVs (removing 'N' prefix, header, and fixing types) ===
# The two edge lists differ only in their field delimiter.
for raw_path, clean_path, separator in (
    (meetings_in, meetings_out, ' '),
    (calls_in, calls_out, ','),
):
    clean_edgelist(raw_path, clean_path, delimiter_in=separator)

# The roles table has its own column layout and therefore its own cleaner.
clean_roles(roles_in, roles_out)

In [83]:
# === Step 2: Report and remove self-loops and duplicates ===
# Evaluated left to right, so the meetings report is printed before the calls report.
df_meetings_final, df_calls_final = [
    clean_graph_data(edge_file, label)
    for edge_file, label in (
        (meetings_out, "Meetings Network"),
        (calls_out, "Phone Calls Network"),
    )
]


--- Meetings Network ---
Initial edges: 289
Self-loops found: 1
     Source  Target  Weight
179      47      47       1
Duplicate edges found: 78
     Source  Target  Weight       key
14        5       6       1    (5, 6)
63        6       5       1    (5, 6)
56       12      11       1  (11, 12)
29       11      12       9  (11, 12)
30       11      13       1  (11, 13)
..      ...     ...     ...       ...
267      89      70       3  (70, 89)
268      93      70       2  (70, 93)
250      70      93       2  (70, 93)
266      89      93       2  (89, 93)
247      93      89       6  (89, 93)

[78 rows x 4 columns]
Cleaned edges: 248

--- Phone Calls Network ---
Initial edges: 150
Self-loops found: 0
Duplicate edges found: 58
     Source  Target  Weight         key
1        18      19       5    (18, 19)
2        19      18       6    (18, 19)
13       18      21       2    (18, 21)
14       21      18       1    (18, 21)
19       18      22       4    (18, 22)
61       22      18  

In [84]:
# === Step 3: Save cleaned data back ===
# Each de-duplicated frame overwrites the intermediate file written in Step 1,
# again without header or index column.
for cleaned_frame, destination in (
    (df_meetings_final, meetings_out),
    (df_calls_final, calls_out),
):
    cleaned_frame.to_csv(destination, index=False, header=False)