# In [None]:
#%run AcademicCollabProject.py

from src.preprocessing import build_author_graph
from src.features import extract_features_from_graph
from src.model import train_model, evaluate_model, explain_model
from src.visualize import draw_collab_graph
from sklearn.model_selection import train_test_split

# Baseline models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

def run_baseline_models(X_train, X_test, y_train, y_test, output_path="baseline_results.txt"):
    """Fit the baseline classifiers, print their reports, and save them to a file.

    A logistic regression, an SVM, and a decision tree are each fitted on the
    training split and used to predict the test split. A classification
    report per model is printed and all reports are written to
    ``output_path``.

    Args:
        X_train: Training features, passed to each classifier's ``fit``.
        X_test: Test features, passed to each classifier's ``predict``.
        y_train: Training labels.
        y_test: Test labels the predictions are scored against.
        output_path: Text file that receives the reports; it is overwritten.
    """
    # Imported locally: the visible module-level imports do not include
    # ``time`` (main() imports it only into its own scope), so relying on a
    # global name here would raise NameError.
    import time

    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "SVM": SVC(),
        "Decision Tree": DecisionTreeClassifier(),
    }

    results = []

    for name, clf in models.items():
        print(f"\n=== {name} ===")
        # perf_counter is monotonic, so the elapsed value cannot go negative
        # if the wall clock is adjusted mid-run.
        start_time = time.perf_counter()

        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)

        # The reported duration covers both fitting and predicting.
        elapsed = time.perf_counter() - start_time
        minutes, seconds = divmod(elapsed, 60)
        print(f"Training time: {int(minutes):02d}:{int(seconds):02d}")

        report = classification_report(y_test, preds)
        print(f"\n=== {name} ===")
        print(report)
        results.append(f"=== {name} ===\n{report}\n")

    # Save all results to a text file
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(results)

    print(f"\nBaseline model results saved to {output_path}")

import os
import pickle # For saving/loading graphs

"""
Focus on:
- Academic collaboration patterns
- Coauthorship, citation impact, and publication behavior
- Classifying strong vs. weak collaborations

All of this can be modeled in a single unified author graph.
There is no need to separate papers, authors, and fields into distinct
.pk graphs unless doing one of the following:
- Multi-type message passing (e.g., with heterogeneous GNNs)
- Explicit field-level domain training
"""

def main():
    """Run the collaboration pipeline end to end, prompting the user as needed.

    Steps, in order:
      1. Load the author graph from the pickle cache, or build it from the
         raw TSV files and write the cache.
      2. Draw the collaboration graph (asks before overwriting an existing
         image).
      3. Extract features and split them 80/20 into train and test sets.
      4. Either load a saved model and evaluate it, or train, save, and
         evaluate a new one and then run the baseline models.
      5. Cluster all samples with KMeans and save a confusion-matrix heatmap
         of the model's predictions against the cluster labels.

    Returns early (``None``) if the user declines to train when no model is
    available to test.
    """
    import time

    raw_data_dir = os.path.join("data", "raw")
    graph_cache = os.path.join("data", "author_graph.gpickle")
    model_path = "randomforest_model.pkl"
    vis_path = "top_authors_graph.png"

    # Load or build graph with timer
    start_time = time.time()

    if os.path.exists(graph_cache):
        print("Loading graph from file...")
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # cache files that this project wrote itself.
        with open(graph_cache, "rb") as f:
            G = pickle.load(f)
        print("Graph loaded from cache.")
    else:
        print("Building author collaboration graph...")

        papers_file = os.path.join(raw_data_dir, "Papers_CS_20190919.tsv")
        authors_file = os.path.join(raw_data_dir, "PAuAf_CS_20190919.tsv")
        citations_file = os.path.join(raw_data_dir, "PR_CS_20190919.tsv")
        G = build_author_graph(papers_file, authors_file, citations_file)
        with open(graph_cache, "wb") as f:
            pickle.dump(G, f)
        print("Graph saved to:", graph_cache)

    elapsed_time = time.time() - start_time
    hours, rem = divmod(elapsed_time, 3600)
    minutes, seconds = divmod(rem, 60)
    print(f"Graph ready in (hh:mm:ss) {int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}")

    # Visualize: draw unconditionally when no image exists yet, otherwise
    # only when the user confirms the overwrite.
    print("Visualizing coauthor network...")
    should_draw = True
    if os.path.exists(vis_path):
        resp = input(f"Visualization already exists: {vis_path}\nDo you want to overwrite it? (y/n): ").strip().lower()
        should_draw = resp == "y"
    if should_draw:
        draw_collab_graph(G, max_nodes=100, save_path=vis_path)
    else:
        print("[User Prompt - Skipped visualization.]")

    # Feature extraction
    print("Extracting features for ML...")
    X, y = extract_features_from_graph(G)
    print(f"{len(X)} collaboration pairs extracted.")

    # Split data
    print("Splitting data into train and test sets...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Prompt for model choice
    choice = input("Train model or test an existing one? (train/test): ").strip().lower()

    # NOTE(review): load_model / save_model are not imported in the visible
    # part of this file; they must be defined elsewhere (e.g. another
    # notebook cell) for these paths to run - confirm.
    if choice == "test" and os.path.exists(model_path):
        model = load_model(model_path)
        print("Evaluating model on test data...")
        evaluate_model(model, X_test, y_test)
        print("Explaining model via SHAP...")
        explain_model(model, X_test)
    elif choice == "train":
        print("Training will begin...")
    else:
        # Two distinct reasons lead here; report the one that applies
        # instead of always blaming a missing model file.
        if choice == "test":
            print(f"Model file not found at {model_path}.")
        else:
            print(f"Unrecognized choice: {choice!r}.")
        retry = input("Would you like to train the model instead? (y/n): ").strip().lower()
        if retry == "y":
            choice = "train"
        else:
            print("Exiting. No model to test.")
            return

    # If retrain is picked
    if choice == "train":
        print("Training model...")
        model = train_model(X_train, y_train)
        save_model(model, model_path)

        print("Evaluating model on test data...")
        evaluate_model(model, X_test, y_test)

        print("Explaining model via SHAP...")
        explain_model(model, X_test)

        # Baseline comparison
        print("Running baseline models for comparison...")
        run_baseline_models(X_train, X_test, y_train, y_test)

    # KMeans: imported here so these optional plotting/clustering
    # dependencies are only required once this stage is reached.
    from src.kmeans_model import run_kmeans
    from sklearn.metrics import confusion_matrix
    import seaborn as sns
    import matplotlib.pyplot as plt

    # Run KMeans after feature extraction
    kmeans, cluster_labels = run_kmeans(X, n_clusters=2)

    # Compare the model's predictions on all samples with the cluster labels.
    # Rows are model predictions, columns are KMeans clusters.
    rf_preds = model.predict(X)
    cm = confusion_matrix(rf_preds, cluster_labels)

    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.xlabel("KMeans Cluster")
    plt.ylabel("RandomForest Prediction")
    plt.title("RF vs KMeans Confusion Matrix")
    plt.savefig("rf_vs_kmeans_confusion.png")
    plt.close()


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

# Cell output: Loading graph from file...
