In [29]:
import csv
from collections import deque

# Data structures
scientists = {}papers = {}

In [31]:
def load_data(directory):
    """
    Load data from CSV files into memory.
    """
    with open(f"{directory}scientists.csv", encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            scientists[row["scientist_id"]] = {
                "name": row["name"],
                "papers": set()
            }
    
    with open(f"{directory}papers.csv", encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            papers[row["paper_id"]] = {
                "title": row["title"],
                "year": row["year"],
                "authors": set()
            }
    
    with open(f"{directory}authors.csv", encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            scientist_id = row["scientist_id"]
            paper_id = row["paper_id"]
            if scientist_id in scientists:
                scientists[scientist_id]["papers"].add(paper_id)
            if paper_id in papers:
                papers[paper_id]["authors"].add(scientist_id)


In [33]:
def neighbors_for_person(scientist_id):
    """
    Returns (paper_id, co-author_id) pairs for a given scientist.
    """
    neighbors = set()
    for paper_id in scientists[scientist_id]["papers"]:
        for coauthor_id in papers[paper_id]["authors"]:
            if coauthor_id != scientist_id:
                neighbors.add((paper_id, coauthor_id))
    return neighbors


In [35]:
def shortest_path(source_id, target_id):
    """
    Returns the shortest list of (paper_id, scientist_id) pairs
    that connect the source to the target.
    If no path, returns None.
    """
    frontier = deque()
    frontier.append((source_id, []))
    explored = set()

    while frontier:
        current_id, path = frontier.popleft()
        explored.add(current_id)

        for paper_id, neighbor_id in neighbors_for_person(current_id):
            if neighbor_id == target_id:
                return path + [(paper_id, neighbor_id)]

            if neighbor_id not in explored:
                frontier.append((neighbor_id, path + [(paper_id, neighbor_id)]))
                explored.add(neighbor_id)

    return None


In [37]:
def main():
    data_choice = input("Small or Large dataset? (small/large): ").strip().lower()

    if data_choice == "small":
        directory = "DataSmall"
    elif data_choice == "large":
        directory = "DataLarge"
    else:
        print("Invalid choice.")
        return
    
    load_data(directory)

    source_name = input("Enter the name of the source scientist: ").strip()
    target_name = input("Enter the name of the target scientist: ").strip()

    # Find IDs
    source_id = None
    target_id = None

    for scientist_id, info in scientists.items():
        if info["name"].lower() == source_name.lower():
            source_id = scientist_id
        if info["name"].lower() == target_name.lower():
            target_id = scientist_id

    if source_id is None or target_id is None:
        print("Scientist not found.")
        return

    path = shortest_path(source_id, target_id)

    if path is None:
        print("No connection found.")
    else:
        degrees = len(path)
        print(f"{degrees} degrees of separation.")
        current_id = source_id
        for i, (paper_id, scientist_id) in enumerate(path, 1):
            paper = papers[paper_id]["title"]
            scientist = scientists[scientist_id]["name"]
            print(f"{i}: {scientists[current_id]['name']} and {scientist} co-authored \"{paper}\"")
            current_id = scientist_id


In [39]:
main()


Small or Large dataset? (small/large):  small
Enter the name of the source scientist:  Elı́as Campo
Enter the name of the target scientist:  Jun Zhang


KeyError: 'A5074953707'