In [14]:
import pandas as pd
import networkx as nx

def getReciprocal(n, d):
    """
    Divides ``d`` by ``n``, falling back to 0 when ``n`` is zero.

    Args:
        n: The divisor.
        d: The dividend.

    Returns:
        ``d / n`` when ``n`` is non-zero, otherwise 0.
    """
    return 0 if n == 0 else d / n

class Author(object):
    """
    A class to represent an author and their metadata.

    Two authors compare equal (and hash alike) when their first name, last
    name, gender, country code and specialization all match. Country, city
    and paperList do not take part in equality or hashing.

    Attributes:
        firstName (str): The author's first name.
        lastName (str): The author's last name.
        country (str): The author's affiliation country.
        country_code (str): The author's affiliation country code (e.g., 'USA').
        city (str): The author's affiliation city.
        gender (str): The author's gender.
        paperList (list): A list of publication IDs the author has contributed to.
        specialization (str): The author's field of study.
    """
    def __init__(self, paper, first, last, country, country_code, aff_city, gender, specialization):
        self.firstName = first
        self.lastName = last
        self.country = country
        self.country_code = country_code
        self.city = aff_city
        self.gender = gender
        self.paperList = []
        self.specialization = specialization
        # A falsy paper id (None, "") means "no paper to record yet".
        if paper:
            self.paperList.append(paper)

    def __eq__(self, other):
        """
        Defines two authors as equal if their key metadata attributes match.
        Equality is based on name, gender, country code, and specialization.

        Returns NotImplemented for non-Author operands so Python can fall
        back to its default comparison (which yields False for ``==`` and
        True for ``!=``, e.g. when comparing against None).
        """
        if not isinstance(other, Author):
            return NotImplemented
        return (self.firstName == other.firstName and
                self.lastName == other.lastName and
                self.gender == other.gender and
                self.country_code == other.country_code and
                self.specialization == other.specialization)

    def __hash__(self):
        """
        Generates a hash for the Author object.
        The hash is built from exactly the attributes compared in __eq__,
        so equal authors always share a hash.
        """
        return hash((self.firstName, self.lastName, self.gender,
                     self.country_code, self.specialization))

    def __repr__(self):
        """Returns a debugging representation showing the identity attributes."""
        return (f"Author(first={self.firstName!r}, last={self.lastName!r}, "
                f"gender={self.gender!r}, country_code={self.country_code!r}, "
                f"specialization={self.specialization!r})")

    def getName(self):
        """Returns the full name as "<firstName> <lastName>"."""
        return self.firstName + " " + self.lastName

def collectAuthorsOfOnePaper(df, pub_id, **kwargs):
    """
    Builds an Author object for every row of the DataFrame that belongs to
    the given paper.

    Args:
        df (pd.DataFrame): The main dataframe of publications. The columns
            read here are pub_id, first_name, last_name, aff_country,
            aff_country_code, aff_city, gender and specialization.
        pub_id (str): The unique identifier for the paper.
        **kwargs:
            refAuthor (Author, optional): An author to leave out of the
                result; rows whose Author compares equal to it are dropped.

    Returns:
        list: The Author objects found on the paper, in row order.
    """
    excluded = kwargs.get('refAuthor', None)
    rows_for_paper = df[df['pub_id'] == pub_id]
    # Lazily turn each matching row into an Author, then filter out the
    # excluded one (if any) while collecting.
    candidates = (
        Author(
            row["pub_id"], row["first_name"], row["last_name"],
            row["aff_country"], row["aff_country_code"], row["aff_city"],
            row["gender"], row["specialization"]
        )
        for _, row in rows_for_paper.iterrows()
    )
    return [candidate for candidate in candidates if candidate != excluded]

def searchAuthorPapers(df, author):
    """
    Builds the collaboration dictionary for one author: every paper they
    appear on, mapped to the other authors of that paper.

    The author's rows are located by first and last name only, while the
    exclusion of the author from each collaborator list is delegated to
    collectAuthorsOfOnePaper(..., refAuthor=author).

    Args:
        df (pd.DataFrame): The main dataframe of publications.
        author (Author): The reference author whose papers are searched.

    Returns:
        dict: Maps each publication ID (with the "pub." text removed) to the
            list of Author objects returned for that paper. Empty when no
            row matches the author's name.
    """
    # Find all rows in the dataframe corresponding to the reference author
    is_author_row = (df['first_name'] == author.firstName) & (df['last_name'] == author.lastName)
    author_rows = df[is_author_row]

    paperDict = {}
    # For each unique paper they wrote, find all their collaborators on it.
    for pub_id in author_rows['pub_id'].unique():
        clean_pub_id = pub_id.replace("pub.", "")
        paperDict[clean_pub_id] = collectAuthorsOfOnePaper(df, pub_id, refAuthor=author)
    return paperDict

def create_graph(dict_x, **kwargs):
    """
    Creates a NetworkX graph from a dictionary of collaborations.

    Args:
        dict_x (dict): A dictionary where keys are nodes (e.g., paper IDs) and
                       values are lists of nodes they connect to (e.g., authors).
                       It is never modified by this function.
        **kwargs:
            refAuthor (optional): A node to connect to every key of dict_x
                in addition to the listed neighbours.

    Returns:
        nx.Graph: A NetworkX graph object representing the collaboration network.
    """
    refAuthor = kwargs.get('refAuthor', None)
    if refAuthor is None:
        adjacency = dict_x
    else:
        # Build fresh neighbour lists instead of appending to the caller's
        # lists, so repeated calls do not accumulate duplicate entries in
        # the collaboration dictionary.
        adjacency = {key: list(neighbours) + [refAuthor]
                     for key, neighbours in dict_x.items()}
    return nx.from_dict_of_lists(adjacency)

def binaryCalculation(refAuthorFeature, collabFeature, baseFactor):
    """
    Scores a single yes/no feature match between two authors.

    Args:
        refAuthorFeature: The reference author's value for the feature.
        collabFeature: The collaborator's value for the same feature.
        baseFactor: The weight whose reciprocal is returned on a match.

    Returns:
        ``1 / baseFactor`` when both values are equal and ``baseFactor`` is
        non-zero, otherwise 0.
    """
    if refAuthorFeature != collabFeature or baseFactor == 0:
        return 0
    return 1 / baseFactor

def processCategoricalCalculation(collabCategory, baseFactor, countDict):
    """
    Adds a collaborator's weight to the running tally of their category.

    Args:
        collabCategory: The category key (e.g. a country code or a field).
        baseFactor: The weight to add for this collaborator.
        countDict (dict): Tally of weights per category; updated in place.

    Returns:
        None. The result is the in-place change to ``countDict``.
    """
    if collabCategory in countDict:
        # Category already seen: accumulate.
        countDict[collabCategory] += baseFactor
    else:
        # First occurrence: start the tally at this weight.
        countDict[collabCategory] = baseFactor

def isWeightedCalculation(authorCategory, categoricalWeights):
    """
    Tells whether a category appears in any group of the weights mapping.

    Args:
        authorCategory: The category value to look for.
        categoricalWeights (dict or None): Maps a group name to an iterable
            of categories (for a dict group, its keys are what is searched).

    Returns:
        bool: True when ``authorCategory`` is found in at least one group,
            False otherwise or when ``categoricalWeights`` is None.
    """
    if categoricalWeights is None:
        return False
    # Flatten every group's categories into one list and test membership.
    known_categories = []
    for group in categoricalWeights.values():
        known_categories.extend(group)
    return authorCategory in known_categories


def calculateCIndex(author, collabDict, collabGraph, **kwargs):
    """
    Calculates the 'Community Index' (C-Index) for a single author.

    For every paper three factors are computed and summed into a paper index:

    * Gender: ``len(collaborators) / genderFactor`` where genderFactor starts
      at ``baseGenderFactor`` and grows by ``1 / (baseGenderFactor * bonus)``
      for every collaborator with the SAME gender as the author, so
      same-gender collaborations lower the factor.
    * Nationality / Specialization: the number of distinct categories on the
      paper multiplied by (total weight of all collaborators) / (weight
      accumulated in the author's own category).

    The final index is the average paper index across all of the author's papers.

    The calculation can optionally weight "new" collaborators more heavily.
    A new collaborator is one whose degree in ``collabGraph`` is 1, i.e. who
    appears on only one paper of the reference author's network.

    Args:
        author (Author): The reference author for whom the index is calculated.
        collabDict (dict): The author's collaboration dictionary from searchAuthorPapers.
        collabGraph (nx.Graph): The author's collaboration network graph.
        **kwargs:
            crossPaper (bool): If True, apply a bonus for new collaborators.
            newBonus (float): Amount added to a new collaborator's weight
                multiplier (multiplier becomes ``1 + newBonus``). Default 0.8.
            baseGenderFactor (float): Base weight for gender diversity calculation.
            baseNationalityBonus (float): Base weight for nationality diversity.
            baseSpecializationFactor (float): Base weight for specialization diversity.
            categoricalWeights (dict): Optional weights, keyed by "nationality"
                and/or "specialization", each mapping a category to a
                multiplier applied when the AUTHOR belongs to that category.

    Returns:
        tuple: (rounded final index, unrounded final index, detailed report list).
            ``(0, 0.0, [])`` when ``collabDict`` contains no papers.
    """
    # --- Parameter Setup ---
    crossPaper = kwargs.get('crossPaper', False)
    newBonus = kwargs.get('newBonus', 0.8)
    # The 'isNew' parameter defines a "new" collaborator by their degree in the graph.
    # A degree of 1 means they only appear on one paper in this author's network.
    isNew = 1

    # Base factors for diversity calculations
    baseGenderFactor = kwargs.get('baseGenderFactor', 1)
    baseNationalityBonus = kwargs.get('baseNationalityBonus', 1)
    baseSpecializationFactor = kwargs.get('baseSpecializationFactor', 1)

    # Look each weight up in its own group. Checking membership across all
    # groups and then indexing one specific group raises KeyError whenever a
    # category is weighted in a different group or the group is absent.
    categoricalWeights = kwargs.get('categoricalWeights', None) or {}
    nationalityWeights = categoricalWeights.get("nationality", {})
    specializationWeights = categoricalWeights.get("specialization", {})

    paperFeatureIndices = []
    paper_details_for_reporting = []

    # --- Calculation Loop: Iterate through each paper the author has written ---
    for publication, collaborators in collabDict.items():
        # Gender: starts with the author's own contribution
        genderFactor = 1 * baseGenderFactor
        # Nationality/Specialization: tallies seeded with the author's own category
        nationalityCounts = {author.country_code: 1 * baseNationalityBonus}
        specializationCounts = {author.specialization: 1 * baseSpecializationFactor}

        # --- Loop through collaborators on this paper ---
        for collab in collaborators:
            bonus = 1
            # Weight the collaborator more heavily if they are "new" to the network
            if crossPaper and collabGraph.degree[collab] == isNew:
                bonus += newBonus

            # Gender: adds to the sum only when the collaborator has the SAME
            # gender as the author; the sum is inverted below, so this lowers
            # the final gender factor.
            genderFactor += binaryCalculation(author.gender, collab.gender, baseGenderFactor * bonus)
            # Nationality & Specialization: tally the weight for each category
            processCategoricalCalculation(collab.country_code,
                                          baseNationalityBonus * bonus,
                                          nationalityCounts)
            processCategoricalCalculation(collab.specialization,
                                          baseSpecializationFactor * bonus,
                                          specializationCounts)

        # --- Finalize Diversity Factors for THIS paper ---
        # The logic combines two ideas:
        # 1. Variety: the number of distinct categories on the paper.
        # 2. Balance: a weight that shrinks as more of the total weight sits
        #    in the author's own category.

        # Gender Factor: collaborator count divided by the same-gender sum.
        final_gender_factor = getReciprocal(genderFactor, len(collaborators))

        # Nationality Factor: subtracting the base removes the author's own
        # seed, leaving the total weight contributed by collaborators.
        nationality_denominator = sum(nationalityCounts.values()) - baseNationalityBonus
        nationality_weight = getReciprocal(nationalityCounts.get(author.country_code, 0),
                                           nationality_denominator)
        final_nationality_factor = len(nationalityCounts) * nationality_weight

        # Specialization Factor: same construction as nationality.
        specialization_denominator = sum(specializationCounts.values()) - baseSpecializationFactor
        specialization_weight = getReciprocal(specializationCounts.get(author.specialization, 0),
                                              specialization_denominator)
        final_specialization_factor = len(specializationCounts) * specialization_weight

        # Apply any specific external weights (e.g., for underrepresented countries)
        if author.country_code in nationalityWeights:
            final_nationality_factor *= nationalityWeights[author.country_code]
        if author.specialization in specializationWeights:
            final_specialization_factor *= specializationWeights[author.specialization]

        # The index for this paper is the sum of the three diversity factors
        paper_index = final_gender_factor + final_nationality_factor + final_specialization_factor
        paperFeatureIndices.append(paper_index)

        # Store detailed results for reporting
        paper_details_for_reporting.append({
            "pub_id": f"pub.{publication}",
            "Gender Factor": final_gender_factor,
            "Nationality Factor": final_nationality_factor,
            "Specialization Factor": final_specialization_factor,
            "Paper Index": paper_index
        })

    # --- Aggregate across all papers ---
    # An author without papers has nothing to average; report a zero index
    # instead of dividing by zero.
    if not paperFeatureIndices:
        return 0, 0.0, paper_details_for_reporting

    # The final C-Index is the average of the paper indices.
    unrounded_index = sum(paperFeatureIndices) / len(paperFeatureIndices)
    final_index = round(unrounded_index)
    return final_index, unrounded_index, paper_details_for_reporting


def find_best_example_authors(df):
    """
    Analyzes the dataframe to find authors with high and low collaborator repeat rates.

    Authors are identified by first and last name. Only authors with at
    least two papers and at least one collaborator are considered. The
    repeat rate is (total collaborator appearances) / (distinct
    collaborator names); 1.0 means every collaboration was with a new name.

    Args:
        df (pd.DataFrame): The main dataframe of publications.

    Returns:
        tuple: (name with the highest repeat rate, name with the lowest
            repeat rate), or (None, None) when no author qualifies.
    """
    unique_authors = df.drop_duplicates(subset=['first_name', 'last_name'])
    author_stats = []
    for _, author_row in unique_authors.iterrows():
        temp_author = Author(None, author_row.first_name, author_row.last_name, None, None, None, None, None)
        papers = searchAuthorPapers(df, temp_author)
        if len(papers) < 2:
            continue  # Focus on authors with multiple papers for better patterns

        # temp_author carries no gender/country/specialization, so it never
        # compares equal to the fully populated Author objects built from
        # the rows and is therefore not filtered out upstream. Drop the
        # author by name here so they are not counted as their own
        # (always repeating) collaborator.
        own_name = temp_author.getName()
        all_collaborators = []
        for collaborators in papers.values():
            for collab in collaborators:
                collab_name = collab.getName()
                if collab_name != own_name:
                    all_collaborators.append(collab_name)
        if not all_collaborators:
            continue

        num_unique_collaborators = len(set(all_collaborators))
        avg_repeat_rate = len(all_collaborators) / num_unique_collaborators
        author_stats.append({"name": own_name, "avg_repeat_rate": avg_repeat_rate})

    if not author_stats:
        return None, None
    sorted_stats = sorted(author_stats, key=lambda x: x['avg_repeat_rate'])
    high_newness_author_name = sorted_stats[0]['name']   # Low repeat rate -> high newness
    high_repeat_author_name = sorted_stats[-1]['name']   # High repeat rate
    return high_repeat_author_name, high_newness_author_name


def generate_comparison_report(author, df):
    """
    Generates and prints a formatted report for a given author, comparing
    C-Index with and without the cross-paper new author bonus.

    Args:
        author (Author): The author to report on.
        df (pd.DataFrame): The main dataframe of publications.

    Returns:
        None. The report is written to standard output.
    """
    print("-" * 95)
    print(f"C-Index Comparison Report for: {author.getName()} ({author.specialization})")
    print("-" * 95)

    papers_dict = searchAuthorPapers(df, author)
    if not papers_dict:
        # Nothing to average or tabulate for an author without papers.
        print("No papers found for this author.")
        print("-" * 95 + "\n")
        return

    # The graph must be created from the author list *without* the refAuthor for degree to be correct
    graph_for_degree_check = create_graph(papers_dict)

    # --- Run calculations ---
    # calculateCIndex returns (rounded index, unrounded index, details);
    # the unrounded value is not shown in this report.
    index_no_bonus, _, details_no_bonus = calculateCIndex(
        author, papers_dict, graph_for_degree_check, crossPaper=False)
    index_with_bonus, _, details_with_bonus = calculateCIndex(
        author, papers_dict, graph_for_degree_check, crossPaper=True)

    print(f"Overall C-Index (No Bonus): {index_no_bonus}")
    print(f"Overall C-Index (With New Author Bonus): {index_with_bonus}\n")
    print("Detailed Breakdown (Displaying values with bonus applied):")
    header = f"{'Pub ID':<17} | {'Gender':>12} | {'Nationality':>15} | {'Specialization':>16} | {'Paper Index':>15}"
    print(header)
    print("-" * len(header))
    no_bonus_indices = {d['pub_id']: d['Paper Index'] for d in details_no_bonus}

    for paper_detail in details_with_bonus:
        pub_id = paper_detail['pub_id']
        # Flag papers whose index moved by more than 0.01 once the bonus is on.
        note = "<- Bonus Applied" if abs(paper_detail['Paper Index'] - no_bonus_indices.get(pub_id, 0)) > 0.01 else ""
        row = (f"{pub_id:<17} | {paper_detail['Gender Factor']:>12.2f} | "
               f"{paper_detail['Nationality Factor']:>15.2f} | {paper_detail['Specialization Factor']:>16.2f} | "
               f"{paper_detail['Paper Index']:>15.2f} {note}")
        print(row)
    print("-" * 95 + "\n")


def generate_full_paper_report(pub_id, df):
    """
    Prints a C-Index comparison report for each author listed on one paper.

    Args:
        pub_id (str): The identifier of the paper whose authors are reported.
        df (pd.DataFrame): The main dataframe of publications.

    Returns:
        None. Output goes to standard output; a notice is printed instead
        when no author is found for the paper.
    """
    banner = '=' * 45
    print(f"\n\n{banner}")
    print(f" Full C-Index Report for All Authors on: {pub_id}")
    print(f"{banner}\n")

    paper_authors = collectAuthorsOfOnePaper(df, pub_id)
    if paper_authors:
        for paper_author in paper_authors:
            generate_comparison_report(paper_author, df)
    else:
        print(f"No authors found for {pub_id} or paper does not exist.")

In [15]:
def generate_table_for_author_cohort(author_cohort, df):
    """
    Builds one styled, author-centric table for a predefined list of
    authors, with one row per (author, paper) pair.

    Each row carries the per-paper factors computed WITH the new-collaborator
    bonus, plus the author's overall unrounded c-index both with the bonus
    (column 'c-index') and without it (column 'c-index ', trailing space).

    Args:
        author_cohort (list): The Author objects to include.
        df (pd.DataFrame): The main dataframe of publications.

    Returns:
        A pandas Styler wrapping the table sorted by name and publication
        id, or None when no row could be produced.
    """
    print(f"Building table for a cohort of {len(author_cohort)} authors...")

    table_rows = []
    for member in author_cohort:
        # Profile the author over their full publication history.
        collaborations = searchAuthorPapers(df, member)
        if not collaborations:
            continue  # Skip if author has no other papers found

        network = create_graph(collaborations)

        # The second value returned by calculateCIndex is the unrounded index.
        _, baseline_exact, _ = calculateCIndex(member, collaborations, network, crossPaper=False)
        _, bonus_exact, bonus_details = calculateCIndex(member, collaborations, network, crossPaper=True)

        # Per-author display values; any gender other than 'male' is shown as 'F'.
        gender_label = 'F' if member.gender != 'male' else 'M'
        field_label = member.specialization.replace("Science", "Sci").replace("Healthcare", "Health")

        # One row for EACH paper this author has written.
        table_rows.extend(
            {
                'Pub id': detail['pub_id'].replace("pub.", ""),
                'Name': member.firstName,
                'Country': member.country,
                'Gender': gender_label,
                'Field': field_label,
                'c-index (Bonus)': bonus_exact,
                'Country factor': detail['Nationality Factor'],
                'Gender factor': detail['Gender Factor'],
                'Field factor': detail['Specialization Factor'],
                'Paper factor': detail['Paper Index'],
                'c-index (Baseline)': baseline_exact,
            }
            for detail in bonus_details
        )

    if not table_rows:
        print("No data to generate.")
        return None

    # Assemble the DataFrame, shorten the two index headers and order the rows.
    table = pd.DataFrame(table_rows)
    table = table.rename(columns={'c-index (Bonus)': 'c-index', 'c-index (Baseline)': 'c-index '})
    table = table.sort_values(by=['Name', 'Pub id'])

    # Every numeric column is displayed with two decimals, left-aligned.
    two_decimals = "{:.2f}"
    numeric_columns = ['c-index', 'c-index ', 'Country factor',
                       'Gender factor', 'Field factor', 'Paper factor']
    styled = table.style.format({column: two_decimals for column in numeric_columns})
    styled = styled.set_properties(**{'text-align': 'left'})
    styled = styled.set_table_styles([dict(selector='th', props=[('text-align', 'left')])])
    return styled

In [16]:
# MAIN EXECUTION BLOCK FOR JUPYTER NOTEBOOK
# Load the data
df = pd.read_csv('sampleTableV3.csv')

# --- PART 1: Define our author cohort by selecting them from a single, representative paper ---
# Every author listed on this paper becomes a member of the cohort.
pub_id_for_cohort = "pub.1123345821"
author_cohort = collectAuthorsOfOnePaper(df, pub_id_for_cohort)

author_names = [author.getName() for author in author_cohort]
print("--- Building a focused table based on a specific author cohort ---")
print(f"Selected {len(author_cohort)} authors from paper '{pub_id_for_cohort}': {author_names}\n")

# generate_table_for_author_cohort returns None when there is nothing to
# show, so test for None explicitly rather than relying on the truthiness
# of the returned Styler object.
final_table = generate_table_for_author_cohort(author_cohort, df)
if final_table is not None:
    # `display` is supplied by the IPython/Jupyter runtime.
    display(final_table)

--- Building a focused table based on a specific author cohort ---
Selected 6 authors from paper 'pub.1123345821': ['Adam Smith', 'Emily Johnson', 'Robert Brown', 'Maria Garcia', 'David Williams', 'Sophia Davis']

Building table for a cohort of 6 authors...


Unnamed: 0,Pub id,Name,Country,Gender,Field,c-index,Country factor,Gender factor,Field factor,Paper factor,c-index.1
2,540609372,Adam,Italy,M,Health,29.6,12.0,1.5,12.0,25.5,27.56
0,1123345821,Adam,Italy,M,Health,29.6,26.4,1.96,9.43,37.79,27.56
1,1319112586,Adam,Italy,M,Health,29.6,12.0,1.5,12.0,25.5,27.56
13,1123345821,David,United States,M,Health,23.05,7.83,2.37,12.86,23.05,18.33
5,540609372,Emily,Cuba,F,Computer Sci,29.6,12.0,1.5,12.0,25.5,27.56
3,1123345821,Emily,Cuba,F,Computer Sci,29.6,26.4,1.96,9.43,37.79,27.56
4,1319112586,Emily,Cuba,F,Computer Sci,29.6,12.0,1.5,12.0,25.5,27.56
12,540609372,Maria,Mexico,F,Social Sci,33.49,12.0,1.5,12.0,25.5,27.5
9,1123345821,Maria,Mexico,F,Social Sci,33.49,26.4,1.96,26.4,54.76,27.5
11,1319112586,Maria,Mexico,F,Social Sci,33.49,12.0,1.5,12.0,25.5,27.5
