In [1]:
import pandas as pd
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# === 1. Load the data ===

# Adjust the path if needed (e.g. "data/lump.xlsx")
FILE_PATH = "lump.xlsx"

In [2]:
df = pd.read_excel(FILE_PATH)

# Normalize column names a bit (strip spaces)
df.columns = [c.strip() for c in df.columns]

# Identify columns
reviewer_col = "Reviewers verdict"

# All LLM columns are the verdict columns except the reviewer one and Paper ID
llm_cols = [
    c for c in df.columns
    if c not in ("Paper ID", reviewer_col)
]

print("LLM columns detected:", llm_cols)

# Make everything lowercase strings for safety
df[reviewer_col] = df[reviewer_col].astype(str).str.strip().str.lower()
for c in llm_cols:
    df[c] = df[c].astype(str).str.strip().str.lower()


# === 2. Per-model agreement with reviewers ===

print("\n=== Per-model agreement with reviewers ===")
for model_col in llm_cols:
    agrees = (df[model_col] == df[reviewer_col])
    accuracy = agrees.mean()
    print(f"{model_col}: {accuracy:.3f} agreement with reviewers "
          f"({agrees.sum()}/{len(df)})")


# === 3. Majority consensus among LLMs and agreement with reviewers ===

def compute_majority_vote(row, model_columns):
    """
    Given a row and list of LLM columns, return:
      - majority_label (str) if a strict majority exists
      - None otherwise
    """
    votes = [row[c] for c in model_columns]
    # Remove empty/NA-like strings if any
    votes = [v for v in votes if isinstance(v, str) and v != ""]
    if not votes:
        return None

    counts = Counter(votes)
    label, count = counts.most_common(1)[0]
    if count > len(votes) / 2.0:
        return label  # strict majority
    return None      # no strict majority


# Compute majority vote for each row
df["LLM_majority"] = df.apply(
    lambda row: compute_majority_vote(row, llm_cols),
    axis=1
)

df["Has_majority"] = df["LLM_majority"].notna()

# Compare majority vote to reviewers’ verdict
df["Majority_agrees_with_reviewer"] = (
    df["Has_majority"] &
    (df["LLM_majority"] == df[reviewer_col])
)

# === 4. Summary statistics ===

total_papers = len(df)
with_majority = df["Has_majority"].sum()

print("\n=== Majority consensus vs reviewers ===")
print(f"Total papers: {total_papers}")
print(f"Papers with an LLM majority decision: {with_majority} "
      f"({with_majority / total_papers:.3f})")

if with_majority > 0:
    agrees_count = df["Majority_agrees_with_reviewer"].sum()
    print(
        f"Among papers with an LLM majority:\n"
        f"  Majority agrees with reviewers on {agrees_count}/{with_majority} "
        f"({agrees_count / with_majority:.3f})"
    )
else:
    print("No rows had a strict majority among LLMs.")

# Optional: inspect a few rows where majority disagrees
disagreements = df[
    df["Has_majority"] &
    (df["LLM_majority"] != df[reviewer_col])
]

print("\nExample rows where LLM majority disagrees with reviewers:")
print(disagreements.head())


LLM columns detected: ['ChatGPT verdict', 'Claude verdict', 'DeepSeek verdict', 'Gemini verdict', 'Mistral verdict']

=== Per-model agreement with reviewers ===
ChatGPT verdict: 0.868 agreement with reviewers (434/500)
Claude verdict: 0.868 agreement with reviewers (434/500)
DeepSeek verdict: 0.954 agreement with reviewers (477/500)
Gemini verdict: 0.844 agreement with reviewers (422/500)
Mistral verdict: 0.706 agreement with reviewers (353/500)

=== Majority consensus vs reviewers ===
Total papers: 500
Papers with an LLM majority decision: 500 (1.000)
Among papers with an LLM majority:
  Majority agrees with reviewers on 433/500 (0.866)

Example rows where LLM majority disagrees with reviewers:
   Paper ID Reviewers verdict ChatGPT verdict Claude verdict DeepSeek verdict  \
1         7           exclude         exclude        include          exclude   
2        13           include         exclude        exclude          exclude   
4        22           exclude         include       

In [2]:
df = pd.read_excel(FILE_PATH)

df.columns = [c.strip() for c in df.columns]

reviewer_col = "Reviewers verdict"
llm_cols = [c for c in df.columns if c not in ("Paper ID", reviewer_col)]

# Normalize
df[reviewer_col] = df[reviewer_col].str.lower().str.strip()
for c in llm_cols:
    df[c] = df[c].str.lower().str.strip()




WEIGHTS = {
    "DeepSeek verdict": {"include": 1.0, "exclude": 3.0},
    "Mistral verdict":  {"include": 3.0, "exclude": 1.0},

    # Balanced models → weight both classes equally
    "ChatGPT verdict":  {"include": 1.0, "exclude": 1.0},
    "Claude verdict":   {"include": 1.0, "exclude": 1.0},
    "Gemini verdict":   {"include": 1.0, "exclude": 1.0},
}


# === 3. Weighted voting function ===
def weighted_vote(row, llm_columns, weight_table):
    include_score = 0.0
    exclude_score = 0.0

    for model in llm_columns:
        verdict = row[model]
        if verdict == "include":
            include_score += weight_table[model]["include"]
        elif verdict == "exclude":
            exclude_score += weight_table[model]["exclude"]

    if include_score > exclude_score:
        return "include"
    elif exclude_score > include_score:
        return "exclude"
    else:
        return None


# === 4. Compute weighted consensus ===
df["Weighted_majority"] = df.apply(
    lambda r: weighted_vote(r, llm_cols, WEIGHTS),
    axis=1
)

df["Weighted_agrees_with_reviewer"] = (
    df["Weighted_majority"] == df[reviewer_col]
)


# === 5. Summary ===
total = len(df)
agree = df["Weighted_agrees_with_reviewer"].sum()

print("\n=== Weighted Majority Accuracy ===")
print(f"Weighted majority agrees with reviewers on {agree}/{total} "
      f"({agree/total:.3f})")

print("\n=== Rows where weighted majority disagrees with reviewers ===")
print(df[df["Weighted_majority"] != df[reviewer_col]].head())


=== Weighted Majority Accuracy ===
Weighted majority agrees with reviewers on 433/500 (0.866)

=== Rows where weighted majority disagrees with reviewers ===
   Paper ID Reviewers verdict ChatGPT verdict Claude verdict DeepSeek verdict  \
1         7           exclude         exclude        include          exclude   
2        13           include         exclude        exclude          exclude   
4        22           exclude         include        include          include   
5        34           exclude         include        exclude          exclude   
7        45           include         include        exclude          exclude   

  Gemini verdict Mistral verdict Weighted_majority  \
1        include         include           include   
2        exclude         exclude           exclude   
4        include         include           include   
5        include         include           include   
7        exclude         include           exclude   

   Weighted_agrees_with_review

In [3]:

# === 1. Load data ===
df = pd.read_excel(FILE_PATH)
df.columns = [c.strip() for c in df.columns]

reviewer_col = "Reviewers verdict"
llm_cols = [c for c in df.columns if c not in ("Paper ID", reviewer_col)]

# Clean values
def clean(v):
    if pd.isna(v): return ""
    return str(v).strip().lower()

df[reviewer_col] = df[reviewer_col].apply(clean)
for c in llm_cols:
    df[c] = df[c].apply(clean)

# Convert include/exclude to 1/0
mapping = {"include": 1, "exclude": 0}
y = df[reviewer_col].map(mapping)

X = df[llm_cols].applymap(lambda v: mapping.get(v, 0))


# === 2. Fit logistic regression to learn optimal weights ===
model = LogisticRegression()
model.fit(X, y)

# Extract learned optimal weights
weights = dict(zip(llm_cols, model.coef_[0]))
bias = model.intercept_[0]

print("\n=== Optimal Weights Learned from Data ===")
for col, w in weights.items():
    print(f"{col:20s}  weight = {w:.3f}")

print(f"Bias term: {bias:.3f}")


# === 3. Predict using optimized ensemble ===
y_pred = model.predict(X)
accuracy = accuracy_score(y, y_pred)

print("\n=== Accuracy of Optimal Weighted Ensemble ===")
print(f"Accuracy: {accuracy:.3f}  ({sum(y_pred==y)}/{len(y)})")


# Optional: Show where the model disagrees with reviewers
disagreements = df[y_pred != y]
print("\nRows where optimized ensemble disagrees with reviewers:")
print(disagreements.head())



=== Optimal Weights Learned from Data ===
ChatGPT verdict       weight = 0.823
Claude verdict        weight = 0.760
DeepSeek verdict      weight = 0.478
Gemini verdict        weight = 0.026
Mistral verdict       weight = 1.045
Bias term: -4.459

=== Accuracy of Optimal Weighted Ensemble ===
Accuracy: 0.968  (484/500)

Rows where optimized ensemble disagrees with reviewers:
    Paper ID Reviewers verdict ChatGPT verdict Claude verdict  \
2         13           include         exclude        exclude   
3         15           include         include        include   
7         45           include         include        exclude   
57       237           include         include        include   
69       279           include         exclude        exclude   

   DeepSeek verdict Gemini verdict Mistral verdict  
2           exclude        exclude         exclude  
3           include        include         include  
7           exclude        exclude         include  
57          exclude 

  X = df[llm_cols].applymap(lambda v: mapping.get(v, 0))
