In [1]:
import sys
# Install the packages listed in requirements2.txt by invoking pip through the
# interpreter that runs this kernel (sys.executable).
# NOTE(review): the next cell runs this exact command again before its other
# setup steps, so this cell looks redundant — confirm before removing.
!{sys.executable} -m pip install -r requirements2.txt




In [2]:
import sys
import subprocess

# --- This is the only cell you need to run for installation ---

# Step 1: Install all dependencies from the final, corrected requirements file
# This will take a few minutes.
print("Starting installation... this may take a moment.")
!{sys.executable} -m pip install -r requirements2.txt

# Step 2: Download the required spaCy model
try:
    import spacy
    spacy.load('en_core_web_sm')
    print("\nspaCy model 'en_core_web_sm' is already installed.")
except OSError:
    print("\nDownloading spaCy model 'en_core_web_sm'...")
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)

print("\n✅ Installation and setup complete. You can now run the rest of your notebook.")

Starting installation... this may take a moment.


  from .autonotebook import tqdm as notebook_tqdm



spaCy model 'en_core_web_sm' is already installed.

✅ Installation and setup complete. You can now run the rest of your notebook.


In [3]:
import nltk
import spacy
import benepar
import stanza

# Fetch the NLTK corpora used by the notebook, with download progress shown.
for corpus_name in ('wordnet', 'verbnet'):
    nltk.download(corpus_name, quiet=False)

# Load the spaCy pipeline once (the returned object is discarded here), then
# fetch the benepar parser model and the English stanza resources.
spacy.load('en_core_web_sm')
benepar.download('benepar_en3')
stanza.download('en', logging_level='INFO')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nassim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package verbnet to
[nltk_data]     C:\Users\Nassim\AppData\Roaming\nltk_data...
[nltk_data]   Package verbnet is already up-to-date!
[nltk_data] Downloading package benepar_en3 to
[nltk_data]     C:\Users\Nassim\AppData\Roaming\nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json: 154kB [00:00, 66.3MB/s]                    
2025-11-26 09:49:18 INFO: Downloading default packages for language: en (English)...
2025-11-26 09:49:19 INFO: File exists: C:\Users\Nassim\stanza_resources\en\default.zip
2025-11-26 09:49:20 INFO: Finished downloading models and saved to C:\Users\Nassim\stanza_resources.


In [4]:
import pandas as pd
import spacy
import benepar
from tqdm import tqdm
import utils

# Enable the `progress_apply` method that a later cell calls on a pandas Series.
tqdm.pandas()

# Pseudocode Step 1: Load spaCy language model
SPACY_MODEL_NAME = 'en_core_web_sm'
BENEPAR_PIPE_CONFIG = {'model': 'benepar_en3'}

nlp = spacy.load(SPACY_MODEL_NAME)
# Kept as the last expression so the cell still displays the added component.
nlp.add_pipe('benepar', config=BENEPAR_PIPE_CONFIG)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nassim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package benepar_en3 to
[nltk_data]     C:\Users\Nassim\AppData\Roaming\nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json: 154kB [00:00, 36.6MB/s]                    
2025-11-26 09:49:21 INFO: Downloading default packages for language: en (English)...
2025-11-26 09:49:22 INFO: File exists: C:\Users\Nassim\stanza_resources\en\default.zip
2025-11-26 09:49:23 INFO: Finished downloading models and saved to C:\Users\Nassim\stanza_resources.
[nltk_data] Downloading package verbnet to
[nltk_data]     C:\Users\Nassim\AppData\Roaming\nltk_data...
[nltk_data]   Package verbnet is already up-to-date!


<benepar.integrations.spacy_plugin.BeneparComponent at 0x1de008262e0>

In [5]:
# Load the requirements dataset from the working directory.
df = pd.read_csv('Anaphoric.csv')

# Identify the correct column for requirements. If the expected header is
# missing, fall back to the first column and report the header that was
# actually searched for (the previous message named a different column).
expected_column = 'Raw Requirements'
requirements_column = expected_column
if requirements_column not in df.columns:
    requirements_column = df.columns[0]
    print(f"Column '{expected_column}' not found. Using the first column: '{requirements_column}'")

print(f"Dataset loaded. Initial shape: {df.shape}")

# Drop rows where the requirement text is missing (NaN)
df.dropna(subset=[requirements_column], inplace=True)

# Ensure all data in the column is of string type to prevent errors
df[requirements_column] = df[requirements_column].astype(str)

print(f"Cleaned dataset. Shape after removing empty rows: {df.shape}")
display(df.head())

# Pseudocode Step 2: Apply NLP to Data
# Each requirement text is passed through utils.applynlp together with the
# pipeline; the result is stored per row in 'Context_doc'.
df['Context_doc'] = df[requirements_column].progress_apply(lambda text: utils.applynlp(text, nlp))


Dataset loaded. Initial shape: (159, 10)
Cleaned dataset. Shape after removing empty rows: (159, 10)


Unnamed: 0,Domain,Raw Requirements,Manual Evaluation,Automated,ChatGPT,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 10
0,Game,The Game must be programmed using the C# langu...,UA,A,A,,,,,
1,Game,The User shall be able to the select username ...,UA,UA,UA,,,,,
2,Game,Username and Password must be available to ent...,A,UA,UA,,,,,
3,Game,The website shall enable the players to select...,A,UA,UA,,,,,
4,Game,"Each team players will have Names, Contacts nu...",UA,UA,UA,,,,,


100%|██████████| 159/159 [00:05<00:00, 28.94it/s]


In [6]:
# Pseudocode Step 3: Define Pronouns
pronouns_list = [
    "i", "me", "my", "mine", "myself",
    "you", "your", "yours", "yourself",
    "he", "him", "his", "himself",
    "she", "her", "hers", "herself",
    "it", "its", "itself",
    "we", "us", "our", "ours", "ourselves",
    "they", "them", "their", "theirs", "themselves",
]

# Pseudocode Step 4: Initialize Data Structures
processed_triples = []
ids_used = set()  # Set membership keeps the uniqueness check cheap


def _claim_pronoun_id(base_id, taken):
    """Return `base_id`, or `base_id-<k>` for the smallest k >= 1 not yet in `taken`.

    The returned ID is added to `taken` before returning.
    """
    claimed = base_id
    suffix = 0
    while claimed in taken:
        suffix += 1
        claimed = f"{base_id}-{suffix}"
    taken.add(claimed)
    return claimed


print("Generating (Pronoun, Candidate Antecedent) pairs...")

# Pseudocode Step 6: For each unique context, find pronouns and antecedents
row_iterator = tqdm(df.iterrows(), total=df.shape[0], desc="Generating Triples")
for row_index, row in row_iterator:
    context_doc = row['Context_doc']

    # Only rows holding a parsed spaCy Doc can be searched for pronouns.
    if isinstance(context_doc, spacy.tokens.Doc):
        found_pronouns = utils.findPronouns(context_doc, pronouns_list)

        for occurrence, pronoun_token in enumerate(found_pronouns):
            # ID layout: <row index>-<lower-cased pronoun>-<occurrence within the row>
            pronoun_id = _claim_pronoun_id(
                f"{row_index}-{pronoun_token.text.lower()}-{occurrence}",
                ids_used,
            )

            # One output record per candidate noun phrase returned for this pronoun.
            processed_triples.extend(
                [
                    pronoun_id,
                    context_doc,
                    pronoun_token,
                    pronoun_token.i,
                    noun_phrase,
                    row['Manual Evaluation'],
                ]
                for noun_phrase in utils.getNPs(context_doc, pronoun_token)
            )

# Pseudocode Step 7: Create DataFrame from Results
triple_columns = [
    "Id", "Context", "Pronoun", "Position",
    "Candidate_Antecedent", "Manual Evaluation",
]
triples_df = pd.DataFrame(processed_triples, columns=triple_columns)

print(f"Generated {len(triples_df)} pronoun-antecedent pairs.")
display(triples_df.head())

Generating (Pronoun, Candidate Antecedent) pairs...


Generating Triples: 100%|██████████| 159/159 [00:00<00:00, 12963.50it/s]

Generated 461 pronoun-antecedent pairs.





Unnamed: 0,Id,Context,Pronoun,Position,Candidate_Antecedent,Manual Evaluation
0,0-its-0,"(The, Game, must, be, programmed, using, the, ...",its,11,"(The, Game)",UA
1,0-its-0,"(The, Game, must, be, programmed, using, the, ...",its,11,"(the, C, #, language)",UA
2,0-its-0,"(The, Game, must, be, programmed, using, the, ...",its,11,"(the, C, #, language, and, its, libraries)",UA
3,1-their-0,"(The, User, shall, be, able, to, the, select, ...",their,12,"(The, User)",UA
4,2-their-0,"(Username, and, Password, must, be, available,...",their,16,(Username),A


In [7]:
# Make a copy of the original triples DataFrame to avoid modifying it directly
final_df = triples_df.copy()

# Maps each pronoun ID to its context text with the pronoun replaced by a
# "<pronoun>#<n>" marker, where <n> is the last '-'-separated field of the ID.
hashed_context_map = {}

# Group the DataFrame by pronoun ID to process each pronoun instance separately
for pronoun_id, group in final_df.groupby('Id'):
    # Take the first row of the group (all rows share the same context and pronoun)
    first_row = group.iloc[0]
    pronoun_token = first_row['Pronoun']
    context_doc = first_row['Context']

    # Create a hashed version of the pronoun using its text and index
    hashed_pronoun = f"{pronoun_token.text}#{pronoun_id.split('-')[-1]}"

    # Replace the original pronoun in the context with the hashed version.
    # The slice before the pronoun carries its own trailing whitespace, but the
    # pronoun's trailing whitespace belongs to the pronoun token itself, so it
    # must be re-added explicitly; otherwise the marker is glued to the next
    # token (e.g. "its#0libraries").
    hashed_context_map[pronoun_id] = (
        context_doc[:pronoun_token.i].text_with_ws +
        hashed_pronoun +
        pronoun_token.whitespace_ +
        context_doc[pronoun_token.i + 1:].text_with_ws
    )

# Add the new hashed context to the DataFrame
final_df['Hashed_Context'] = final_df['Id'].map(hashed_context_map)

# Convert spaCy token and span objects to plain text for serialization
final_df['Pronoun'] = final_df['Pronoun'].apply(lambda token: token.text)
final_df['Candidate_Antecedent'] = final_df['Candidate_Antecedent'].apply(lambda span: span.text)

# Reorder and select the final columns for SpanBERT input. Selecting this list
# also drops the original 'Context' column; .copy() makes the result an
# independent frame so later column assignments do not act on a slice.
final_df = final_df[[
    'Id', 'Hashed_Context', 'Pronoun', 'Position',
    'Candidate_Antecedent', 'Manual Evaluation'
]].copy()

# Save file
output_filename = 'anaphoric_ambiguity_spanbert_input.csv'
final_df.to_csv(output_filename, index=False)

display(final_df.head())

Unnamed: 0,Id,Hashed_Context,Pronoun,Position,Candidate_Antecedent,Manual Evaluation
0,0-its-0,The Game must be programmed using the C# langu...,its,11,The Game,UA
1,0-its-0,The Game must be programmed using the C# langu...,its,11,the C# language,UA
2,0-its-0,The Game must be programmed using the C# langu...,its,11,the C# language and its libraries,UA
3,1-their-0,The User shall be able to the select username ...,their,12,The User,UA
4,2-their-0,Username and Password must be available to ent...,their,16,Username,A


In [11]:
import pandas as pd
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging
from tqdm import tqdm

# Archive from which the SRL predictor is loaded.
SRL_MODEL_ARCHIVE = "https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz"

# Initialize SRL predictor (module-level: the feature functions below use it)
print("Initializing SRL predictor...")
srl_predictor = Predictor.from_path(SRL_MODEL_ARCHIVE)
print("SRL predictor initialized successfully.")

# SRL Feature Extraction Functions
def get_span_role(tags, span):
    """Return the role label carried by the tags inside an inclusive token span.

    Args:
        tags: Sequence of tag strings such as 'B-ARG0', 'I-ARG0', 'B-V' or 'O'.
        span: (start, end) token indices, both inclusive.

    Returns:
        The label after the first '-' of the first 'B-' tag in the span; if the
        span has no 'B-' tag, the label of its first 'I-' tag; otherwise 'O'.
        'O' is also returned when start is negative or either index is at or
        beyond len(tags).
    """
    first, last = span
    tag_count = len(tags)

    inside = 0 <= first < tag_count and last < tag_count
    if not inside:
        return 'O'

    window = tags[first:last + 1]

    # Begin-tags take precedence over inside-tags, regardless of position.
    for marker in ('B-', 'I-'):
        label = next(
            (tag.split('-', 1)[1] for tag in window if tag.startswith(marker)),
            None,
        )
        if label is not None:
            return label

    return 'O'

def _find_verb_index(tags, tokens, verb):
    """Find the index of the verb in the token list."""
    for i, tag in enumerate(tags):
        if tag.startswith('B-V'):
            return i
    
    try:
        return [tok.lower() for tok in tokens].index(verb.lower())
    except ValueError:
        return None

def extract_srl_features(sentence, pronoun_span, antecedent_span):
    """Compute one set of SRL features for a pronoun / candidate-antecedent pair.

    Runs the module-level `srl_predictor` on `sentence` and inspects its
    predicate frames in two passes: first looking for a frame in which both
    spans carry a role, then, failing that, taking the first role found for
    each span independently.

    Args:
        sentence: Sentence text passed to the predictor.
        pronoun_span: (start, end) inclusive token indices of the pronoun.
        antecedent_span: (start, end) inclusive token indices of the candidate.

    Returns:
        Tuple (predicate, antecedent_role, pronoun_role, srl_match,
        predicate_distance). (None, 'O', 'O', 0, -1) is returned when the
        prediction has no tokens, when either span starts past the last token,
        or when any exception occurs (the error is printed, not raised).
    """
    no_features = (None, 'O', 'O', 0, -1)
    core_roles = {'ARG0', 'ARG1', 'ARG2', 'ARG3', 'ARG4', 'ARG5'}

    try:
        prediction = srl_predictor.predict(sentence=sentence)
        words = prediction.get('words', [])

        if not words:
            return no_features
        last_position = len(words) - 1
        if pronoun_span[0] > last_position or antecedent_span[0] > last_position:
            return no_features

        frames = prediction.get('verbs', [])

        # --- PASS 1: a frame in which BOTH spans have a role wins outright ---
        for frame in frames:
            frame_tags = frame['tags']
            p_role = get_span_role(frame_tags, pronoun_span)
            a_role = get_span_role(frame_tags, antecedent_span)
            if p_role == 'O' or a_role == 'O':
                continue

            roles_agree = (p_role == a_role) or (p_role in core_roles and a_role in core_roles)
            # Shared frame, so the distance between predicates is 0.
            return (frame['verb'], a_role, p_role, int(roles_agree), 0)

        # --- PASS 2: no shared frame; take the first role found for each span ---
        p_role, p_verb, p_verb_idx = 'O', None, None
        a_role, a_verb, a_verb_idx = 'O', None, None

        for frame in frames:
            frame_verb, frame_tags = frame['verb'], frame['tags']
            frame_verb_idx = _find_verb_index(frame_tags, words, frame_verb)

            if p_role == 'O':
                found = get_span_role(frame_tags, pronoun_span)
                if found != 'O':
                    p_role, p_verb, p_verb_idx = found, frame_verb, frame_verb_idx

            if a_role == 'O':
                found = get_span_role(frame_tags, antecedent_span)
                if found != 'O':
                    a_role, a_verb, a_verb_idx = found, frame_verb, frame_verb_idx

        # Predicates are compared by their surface text.
        shares_predicate = p_verb is not None and p_verb == a_verb
        roles_agree = (p_role != 'O' and p_role == a_role) or \
                      (p_role in core_roles and a_role in core_roles)
        srl_match = int(shares_predicate or roles_agree)

        # -1 marks "distance unavailable" when either predicate position is unknown.
        if p_verb_idx is None or a_verb_idx is None:
            predicate_distance = -1
        else:
            predicate_distance = abs(p_verb_idx - a_verb_idx)

        # Prefer the pronoun's predicate; fall back to the antecedent's.
        return (p_verb or a_verb, a_role, p_role, srl_match, predicate_distance)

    except Exception as e:
        print(f"Error processing sentence: '{sentence[:50]}...' | Error: {e}")
        return no_features

def extract_all_srl_features(df):
    """Extract SRL features for every row of `df`, one output row per input row.

    Args:
        df: DataFrame with columns 'Id', 'Context', 'Pronoun' and
            'Candidate_Antecedent'. The code reads `.text` from the context,
            `.i` from the pronoun and `.start` / `.end` / `.text` from the
            candidate, i.e. it expects spaCy-style Doc / Token / Span objects.

    Returns:
        DataFrame, in input row order, with columns 'Id',
        'Candidate_Antecedent_Text', 'Predicate', 'Antecedent_Role',
        'Pronoun_Role', 'SRL_Match' and 'Predicate_Distance'. A row that fails
        is reported on stdout and kept with Predicate='ERROR' and neutral
        defaults ('O', 'O', 0, -1), so every column exists even if all rows fail.
    """
    feature_keys = ('Predicate', 'Antecedent_Role', 'Pronoun_Role', 'SRL_Match', 'Predicate_Distance')
    srl_features = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting SRL Features"):
        candidate = row['Candidate_Antecedent']

        # Start from a complete failure record. The candidate text is read
        # defensively so that a malformed candidate (the likely cause of a
        # failure below) cannot make the error path itself raise.
        record = {
            'Id': row['Id'],
            'Candidate_Antecedent_Text': getattr(candidate, 'text', str(candidate)),
            'Predicate': 'ERROR',
            'Antecedent_Role': 'O',
            'Pronoun_Role': 'O',
            'SRL_Match': 0,
            'Predicate_Distance': -1,
        }

        try:
            sentence = row['Context'].text
            pronoun_span = (row['Pronoun'].i, row['Pronoun'].i)
            # Span.end is exclusive; the feature extractor takes inclusive ends.
            antecedent_span = (candidate.start, candidate.end - 1)

            # Always a single 5-tuple, ordered like `feature_keys`.
            features = extract_srl_features(sentence, pronoun_span, antecedent_span)
            record.update(zip(feature_keys, features))

        except Exception as e:
            print(f"Critical error at row {idx}: {e}")

        srl_features.append(record)

    return pd.DataFrame(srl_features)

# ============================================================
# MAIN EXECUTION
# ============================================================

print("Extracting SRL features from triples_df (with spaCy objects)...")
srl_features_df = extract_all_srl_features(triples_df)
print("SRL features extracted successfully.")

# Align the SRL features with final_df by row position rather than merging on
# (Id, candidate text): when the same noun-phrase text occurs twice for one
# pronoun, that key is not unique and the merge would duplicate rows.
# extract_all_srl_features emits exactly one record per triples_df row, in
# order, and final_df was derived from triples_df without reordering.
if len(srl_features_df) != len(final_df):
    raise ValueError(
        f"SRL feature rows ({len(srl_features_df)}) do not match final_df rows ({len(final_df)})."
    )
if 'Id' in srl_features_df.columns and not (
    srl_features_df['Id'].to_numpy() == final_df['Id'].to_numpy()
).all():
    raise ValueError("SRL feature rows are not in the same order as final_df (Id mismatch).")

srl_feature_columns = ['Predicate', 'Antecedent_Role', 'Pronoun_Role', 'SRL_Match', 'Predicate_Distance']
# reindex creates any feature column missing from the extraction result (as NaN),
# so the fill step below always has every column to work on.
srl_values = srl_features_df.reindex(columns=srl_feature_columns)
srl_values.index = final_df.index

# errors='ignore': the helper column only exists if an older version of this
# cell already ran in the same kernel.
final_df_with_srl = pd.concat(
    [final_df.drop(columns=['Candidate_Antecedent_Text'], errors='ignore'), srl_values],
    axis=1,
)

# Fill any potential missing values
final_df_with_srl = final_df_with_srl.fillna({
    'Predicate': 'None', 'Antecedent_Role': 'O', 'Pronoun_Role': 'O',
    'SRL_Match': 0, 'Predicate_Distance': -1
})

# Convert numeric columns to proper types
final_df_with_srl['SRL_Match'] = final_df_with_srl['SRL_Match'].astype(int)
final_df_with_srl['Predicate_Distance'] = final_df_with_srl['Predicate_Distance'].astype(int)

print(f"Final dataset shape: {final_df_with_srl.shape}")
print("\nFirst few rows:")
display(final_df_with_srl.head(10))

# Show summary statistics
print("\nSRL Feature Statistics:")
print(f"Unique predicates: {final_df_with_srl['Predicate'].nunique()}")
print(f"SRL_Match distribution:\n{final_df_with_srl['SRL_Match'].value_counts(dropna=False)}")
print(f"Antecedent roles:\n{final_df_with_srl['Antecedent_Role'].value_counts(dropna=False).head(10)}")
print(f"Pronoun roles:\n{final_df_with_srl['Pronoun_Role'].value_counts(dropna=False).head(10)}")

# Save the final, correct dataset
output_filename = 'anaphoric_ambiguity_srl_final.csv'
final_df_with_srl.to_csv(output_filename, index=False)
print(f"\n✓ Saved final correct data to {output_filename}")

Initializing SRL predictor...


error loading _jsonnet (this is expected on Windows), treating C:\Users\Nassim\AppData\Local\Temp\tmpkn52b4de\config.json as plain json
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SRL predictor initialized successfully.
Extracting SRL features from triples_df (with spaCy objects)...


Extracting SRL Features: 100%|██████████| 461/461 [01:02<00:00,  7.39it/s]

SRL features extracted successfully.
Final dataset shape: (461, 11)

First few rows:





Unnamed: 0,Id,Hashed_Context,Pronoun,Position,Candidate_Antecedent,Manual Evaluation,Predicate,Antecedent_Role,Pronoun_Role,SRL_Match,Predicate_Distance
0,0-its-0,The Game must be programmed using the C# langu...,its,11,The Game,UA,programmed,ARG1,ARGM-MNR,0,0
1,0-its-0,The Game must be programmed using the C# langu...,its,11,the C# language,UA,programmed,ARGM-MNR,ARGM-MNR,1,0
2,0-its-0,The Game must be programmed using the C# langu...,its,11,the C# language and its libraries,UA,programmed,ARGM-MNR,ARGM-MNR,1,0
3,1-their-0,The User shall be able to the select username ...,their,12,The User,UA,be,ARG1,ARG2,1,0
4,2-their-0,Username and Password must be available to ent...,their,16,Username,A,be,ARG1,ARG2,1,0
5,2-their-0,Username and Password must be available to ent...,their,16,Username and Password,A,be,ARG1,ARG2,1,0
6,2-their-0,Username and Password must be available to ent...,their,16,Password,A,be,ARG1,ARG2,1,0
7,2-their-0,Username and Password must be available to ent...,their,16,the system,A,be,ARG2,ARG2,1,0
8,2-their-0,Username and Password must be available to ent...,their,16,order,A,be,ARG2,ARG2,1,0
9,3-their-0,The website shall enable the players to select...,their,8,The website,A,enable,ARG0,ARG1,1,0



SRL Feature Statistics:
Unique predicates: 51
SRL_Match distribution:
SRL_Match
1    354
0    107
Name: count, dtype: int64
Antecedent roles:
Antecedent_Role
ARG1        222
ARG0        140
ARG2         63
ARGM-PRP     11
ARGM-ADV      8
ARGM-TMP      6
ARGM-MNR      4
O             4
ARGM-MOD      1
ARGM-LOC      1
Name: count, dtype: int64
Pronoun roles:
Pronoun_Role
ARG1        309
ARG2         36
ARGM-PRP     26
ARGM-ADV     17
ARGM-MNR     14
ARGM-LOC     14
ARGM-TMP     13
ARG0         13
C-ARG1       10
ARGM-DIR      9
Name: count, dtype: int64

✓ Saved final correct data to anaphoric_ambiguity_srl_final.csv
