# Milestone 2: Model Evaluation & Ranking Generation
**M·ª•c ti√™u:**
1. ƒê·ªçc d·ªØ li·ªáu `labels.json` (ch·ª©a input BibTeX v√† Ground Truth).
2. T√°i c·∫•u tr√∫c d·ªØ li·ªáu theo t·ª´ng b√†i b√°o (Paper-based grouping).
3. S·ª≠ d·ª•ng m√¥ h√¨nh ƒë√£ hu·∫•n luy·ªán ƒë·ªÉ x·∫øp h·∫°ng (Ranking) c√°c reference candidates.
4. T√≠nh ch·ªâ s·ªë **MRR (Mean Reciprocal Rank)**.
5. Xu·∫•t file `pred.json` theo ƒë√∫ng ƒë·ªãnh d·∫°ng y√™u c·∫ßu n·ªôp b√†i.

**Y√™u c·∫ßu ƒë·∫ßu v√†o:**
* `labels.json`: File d·ªØ li·ªáu test.
* `models/best_matcher.pkl`: Model ƒë√£ train.
* `models/feature_names.pkl`: List t√™n features t∆∞∆°ng ·ª©ng.


In [12]:
import pandas as pd
import numpy as np
import json
import os
import re
import joblib
from tqdm import tqdm
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# C√†i ƒë·∫∑t hi·ªÉn th·ªã
pd.set_option('display.max_columns', None)
print("Libraries imported successfully!")


Libraries imported successfully!


In [13]:
# --- C·∫§U H√åNH ƒê∆Ø·ªúNG D·∫™N ---
# H√£y thay ƒë·ªïi ƒë∆∞·ªùng d·∫´n ph√π h·ª£p v·ªõi m√¥i tr∆∞·ªùng c·ªßa b·∫°n
TEST_FILE_PATH = '../../dataset_final/test/labels.json'           # File d·ªØ li·ªáu test
MODEL_PATH = '../../dataset_final/models/best_matcher.pkl'   # File model (ƒë√£ train ·ªü b∆∞·ªõc tr∆∞·ªõc)
FEATURE_NAME_PATH = '../../dataset_final/models/feature_names.pkl' # Danh s√°ch features
OUTPUT_DIR = 'submission_output'         # Th∆∞ m·ª•c ch·ª©a k·∫øt qu·∫£ pred.json

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
    print(f"Created output directory: {OUTPUT_DIR}")


In [None]:

def normalize_text_basic(text):
    """Chu·∫©n h√≥a c∆° b·∫£n ƒë·ªÉ t√≠nh to√°n kho·∫£ng c√°ch"""
    if not isinstance(text, str): return ""
    return str(text).lower().strip()

def get_tokens(text_list_or_str):
    """Chuy·ªÉn ƒë·ªïi text ho·∫∑c list text th√†nh set c√°c t·ª´ ƒë∆°n (tokens)"""
    if isinstance(text_list_or_str, list):
        text = " ".join([str(t) for t in text_list_or_str])
    else:
        text = str(text_list_or_str)
    
    # B·ªè d·∫•u c√¢u, gi·ªØ l·∫°i ch·ªØ s·ªë v√† ch·ªØ c√°i
    text = re.sub(r'[^\w\s]', '', text.lower())
    return set(text.split())

def safe_year_diff(y1, y2):
    """T√≠nh kho·∫£ng c√°ch nƒÉm, x·ª≠ l√Ω l·ªói n·∫øu thi·∫øu d·ªØ li·ªáu"""
    try:
        # L·∫•y 4 s·ªë ƒë·∫ßu ti√™n t√¨m th·∫•y l√†m nƒÉm
        m1 = re.search(r'\d{4}', str(y1))
        m2 = re.search(r'\d{4}', str(y2))
        
        if m1 and m2:
            val1 = int(m1.group(0))
            val2 = int(m2.group(0))
            diff = abs(val1 - val2)
            # Clip kho·∫£ng c√°ch ƒë·ªÉ tr√°nh outlier qu√° l·ªõn (v√≠ d·ª• sai s·ªë 100 nƒÉm)
            return min(diff, 10) 
        return -1 # Gi√° tr·ªã missing indicator
    except:
        return -1

## 1. Feature Engineering
**L∆∞u √Ω quan tr·ªçng:** C√°c h√†m d∆∞·ªõi ƒë√¢y ph·∫£i **gi·ªëng h·ªát** logic b·∫°n ƒë√£ s·ª≠ d·ª•ng khi hu·∫•n luy·ªán m√¥ h√¨nh (Training Phase). N·∫øu thay ƒë·ªïi, model s·∫Ω nh·∫≠n di·ªán sai ƒë·∫∑c tr∆∞ng.


In [None]:
# --- Helper: Text Processing ---
def normalize_text(text):
    if pd.isna(text) or text is None: return ""
    text = str(text).lower().strip()
    return text

def get_tokens(text_list_or_str):
    if isinstance(text_list_or_str, list):
        text = " ".join([str(t) for t in text_list_or_str])
    else:
        text = str(text_list_or_str)
    text = re.sub(r'[^\w\s]', '', text.lower())
    return set(text.split())
def parse_bibtex_content(bib_content):
    """
    Tr√≠ch xu·∫•t Title v√† Authors t·ª´ chu·ªói BibTeX raw.
    ƒê√£ s·ª≠a l·ªói Regex pattern error.
    """
    # Extract Title
    # S·ª¨A L·ªñI: Thay (?<!\) th√†nh (?<!\\)
    title_match = re.search(r'title\s*=\s*[\{"](.*?)(?<!\\)[\}"]', bib_content, re.I | re.S)
    title = title_match.group(1) if title_match else ""
    title = normalize_text(title)
    
    # Extract Authors
    # S·ª¨A L·ªñI: T∆∞∆°ng t·ª± cho ph·∫ßn authors
    author_match = re.search(r'author\s*=\s*[\{"](.*?)(?<!\\)[\}"]', bib_content, re.I | re.S)
    authors_str = author_match.group(1) if author_match else ""
    # T√°ch author ƒë∆°n gi·∫£n b·∫±ng 'and'
    authors = [normalize_text(a) for a in authors_str.split(' and ')] if authors_str else []
    
    return title, authors
def compute_pairwise_features(row):
    feats = {}
    
    # --- A. UNPACK DATA ---
    q_tit = normalize_text_basic(row.get('bib_title', ''))
    q_auth_list = row.get('bib_authors', [])
    q_id = normalize_text_basic(row.get('bib_id', ''))
    # L·∫•y nƒÉm v√† √©p ki·ªÉu v·ªÅ string ƒë·ªÉ x·ª≠ l√Ω an to√†n
    q_year = str(row.get('bib_year', ''))
    
    c_tit = normalize_text_basic(row.get('cand_title', ''))
    c_auth_list = row.get('cand_authors', [])
    c_id = normalize_text_basic(row.get('cand_id', ''))
    c_year = str(row.get('cand_year', '')) # Ground truth year
    
    # Chuy·ªÉn list author th√†nh string ƒë·ªÉ d√πng fuzzy match
    q_auth_str = " ".join(q_auth_list) if isinstance(q_auth_list, list) else str(q_auth_list)
    c_auth_str = " ".join(c_auth_list) if isinstance(c_auth_list, list) else str(c_auth_list)

    # --- B. ID FEATURES (GOLDEN FEATURE) ---
    id_score = 0.0
    if q_id and c_id:
        clean_q = re.sub(r'[^a-z0-9]', '', q_id)
        clean_c = re.sub(r'[^a-z0-9]', '', c_id)
        if clean_q == clean_c: id_score = 1.0
        elif clean_q in clean_c or clean_c in clean_q: id_score = 0.8
    feats['feat_id_match'] = id_score

    # --- C. TITLE FEATURES ---
    feats['feat_title_fuzzy'] = fuzz.ratio(q_tit, c_tit) / 100.0
    feats['feat_title_sort'] = fuzz.token_sort_ratio(q_tit, c_tit) / 100.0
    feats['feat_title_partial'] = fuzz.partial_ratio(q_tit, c_tit) / 100.0
    
    # Feature m·ªõi: Ki·ªÉm tra xem Title query c√≥ n·∫±m tr·ªçn trong Candidate kh√¥ng (v√† ng∆∞·ª£c l·∫°i)
    # Gi√∫p b·∫Øt tr∆∞·ªùng h·ª£p BibTeX vi·∫øt t·∫Øt
    feats['feat_title_contain'] = 1.0 if (q_tit and c_tit and (q_tit in c_tit or c_tit in q_tit)) else 0.0

    len_q = len(q_tit)
    len_c = len(c_tit)
    feats['feat_title_len_diff'] = abs(len_q - len_c) / max(len_q, len_c, 1) # tr√°nh chia 0

    # --- D. AUTHOR FEATURES (N√ÇNG C·∫§P) ---
    q_tokens = get_tokens(q_auth_list)
    c_tokens = get_tokens(c_auth_list)
    
    # 1. Jaccard (Gi·ªØ nguy√™n)
    if q_tokens and c_tokens:
        inter = len(q_tokens.intersection(c_tokens))
        union = len(q_tokens.union(c_tokens))
        feats['feat_auth_jaccard'] = inter / union
        feats['feat_auth_overlap'] = inter
    else:
        feats['feat_auth_jaccard'] = 0.0
        feats['feat_auth_overlap'] = 0
        
    # 2. Author Fuzzy Sort (M·ªöI - R·∫•t quan tr·ªçng)
    # Gi√∫p x·ª≠ l√Ω: "Bengio, Y." vs "Yoshua Bengio"
    feats['feat_auth_token_sort'] = fuzz.token_sort_ratio(q_auth_str, c_auth_str) / 100.0
        
    # 3. First Author Match
    try:
        a1_q = str(q_auth_list[0]).split()[0].lower() if len(q_auth_list) > 0 else ""
        a1_c = str(c_auth_list[0]).split()[0].lower() if len(c_auth_list) > 0 else ""
        # So s√°nh 3 k√Ω t·ª± ƒë·∫ßu
        feats['feat_first_auth_match'] = 1.0 if (a1_q and a1_c and a1_q[:3] == a1_c[:3]) else 0.0
    except:
        feats['feat_first_auth_match'] = 0.0
        
    # --- E. YEAR FEATURE (UNCOMMENT & FIX) ---
    # NƒÉm l√† feature c·ª±c m·∫°nh ƒë·ªÉ l·ªçc nhi·ªÖu
    feats['feat_year_diff'] = safe_year_diff(q_year, c_year)
    # Th√™m feature binary: C√≥ kh·ªõp nƒÉm ch√≠nh x√°c kh√¥ng?
    feats['feat_year_match'] = 1.0 if (feats['feat_year_diff'] == 0) else 0.0

    return feats

## 2. Load & Transform Data
Ch√∫ng ta c·∫ßn chuy·ªÉn ƒë·ªïi d·ªØ li·ªáu t·ª´ danh s√°ch ph·∫≥ng (flat list) trong `manual.json` sang c·∫•u tr√∫c **Paper-based**.
* M·ªói `source_paper_id` s·∫Ω l√† m·ªôt nh√≥m.
* T·∫≠p **Candidates** c·ªßa nh√≥m ƒë√≥ l√† t·∫≠p h·ª£p t·∫•t c·∫£ c√°c `ground_truth` unique xu·∫•t hi·ªán trong paper.


In [15]:
# Load d·ªØ li·ªáu
try:
    with open(TEST_FILE_PATH, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)
    print(f"Loaded {len(raw_data)} entries from {TEST_FILE_PATH}")
except FileNotFoundError:
    print(f"‚ùå Error: File {TEST_FILE_PATH} not found.")
    raw_data = []

# Grouping logic
papers_db = defaultdict(lambda: {'queries': [], 'candidates': {}})

for item in raw_data:
    paper_id = item.get('source_paper_id', 'unknown_paper')
    
    # 1. Input Query (BibTeX)
    bib_key = item.get('key')
    bib_content = item.get('content')
    q_title, q_authors = parse_bibtex_content(bib_content)
    
    query_obj = {
        'key': bib_key,
        'bib_title': q_title,
        'bib_authors': q_authors,
        'true_id': item.get('ground_truth', {}).get('id')
    }
    papers_db[paper_id]['queries'].append(query_obj)
    
    # 2. Candidate Pool (References)
    gt = item.get('ground_truth', {})
    cand_id = gt.get('id')
    
    if cand_id and cand_id not in papers_db[paper_id]['candidates']:
        cand_authors = gt.get('authors', [])
        papers_db[paper_id]['candidates'][cand_id] = {
            'cand_id': cand_id,
            'cand_title': normalize_text(gt.get('title', '')),
            'cand_authors': [normalize_text(a) for a in cand_authors]
        }

print(f"‚úÖ Data transformed into {len(papers_db)} papers.")


Loaded 812 entries from ../../dataset_final/test/labels.json
‚úÖ Data transformed into 39 papers.


## 3. Load Trained Model
Load model SVM/RandomForest/XGBoost ƒë√£ l∆∞u t·ª´ b∆∞·ªõc Training.


In [16]:
try:
    model = joblib.load(MODEL_PATH)
    feature_names = joblib.load(FEATURE_NAME_PATH)
    print(f"‚úÖ Model loaded: {type(model).__name__}")
    print(f"üìù Expected features: {len(feature_names)}")
except FileNotFoundError:
    print("‚ö†Ô∏è WARNING: Model file not found. Code will run with RANDOM SCORES for demonstration.")
    model = None
    feature_names = []


‚úÖ Model loaded: XGBClassifier
üìù Expected features: 9


## 4. Ranking Pipeline & Prediction
V·ªõi m·ªói b√†i b√°o:
1. T·∫°o c·∫∑p (Query, Candidate) cho t·∫•t c·∫£ ·ª©ng vi√™n.
2. T√≠nh feature.
3. D·ª± ƒëo√°n x√°c su·∫•t match.
4. L·∫•y Top 5 ·ª©ng vi√™n c√≥ ƒëi·ªÉm cao nh·∫•t.


In [17]:
global_mrr_sum = 0
global_query_count = 0

print("üöÄ Starting Ranking Pipeline...")

for paper_id, data in papers_db.items():
    queries = data['queries']
    candidates_dict = data['candidates']
    candidates_list = list(candidates_dict.values())
    
    if not candidates_list:
        continue

    # Init Output Structure
    submission_data = {
        "partition": "test", 
        "groundtruth": {},
        "prediction": {}
    }
    
    # Loop qua t·ª´ng query trong paper
    for query in tqdm(queries, desc=f"Paper {paper_id}", leave=False):
        bib_key = query['key']
        true_id = query['true_id']
        
        # Groundtruth
        submission_data['groundtruth'][bib_key] = true_id
        
        # Pairing & Feature Calc
        pairs = []
        for cand in candidates_list:
            row = {}
            row.update(query)
            row.update(cand)
            pairs.append(row)
            
        # Compute Features
        feats_list = [compute_pairwise_features(p) for p in pairs]
        df_feats = pd.DataFrame(feats_list)
        
        # Predict
        scores = []
        if model:
            # Ensure columns exist
            for col in feature_names:
                if col not in df_feats.columns: df_feats[col] = 0.0
            
            X_input = df_feats[feature_names]
            scores = model.predict_proba(X_input)[:, 1]
        else:
            scores = np.random.rand(len(pairs)) # Fallback
            
        # Ranking
        ranked_candidates = []
        for idx, score in enumerate(scores):
            ranked_candidates.append((candidates_list[idx]['cand_id'], score))
        
        ranked_candidates.sort(key=lambda x: x[1], reverse=True)
        top_5 = [x[0] for x in ranked_candidates[:5]]
        
        # Save Prediction
        submission_data['prediction'][bib_key] = top_5
        
        # Calc MRR
        if true_id in top_5:
            rank = top_5.index(true_id) + 1
            global_mrr_sum += 1.0 / rank
        else:
            global_mrr_sum += 0.0
            
        global_query_count += 1
        
    # Save JSON Output
    # T√™n file pred.json n√™n ƒë·∫∑t trong folder ri√™ng c·ªßa student/paper n·∫øu n·ªôp th·∫≠t
    # ·ªû ƒë√¢y l∆∞u d·∫°ng prefix_pred.json ƒë·ªÉ d·ªÖ ki·ªÉm tra
    safe_pid = str(paper_id).replace('/', '_')
    save_path = os.path.join(OUTPUT_DIR, f"{safe_pid}_pred.json")
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(submission_data, f, indent=4)

print("‚úÖ Ranking completed for all papers.")


üöÄ Starting Ranking Pipeline...


                                                                  

‚úÖ Ranking completed for all papers.




## 5. Evaluation Report (Nh·∫≠n x√©t)


In [18]:
final_mrr = global_mrr_sum / global_query_count if global_query_count > 0 else 0

print("="*40)
print("üìä FINAL EVALUATION REPORT")
print("="*40)
print(f"Papers Processed: {len(papers_db)}")
print(f"Total Queries:    {global_query_count}")
print(f"Metric MRR:       {final_mrr:.4f}")
print("="*40)

if final_mrr > 0.8:
    print("üåü Nh·∫≠n x√©t: Xu·∫•t s·∫Øc. Model x·∫øp h·∫°ng reference r·∫•t ch√≠nh x√°c.")
elif final_mrr > 0.5:
    print("üëç Nh·∫≠n x√©t: Kh√°. Model ƒë∆∞a ra ·ª©ng vi√™n ƒë√∫ng trong top 5 th∆∞·ªùng xuy√™n.")
else:
    print("‚ö†Ô∏è Nh·∫≠n x√©t: C·∫ßn c·∫£i thi·ªán. Ki·ªÉm tra l·∫°i Features ho·∫∑c Model.")

print(f"\nüìÅ File k·∫øt qu·∫£ (pred.json) l∆∞u t·∫°i: {os.path.abspath(OUTPUT_DIR)}")


üìä FINAL EVALUATION REPORT
Papers Processed: 39
Total Queries:    812
Metric MRR:       0.2893
‚ö†Ô∏è Nh·∫≠n x√©t: C·∫ßn c·∫£i thi·ªán. Ki·ªÉm tra l·∫°i Features ho·∫∑c Model.

üìÅ File k·∫øt qu·∫£ (pred.json) l∆∞u t·∫°i: d:\Coding\School\Y3-K1\Intro2DS\DS - LAB 2\Milestone2_Project\23127011\notebooks\submission_output
