In [1]:
import warnings

warnings.filterwarnings('ignore')

In [2]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch



from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
print("Imports dine")

2026-01-11 10:00:15.510491: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768125615.688796      31 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768125615.745437      31 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768125616.181256      31 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768125616.181281      31 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768125616.181284      31 computation_placer.cc:177] computation placer alr

Imports dine


In [3]:
DATA_DIR = "/kaggle/input/iit-kharagpur-hacathin-2026/Dataset"

train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df  = pd.read_csv(f"{DATA_DIR}/test.csv")

BOOK_DIR = f"{DATA_DIR}/Books"

print(len(train_df), len(test_df))

80 60


In [4]:
def normalize_book_name(name: str):
    return (
        name.lower()
        .replace("the ", "")
        .replace(" ", "_")
        .replace("-", "_")
        .strip()
    )

print("Chunck Okay")

Chunck Okay


# 5. Chunk Books

In [5]:
CHUNK_SIZE = 800
OVERLAP = 200

def chunk_text(text):
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = start + CHUNK_SIZE
        chunks.append(" ".join(words[start:end]))
        start += CHUNK_SIZE - OVERLAP
    return chunks

books = {}

for fname in os.listdir(BOOK_DIR):
    raw_name = fname.replace(".txt", "")
    book_key = normalize_book_name(raw_name)

    with open(f"{BOOK_DIR}/{fname}", encoding="utf-8") as f:
        text = f.read()

    books[book_key] = chunk_text(text)

print("Indexed books:", books.keys())


Indexed books: dict_keys(['in_search_of_castaways', 'count_of_monte_cristo'])


# 7. Evidence Retriever (Embedding Index)

In [6]:
class EvidenceRetriever:
    def __init__(self):
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        self.embs = {}
        self.texts = {}

    def index(self, book_name, chunks):
        embs = self.model.encode(chunks, normalize_embeddings=True)
        self.embs[book_name] = embs
        self.texts[book_name] = chunks

    def retrieve(self, book_name, query, k=5):
        q = self.model.encode([query], normalize_embeddings=True)
        sims = np.dot(self.embs[book_name], q.T).squeeze()
        idx = sims.argsort()[-k:][::-1]
        return [self.texts[book_name][i] for i in idx]

retriever = EvidenceRetriever()

for book, chunks in books.items():
    retriever.index(book, chunks)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# 8. NLI Verifier

In [7]:
MODEL_NAME = "roberta-large-mnli"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

model.eval()
if torch.cuda.is_available():
    model.cuda()

LABELS = ["CONTRADICTION", "NEUTRAL", "ENTAILMENT"]

def check_claim(claim, evidence, max_length=512):
    encoded = tokenizer(
        claim,
        evidence,
        truncation="only_second",
        max_length=max_length,
        return_tensors="pt"
    )

    if torch.cuda.is_available():
        encoded = {k: v.cuda() for k, v in encoded.items()}

    with torch.no_grad():
        logits = model(**encoded).logits

    probs = torch.softmax(logits, dim=-1).squeeze()
    idx = torch.argmax(probs).item()

    return LABELS[idx], probs[idx].item()


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


---

# 9. K-FOLD CROSS-VALIDATION

In [8]:
def compute_score(claim, evidences):
    score = 0
    for ev in evidences:
        ev = shorten_evidence(ev)
        label, s = check_claim(claim, ev)

        if label == "CONTRADICTION" and s > 0.9:
            return -5.0  
        elif label == "CONTRADICTION":
            score -= 1.2 * s

        elif label == "ENTAILMENT":
            score += 1.2 * s

        elif label == "NEUTRAL":
            score -= 0.1 * s

    return score


def shorten_evidence(ev, max_words=200): 
    return " ".join(ev.split()[:max_words])

In [9]:
X = train_df
y = (train_df.label == "consistent").astype(int)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

f1_scores = []
acc_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n===== Fold {fold} =====")

    train_split = X.iloc[train_idx]
    val_split   = X.iloc[val_idx]

    train_scores = []

    for _, row in tqdm(train_split.iterrows(), total=len(train_split)):
        book_key = normalize_book_name(row.book_name)
        evidences = retriever.retrieve(book_key, row.content, k=2)  # ðŸ”¥ k=2
        score = compute_score(row.content, evidences)
        y_true = 1 if row.label == "consistent" else 0
        train_scores.append((score, y_true))

    train_scores_df = pd.DataFrame(train_scores, columns=["score", "label"])

    candidates = np.linspace(-3, 3, 61)
    best_f1, best_t = -1, 0

    for t in candidates:
        preds = (train_scores_df.score >= t).astype(int)
        f1 = f1_score(train_scores_df.label, preds)
        if f1 > best_f1:
            best_f1, best_t = f1, t

    THRESHOLD = best_t

    y_true, y_pred = [], []

    for _, row in tqdm(val_split.iterrows(), total=len(val_split)):
        book_key = normalize_book_name(row.book_name)
        evidences = retriever.retrieve(book_key, row.content, k=2)
        score = compute_score(row.content, evidences)

        pred = 0 if score < THRESHOLD else 1
        y_pred.append(pred)
        y_true.append(1 if row.label == "consistent" else 0)

    acc = accuracy_score(y_true, y_pred)
    f1  = f1_score(y_true, y_pred)

    acc_scores.append(acc)
    f1_scores.append(f1)

    print(f"Fold {fold} â†’ Accuracy: {acc:.4f}, F1: {f1:.4f}")



===== Fold 1 =====


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 64/64 [00:08<00:00,  7.43it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16/16 [00:02<00:00,  7.41it/s]


Fold 1 â†’ Accuracy: 0.6875, F1: 0.8148

===== Fold 2 =====


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 64/64 [00:08<00:00,  7.42it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16/16 [00:02<00:00,  7.35it/s]


Fold 2 â†’ Accuracy: 0.6250, F1: 0.7692

===== Fold 3 =====


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 64/64 [00:08<00:00,  7.25it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16/16 [00:02<00:00,  7.16it/s]


Fold 3 â†’ Accuracy: 0.6875, F1: 0.8000

===== Fold 4 =====


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 64/64 [00:09<00:00,  7.00it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16/16 [00:02<00:00,  6.95it/s]


Fold 4 â†’ Accuracy: 0.6250, F1: 0.7692

===== Fold 5 =====


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 64/64 [00:09<00:00,  6.85it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16/16 [00:02<00:00,  6.74it/s]

Fold 5 â†’ Accuracy: 0.6875, F1: 0.8000





# 10. CV RESULTS

In [10]:
print("\n===== FINAL CV RESULTS =====")
print("Mean Accuracy:", round(np.mean(acc_scores), 4))
print("Mean F1:", round(np.mean(f1_scores), 4))


===== FINAL CV RESULTS =====
Mean Accuracy: 0.6625
Mean F1: 0.7907


---

# 11. Train on Full Code

In [11]:
scores = []

for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
    book_key = normalize_book_name(row.book_name)
    evidences = retriever.retrieve(book_key, row.content, k=2)
    score = compute_score(row.content, evidences)
    y_true = 1 if row.label == "consistent" else 0
    scores.append((score, y_true))

scores_df = pd.DataFrame(scores, columns=["score", "label"])

# F1-optimal threshold on full data
candidates = np.linspace(-3, 3, 61)
best_f1, best_t = -1, 0

for t in candidates:
    preds = (scores_df.score >= t).astype(int)
    f1 = f1_score(scores_df.label, preds)
    if f1 > best_f1:
        best_f1, best_t = f1, t

FINAL_THRESHOLD = best_t
print("FINAL THRESHOLD:", FINAL_THRESHOLD)


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 80/80 [00:12<00:00,  6.53it/s]


FINAL THRESHOLD: -3.0


# 12. Submit the Submission

In [12]:
predictions = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    book_key = normalize_book_name(row.book_name)

    evidences = retriever.retrieve(book_key, row.content, k=2)

    score = compute_score(row.content, evidences)

    pred = 0 if score < FINAL_THRESHOLD else 1
    predictions.append(pred)

submission = pd.DataFrame({
    "id": test_df.id,
    "prediction": predictions
})

submission.to_csv("saved_version5_f1score_0.7907.csv", index=False)
submission.head()

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 60/60 [00:09<00:00,  6.50it/s]


Unnamed: 0,id,prediction
0,95,1
1,136,1
2,59,1
3,60,1
4,124,1
