
# 🐱 Waifu Predictor — Notebook MVP (CSV-driven)

This notebook uses **`data.csv`** (25 characters × 20 numeric traits + `summary`) to run an Akinator-style waifu predictor.
- We use **trait-based elimination** for ~10 questions.
- Optional extension cell (at the end) shows how to plug in **embeddings** later (Ollama or TF‑IDF).

> Dataset expectations: columns = `name`, 20 numeric trait columns (e.g., *Agency, Resilience, …, Altruism*), and `summary`.


In [1]:

# 1) Setup & Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load your dataset (already uploaded as /mnt/data/data.csv in this environment)
DATA_PATH = "C:/Users/wadhw/OneDrive/Desktop/CourseAndCoding/best-girl/data.csv"
df = pd.read_csv(DATA_PATH)

print(df.shape)
df.head(3)


(25, 22)


Unnamed: 0,name,Agency,Resilience,Nurturance,Assertiveness,Intellect,Combat Prowess,Emotional Stability,Loyalty,Cynicism,...,Adaptability,Self-Esteem,Empathy,Impulsiveness,Discipline,Ambition,Social Acuity,Independence,Altruism,summary
0,Asuna Yuuki,75,85,80,88,70,95,65,90,20,...,98,70,80,40,92,75,85,60,85,Known as 'The Flash' for her blinding sword sp...
1,Rem,40,70,98,30,60,85,35,100,40,...,65,10,95,75,80,20,70,5,95,A gentle and capable maid who is secretly a po...
2,Hinata Hyuga,60,80,90,55,65,75,75,95,5,...,60,65,98,20,85,50,40,45,90,"The timid heiress of the Hyuga clan, Hinata wa..."


In [2]:

# 2) Identify schema (traits, id, description)
name_col_candidates = [c for c in df.columns if c.lower() == "name"]
summary_col_candidates = [c for c in df.columns if c.lower() == "summary"]

if not name_col_candidates or not summary_col_candidates:
    raise ValueError("Expected 'name' and 'summary' columns to exist. Found: "
                     f"name={name_col_candidates}, summary={summary_col_candidates}")

NAME_COL = name_col_candidates[0]
SUMMARY_COL = summary_col_candidates[0]

trait_cols = df.select_dtypes(include=np.number).columns.tolist()
if len(trait_cols) < 5:
    raise ValueError(f"Expected many numeric trait columns; found {len(trait_cols)}: {trait_cols}")

print("Name column:", NAME_COL)
print("Summary column:", SUMMARY_COL)
print("Trait columns:", trait_cols)


Name column: name
Summary column: summary
Trait columns: ['Agency', 'Resilience', 'Nurturance', 'Assertiveness', 'Intellect', 'Combat Prowess', 'Emotional Stability', 'Loyalty', 'Cynicism', 'Optimism', 'Perseverance', 'Adaptability', 'Self-Esteem', 'Empathy', 'Impulsiveness', 'Discipline', 'Ambition', 'Social Acuity', 'Independence', 'Altruism']


In [3]:

# 3) Normalize traits to [0,1] for fair comparisons
scaler = MinMaxScaler()
traits_norm = pd.DataFrame(scaler.fit_transform(df[trait_cols]), columns=trait_cols, index=df.index)
traits_norm.head(3)


Unnamed: 0,Agency,Resilience,Nurturance,Assertiveness,Intellect,Combat Prowess,Emotional Stability,Loyalty,Cynicism,Optimism,Perseverance,Adaptability,Self-Esteem,Empathy,Impulsiveness,Discipline,Ambition,Social Acuity,Independence,Altruism
0,0.642857,0.571429,0.806452,0.858824,0.625,0.949495,0.611111,0.89899,0.202128,0.752688,0.785714,1.0,0.684211,0.79798,0.402062,0.915789,0.736842,0.833333,0.578947,0.848485
1,0.142857,0.142857,1.0,0.176471,0.5,0.848485,0.277778,1.0,0.414894,0.591398,0.857143,0.547945,0.052632,0.949495,0.762887,0.789474,0.157895,0.666667,0.0,0.949495
2,0.428571,0.428571,0.913978,0.470588,0.5625,0.747475,0.722222,0.949495,0.042553,0.698925,0.971429,0.479452,0.631579,0.979798,0.195876,0.842105,0.473684,0.333333,0.421053,0.89899


In [4]:

# 4) Question strategy
# We'll pick the trait with the highest variance among remaining candidates, then ask user "higher or lower?"
# Answer 'H' keeps top half (higher values), 'L' keeps bottom half, 'S' skips (no change).

def choose_next_trait(remaining_idx):
    # compute std dev within remaining for each trait
    sub = traits_norm.loc[remaining_idx]
    stds = sub.std().sort_values(ascending=False)
    return stds.index[0], stds.iloc[0]

def ask_question_for_trait(trait_name):
    prompt = f"Do you prefer *higher* or *lower* **{trait_name}**? (H/L/S to skip): "
    ans = input(prompt).strip().lower()
    if ans not in {"h","l","s"}:
        print("Invalid input. Skipping this trait.")
        ans = "s"
    return ans

def filter_by_answer(remaining_idx, trait_name, answer):
    if answer == "s":
        return remaining_idx  # no change
    sub = traits_norm.loc[remaining_idx, trait_name]
    threshold = sub.median()  # median split to ~halve the pool
    if answer == "h":
        keep = sub[sub >= threshold].index
    else:  # "l"
        keep = sub[sub < threshold].index
    return list(keep)


In [5]:

# 5) Main elimination loop (10 questions or until <= 3 remain)
remaining = list(df.index)
history = []

MAX_Q = 10
for q in range(1, MAX_Q+1):
    if len(remaining) <= 1:
        break
    
    trait, stdv = choose_next_trait(remaining)
    print(f"Q{q}: Trait to ask = {trait} (std={stdv:.3f}) | Remaining candidates: {len(remaining)}")
    ans = ask_question_for_trait(trait)
    history.append((trait, ans))
    remaining = filter_by_answer(remaining, trait, ans)
    print(f" -> After Q{q}, remaining: {len(remaining)}\n")

print("Questioning complete.")
print("History:", history)
print("Remaining indices:", remaining)
df.loc[remaining, [NAME_COL] + trait_cols].head(len(remaining))


Q1: Trait to ask = Cynicism (std=0.378) | Remaining candidates: 25
 -> After Q1, remaining: 12

Q2: Trait to ask = Combat Prowess (std=0.365) | Remaining candidates: 12
 -> After Q2, remaining: 6

Q3: Trait to ask = Adaptability (std=0.377) | Remaining candidates: 6
 -> After Q3, remaining: 3

Q4: Trait to ask = Emotional Stability (std=0.255) | Remaining candidates: 3
 -> After Q4, remaining: 1

Questioning complete.
History: [('Cynicism', 'l'), ('Combat Prowess', 'h'), ('Adaptability', 'h'), ('Emotional Stability', 'l')]
Remaining indices: [5]


Unnamed: 0,name,Agency,Resilience,Nurturance,Assertiveness,Intellect,Combat Prowess,Emotional Stability,Loyalty,Cynicism,...,Perseverance,Adaptability,Self-Esteem,Empathy,Impulsiveness,Discipline,Ambition,Social Acuity,Independence,Altruism
5,Erza Scarlet,96,95,80,99,85,98,50,98,15,...,95,90,75,85,30,98,70,60,85,92


In [6]:

# 6) Final ranking among remaining by "closeness to your answers"
# Build a simple user preference weight vector from history:
# If user preferred 'H' for a trait, weight = +1; for 'L', weight = -1; 'S' -> 0
# Score each candidate by weighted sum of normalized trait z-scores.

import numpy as np

weights = {t:(1 if a=="h" else (-1 if a=="l" else 0)) for t,a in history}
if not weights:
    # fallback: equal weights on most variable traits
    sub = traits_norm.loc[remaining]
    stds = sub.std().sort_values(ascending=False)
    weights = {t:1 for t in stds.index[:5]}

w_vec = np.array([weights.get(t, 0) for t in trait_cols])
subX = traits_norm.loc[remaining, trait_cols].values
scores = subX @ w_vec

rank_df = df.loc[remaining, [NAME_COL, SUMMARY_COL]].copy()
rank_df["score"] = scores
rank_df = rank_df.sort_values("score", ascending=False).reset_index(drop=True)
rank_df.head(len(rank_df))


Unnamed: 0,name,summary,score
0,Erza Scarlet,An S-Class Mage of the Fairy Tail guild known ...,1.276828



## (Optional) 7) Embeddings for nuance (plug in later)
- Compute embeddings for `summary` (e.g., Ollama with an embedding model, or `sentence-transformers` fallback).
- Build a "user preference" textual profile from your H/L answers (e.g., "low cynicism, high empathy…") and embed it.
- Combine trait score and embedding cosine similarity for final ranking.
