In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from scipy.spatial.distance import cosine

#dummy for genre column
import random


# Configuration
# Page-count range for each size bucket, stored as (lower, upper).
# Insertion order matters: the buckets are listed to the user in this order.
SIZE_BUCKETS = dict(
    short=(0, 250),
    medium=(250, 500),
    long=(500, 700),
    extra_long=(700, np.inf),
)

# Relative weight of each component in the final recommendation score.
WEIGHTS = dict(genre=0.5, length=0.25, reference=0.25)

# 1. Data loading
def load_data(full_path: str, niche_path: str) -> (pd.DataFrame, pd.DataFrame):
    """
    Read the two Excel workbooks used by the recommender.

    full_path points at the full catalog and niche_path at the niche dataset.
    The catalog is read first; the result is the pair (full_df, niche_df).
    """
    catalog, niche = (pd.read_excel(path) for path in (full_path, niche_path))
    return catalog, niche

# 2. Preprocessing
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """
    Parse genres, ensure pages numeric, assign size buckets.

    The frame is modified in place and also returned. Three columns are
    written:

    - 'genre_list': the comma-separated 'genre' string split into a list of
      names with surrounding whitespace removed and empty entries dropped
      (so 'A, B' yields ['A', 'B'], not ['A', ' B']). Non-string values
      (e.g. missing genres) stay missing (NaN).
    - 'num_pages': converted to numeric; unparseable values become NaN.
    - 'size_bucket': the SIZE_BUCKETS label whose range contains the page
      count, or None when the count is missing or falls in no bucket.
    """
    def split_genres(raw):
        # Keep missing genres as NaN so callers can still dropna() them.
        if not isinstance(raw, str):
            return np.nan
        return [name.strip() for name in raw.split(',') if name.strip()]

    def assign_bucket(pages):
        # NaN never satisfies a range comparison; make that explicit.
        if pd.isna(pages):
            return None
        for label, (lo, hi) in SIZE_BUCKETS.items():
            if lo <= pages < hi:
                return label
        return None

    df['genre_list'] = df['genre'].apply(split_genres)
    df['num_pages'] = pd.to_numeric(df['num_pages'], errors='coerce')
    df['size_bucket'] = df['num_pages'].apply(assign_bucket)
    return df

# 3. User input handling
def get_user_preferences(genres: list, full_df: pd.DataFrame) -> dict:
    """
    Prompt user for preferred genres, size bucket, and perform reference rounds.

    Parameters:
        genres: genre names offered to the user, shown as a numbered menu.
        full_df: preprocessed catalog; must provide 'genre_list',
            'size_bucket', 'title', 'authors' and 'num_pages'. When a
            'ratings_count' column exists, the most-rated books are
            preferred as reference candidates.

    Returns a dict with keys: 'genres' (selected names, duplicates removed),
    'size_bucket' (a SIZE_BUCKETS label) and 'references' (the distinct
    catalog rows, as Series, that the user picked in the reference rounds).

    Notes:
        - Menu numbers must lie between 1 and the number of options; 0 and
          negative numbers are rejected instead of indexing from the end.
        - The book picked in a round is shown again in the next round against
          a different book; picking it again does not add it twice.
        - If fewer than two catalog books match the chosen genres and size
          bucket, the reference rounds are skipped.
    """
    prefs = {'genres': [], 'size_bucket': None, 'references': []}

    # Program introduction
    print("\nWelcome to the Niche Book Recommender!")
    print("We will suggest high-quality niche books based on your preferred genres, desired length, and sample choices.\n")

    # 3.1 Genre selection
    print("Available genres:")
    for idx, g in enumerate(genres, start=1):
        print(f"{idx}. {g}")
    sel = input("Select one or more genres by number (comma-separated): ")
    while True:
        try:
            choices = [int(x.strip()) for x in sel.split(',')]
        except ValueError:
            choices = []
        if choices and all(1 <= i <= len(genres) for i in choices):
            # dict.fromkeys drops repeated numbers while keeping their order.
            prefs['genres'] = [genres[i - 1] for i in dict.fromkeys(choices)]
            break
        sel = input("Invalid selection. Enter genre numbers separated by commas: ")

    # 3.2 Size bucket selection
    bucket_labels = list(SIZE_BUCKETS)
    print("\nSize buckets (page ranges):")
    for idx, label in enumerate(bucket_labels, start=1):
        lo, hi = SIZE_BUCKETS[label]
        if np.isfinite(hi):
            page_range = f"{int(lo)} to {int(hi)} pages"
        else:
            # Open-ended bucket: avoid printing "700 to and above pages".
            page_range = f"{int(lo)} pages and above"
        print(f"{idx}. {label} ({page_range})")
    sb = input("Select a size bucket by number: ")
    while True:
        try:
            i = int(sb.strip())
        except ValueError:
            i = 0  # out of range, forces a re-prompt
        if 1 <= i <= len(bucket_labels):
            prefs['size_bucket'] = bucket_labels[i - 1]
            break
        sb = input("Invalid selection. Enter the number for your size bucket: ")

    # 3.3 Reference-book rounds
    # The candidate pool depends only on the answers above, so build it once.
    wanted = set(prefs['genres'])
    mask_genre = full_df['genre_list'].apply(
        lambda gl: isinstance(gl, (list, tuple, set)) and not wanted.isdisjoint(gl)
    ).astype(bool)
    mask_bucket = full_df['size_bucket'] == prefs['size_bucket']
    candidates_df = full_df[mask_genre & mask_bucket]
    if 'ratings_count' in candidates_df.columns:
        candidates_df = candidates_df.sort_values('ratings_count', ascending=False)
    top_df = candidates_df.head(50)

    if len(top_df) < 2:
        print("\nNot enough matching books to run reference rounds; skipping them.")
        return prefs

    previous_choice = None
    for round_num in range(1, 4):
        if previous_choice is not None:
            # Keep the catalog index label on the chosen row so it can be
            # excluded here; otherwise it could be drawn against itself.
            pool = top_df[top_df.index != previous_choice.name]
            if pool.empty:
                break
            pair = pd.concat([previous_choice.to_frame().T, pool.sample(1)])
        else:
            pair = top_df.sample(2)

        print(f"\nRound {round_num}: choose a book or enter 'n' if you haven't read either:")
        for pos, (_, row) in enumerate(pair.iterrows(), start=1):
            print(f"{pos}. {row['title']} by {row['authors']} ({int(row['num_pages'])} pages)")
        choice = input("Enter 1 or 2, or 'n': ")
        while choice not in ['1', '2', 'n']:
            choice = input("Invalid. Enter 1, 2, or 'n': ")
        if choice in ['1', '2']:
            sel_row = pair.iloc[int(choice) - 1]
            # Record each book once, even if it wins several rounds.
            if all(ref.name != sel_row.name for ref in prefs['references']):
                prefs['references'].append(sel_row)
            previous_choice = sel_row
        else:
            print("No reference selected this round.")

    return prefs

# 4. Build user profile
def build_user_profile(prefs: dict, full_df: pd.DataFrame) -> dict:
    """
    Convert the collected preferences into the encoders and vectors used for scoring.

    A MultiLabelBinarizer is fitted on the catalog's non-missing genre lists
    and a MinMaxScaler on its page counts. The returned dict holds both
    fitted objects ('mlb', 'scaler'), the encoded preferred genres
    ('genre_vector'), the scaled midpoint of the chosen size bucket
    ('length_target'), the mean of the reference-book vectors
    ('reference_centroid', all zeros when no references were chosen) and the
    chosen bucket label ('size_bucket').
    """
    genre_encoder = MultiLabelBinarizer()
    genre_encoder.fit(full_df['genre_list'].dropna())
    preferred_genres = genre_encoder.transform([prefs['genres']])[0]

    page_scaler = MinMaxScaler()
    page_scaler.fit(full_df['num_pages'].values.reshape(-1, 1))

    def scale_pages(pages):
        # The scaler works on 2-D input; wrap the scalar and unwrap the result.
        return page_scaler.transform([[pages]])[0][0]

    bucket_lo, bucket_hi = SIZE_BUCKETS[prefs['size_bucket']]
    if np.isinf(bucket_hi):
        # Open-ended bucket: use the longest catalog book as the upper end.
        bucket_hi = full_df['num_pages'].max()
    length_target = scale_pages((bucket_lo + bucket_hi) / 2)

    # One vector per reference book: genre indicators followed by scaled pages.
    reference_vectors = [
        np.concatenate([
            genre_encoder.transform([book['genre_list']])[0],
            [scale_pages(book['num_pages'])],
        ])
        for book in prefs['references']
    ]
    if reference_vectors:
        reference_centroid = np.mean(reference_vectors, axis=0)
    else:
        reference_centroid = np.zeros(len(genre_encoder.classes_) + 1)

    return {
        'mlb': genre_encoder,
        'scaler': page_scaler,
        'genre_vector': preferred_genres,
        'length_target': length_target,
        'reference_centroid': reference_centroid,
        'size_bucket': prefs['size_bucket'],
    }

# 5. Scoring candidates
def score_candidates(niche_df: pd.DataFrame, profile: dict, weights: dict) -> pd.DataFrame:
    """
    Score every niche book against the user profile.

    Parameters:
        niche_df: preprocessed niche books with 'genre_list' and 'num_pages'.
        profile: dict with 'mlb', 'scaler', 'genre_vector', 'size_bucket'
            and 'reference_centroid' entries.
        weights: dict with 'genre', 'length' and 'reference' weights.

    Returns a copy of niche_df with an added float 'score' column, the
    weighted sum of three components:

    - genre: share of the preferred genres that the book has;
    - length: 1.0 inside the preferred size bucket, otherwise decreasing
      linearly with the distance from the bucket and floored at 0.0
      (0.0 when the page count is missing);
    - reference: cosine similarity between the book vector and the reference
      centroid, or 0.0 when there are no references or it is undefined.

    Books with a missing genre list are treated as having no genres, and an
    empty input yields an empty result with a 'score' column.
    """
    scored = niche_df.copy()
    if scored.empty:
        # The encoders cannot transform zero rows; nothing to score anyway.
        scored['score'] = pd.Series(dtype=float)
        return scored

    mlb = profile['mlb']
    scaler = profile['scaler']
    user_genre_vec = profile['genre_vector']
    ref_centroid = profile['reference_centroid']
    has_ref = np.linalg.norm(ref_centroid) > 0

    # Values that are the same for every row are computed once up front.
    lo, hi = SIZE_BUCKETS[profile['size_bucket']]
    span = lo if np.isinf(hi) else hi - lo
    genre_total = user_genre_vec.sum()

    # Encode all genre lists in one call instead of one call per row.
    genre_lists = [
        gl if isinstance(gl, (list, tuple, set)) else []
        for gl in scored['genre_list']
    ]
    genre_matrix = mlb.transform(genre_lists)

    pages_col = pd.to_numeric(scored['num_pages'], errors='coerce').astype(float).to_numpy()
    if has_ref:
        pages_scaled = scaler.transform(pages_col.reshape(-1, 1))[:, 0]
    else:
        pages_scaled = np.full(len(pages_col), np.nan)

    scores = []
    for book_genre_vec, pages, page_norm in zip(genre_matrix, pages_col, pages_scaled):
        if genre_total > 0:
            genre_score = np.dot(book_genre_vec, user_genre_vec) / genre_total
        else:
            genre_score = 0.0

        if lo <= pages < hi:
            length_score = 1.0
        elif np.isnan(pages) or span <= 0:
            # Missing page count, or a bucket with no usable width to scale by.
            length_score = 0.0
        else:
            dist = lo - pages if pages < lo else pages - hi
            length_score = max(0.0, 1 - (dist / span))

        ref_score = 0.0
        if has_ref:
            book_vec = np.concatenate([book_genre_vec, [page_norm]])
            # Cosine similarity is undefined for NaN or all-zero vectors.
            if not np.isnan(book_vec).any() and book_vec.any():
                ref_sim = 1 - cosine(book_vec, ref_centroid)
                ref_score = 0.0 if np.isnan(ref_sim) else ref_sim

        scores.append(float(
            weights['genre'] * genre_score
            + weights['length'] * length_score
            + weights['reference'] * ref_score
        ))

    scored['score'] = scores
    return scored

# 6. Select top N
def select_top_n(scored_df: pd.DataFrame, n: int = 10) -> pd.DataFrame:
    """
    Return the n best-scoring rows of scored_df.

    Rows are ordered by 'score', highest first. When a 'ratings_count'
    column is present, ties are broken in favour of the book with fewer
    ratings; when it is absent, ordering is by score alone instead of
    raising a KeyError.
    """
    if 'ratings_count' in scored_df.columns:
        ordered = scored_df.sort_values(
            by=['score', 'ratings_count'], ascending=[False, True]
        )
    else:
        ordered = scored_df.sort_values(by='score', ascending=False)
    return ordered.head(n)

# 7. Main
def main():
    """
    Run the recommender end to end.

    Loads both workbooks, assigns placeholder genres, preprocesses the data,
    collects the user's preferences interactively, scores the niche books and
    prints the weights, the selections and the top recommendations.
    """
    full_df, niche_df = load_data('books.xlsx', 'over 4 and under 200.xlsx')

    # dummy for genre column
    dummy_genres = ['TestGenreA', 'TestGenreB', 'TestGenreC']

    def random_genre_string(_row):
        # One or two distinct placeholder genres, comma-joined.
        return ','.join(random.sample(dummy_genres, k=random.randint(1, 2)))

    full_df['genre'] = full_df.apply(random_genre_string, axis=1)
    niche_df['genre'] = niche_df.apply(random_genre_string, axis=1)

    full_df = preprocess(full_df)
    niche_df = preprocess(niche_df)

    # Every distinct genre found in the catalog, in alphabetical order.
    genre_names = set()
    for genre_list in full_df['genre_list'].dropna():
        genre_names.update(genre_list)
    all_genres = sorted(genre_names)

    prefs = get_user_preferences(all_genres, full_df)
    profile = build_user_profile(prefs, full_df)
    recommendations = select_top_n(score_candidates(niche_df, profile, WEIGHTS))

    # Explanation of calculation
    print("\nCalculation details:")
    print(f"Genre weight: {WEIGHTS['genre']}")
    print(f"Length weight: {WEIGHTS['length']}")
    print(f"Reference weight: {WEIGHTS['reference']}\n")
    print(f"Selected genres: {prefs['genres']}")
    print(f"Selected size bucket: {prefs['size_bucket']}\n")
    if not prefs['references']:
        print("No reference books selected.")
    else:
        print("Reference books chosen:")
        for ref in prefs['references']:
            print(f" - {ref['title']} by {ref['authors']}")

    # Final recommendations
    print("\nBased on your selections, here are the recommended niche books:\n")
    display_columns = ['title', 'authors', 'average_rating', 'num_pages', 'genre_list', 'score']
    results_df = recommendations[display_columns]
    print(results_df.to_string(index=False))

# Start the interactive recommender only when this file is run as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()



Welcome to the Niche Book Recommender!
We will suggest high-quality niche books based on your preferred genres, desired length, and sample choices.

Available genres:
1. TestGenreA
2. TestGenreB
3. TestGenreC

Size buckets (page ranges):
1. short (0 to 250 pages)
2. medium (250 to 500 pages)
3. long (500 to 700 pages)
4. extra_long (700 to and above pages)

Round 1: choose a book or enter 'n' if you haven't read either:
1. The Physician (Cole Family Trilogy  #1) by Noah Gordon (720 pages)
2. The Count of Monte Cristo by Alexandre Dumas/Robin Buss (1276 pages)

Round 2: choose a book or enter 'n' if you haven't read either:
1. The Physician (Cole Family Trilogy  #1) by Noah Gordon (720 pages)
2. War and Remembrance (The Henry Family  #2) by Herman Wouk (1042 pages)

Round 3: choose a book or enter 'n' if you haven't read either:
1. The Physician (Cole Family Trilogy  #1) by Noah Gordon (720 pages)
2. Harry Potter and the Order of the Phoenix (Harry Potter  #5) by J.K. Rowling/Mary Gran