In [36]:
from typing import Dict, Any, Optional
import numpy as np
import pandas as pd

# Load the passenger table; the path is relative to the current working directory.
df = pd.read_csv("data_tms.csv")

# Global baseline: mean of the "Survived" column over the whole table.
# The per-group deltas computed further down are measured against this value.
BASE_SURVIVAL_RATE: float = float(df["Survived"].mean())
print("BASE_SURVIVAL_RATE:", round(BASE_SURVIVAL_RATE, 3))

# Helper functions

def map_age_to_group(age_value: Optional[float]) -> str:
    """Map raw age to a coarse age group.

    Args:
        age_value: Age in years; may be None or NaN when the age is unknown.

    Returns:
        One of: "child" (age <= 15), "adult" (15 < age <= 60), "elder" (age > 60).
        If age is missing (None or NaN), returns "adult" by default.
    """
    # Explicit missing-value guard: comparing None with a number raises
    # TypeError, and NaN would otherwise only reach "adult" by accident
    # (every comparison with NaN is False).
    if age_value is None or pd.isna(age_value):
        return "adult"
    if age_value <= 15:
        return "child"
    if age_value > 60:
        return "elder"
    return "adult"

def map_family_size_to_group(family_size: int) -> str:
    """Classify a family size into a named bucket.

    Args:
        family_size: FamilySize = SibSp + Parch + 1.

    Returns:
        "alone" when the size is exactly 1, "small" when it lies in 2..4
        (inclusive), and "large" for every other value (5 and above; values
        below 1 also end up here).
    """
    if family_size == 1:
        bucket = "alone"
    elif 2 <= family_size <= 4:
        bucket = "small"
    else:
        bucket = "large"
    return bucket

def build_fare_group_function(series_fare: pd.Series):
    """Build a Fare -> bucket classifier from the quantiles of a fare column.

    The two cut points are the 0.33 and 0.66 quantiles of ``series_fare``
    (approximately tertiles). They are computed once, when this factory is
    called, and captured by the returned function.

    Args:
        series_fare: pd.Series with Fare values.

    Returns:
        Callable fare -> "low" / "mid" / "high":
        "low" for fare <= lower cut, "mid" for fare <= upper cut (and for a
        missing fare), "high" otherwise.
    """
    lower_cut, upper_cut = series_fare.quantile([0.33, 0.66]).tolist()

    def _fare_to_group(fare_value: Optional[float]) -> str:
        # A missing fare (None / NaN) is placed in the middle bucket.
        if pd.isna(fare_value):
            return "mid"
        if fare_value <= lower_cut:
            return "low"
        return "mid" if fare_value <= upper_cut else "high"

    return _fare_to_group

# Deltas

def series_to_deltas(series_mean: pd.Series, base: float) -> Dict[Any, float]:
    """Express per-group means as offsets from a common base value.

    Args:
        series_mean: Series where index = group key, value = mean Survived.
        base: Global base survival rate.

    Returns:
        Dict keyed by group, each value being ``group_mean - base`` as a
        plain Python float.
    """
    return {
        key: float(mean_value - base)
        for key, mean_value in series_mean.items()
    }

# Per-feature survival deltas.
# For every feature: take the mean of Survived inside each group, convert it to
# a delta against BASE_SURVIVAL_RATE, and print the deltas rounded to 3 decimals.

def _rounded(deltas: Dict[Any, float]) -> Dict[Any, float]:
    """Return a copy of ``deltas`` with every value rounded to 3 decimals."""
    return {key: round(value, 3) for key, value in deltas.items()}


# Sex
mean_survival_by_sex = df.groupby("Sex")["Survived"].mean()
delta_by_sex = series_to_deltas(mean_survival_by_sex, BASE_SURVIVAL_RATE)
print("delta_by_sex:\n", _rounded(delta_by_sex))

# Pclass
mean_survival_by_pclass = df.groupby("Pclass")["Survived"].mean()
delta_by_pclass = series_to_deltas(mean_survival_by_pclass, BASE_SURVIVAL_RATE)
print("delta_by_pclass:\n", _rounded(delta_by_pclass))

# Embarked (NaN -> "S", the most frequent value)
df_emb = df.assign(Embarked=df["Embarked"].fillna("S"))
mean_survival_by_embarked = df_emb.groupby("Embarked")["Survived"].mean()
delta_by_embarked = series_to_deltas(mean_survival_by_embarked, BASE_SURVIVAL_RATE)
print("delta_by_embarked:\n", _rounded(delta_by_embarked))

# Age groups
df_age = df.assign(AgeGroup=df["Age"].apply(map_age_to_group))
mean_survival_by_agegroup = df_age.groupby("AgeGroup")["Survived"].mean()
delta_by_agegroup = series_to_deltas(mean_survival_by_agegroup, BASE_SURVIVAL_RATE)
print("delta_by_agegroup:\n", _rounded(delta_by_agegroup))

# Family buckets (FamilySize = SibSp + Parch + 1)
df_fam = df.assign(FamilySize=df["SibSp"] + df["Parch"] + 1)
df_fam = df_fam.assign(FamBucket=df_fam["FamilySize"].apply(map_family_size_to_group))
mean_survival_by_fambucket = df_fam.groupby("FamBucket")["Survived"].mean()
delta_by_fambucket = series_to_deltas(mean_survival_by_fambucket, BASE_SURVIVAL_RATE)
print("delta_by_fambucket:\n", _rounded(delta_by_fambucket))

# Fare buckets (cut points come from the Fare column of the full dataset)
fare_bucket = build_fare_group_function(df["Fare"])
df_fare = df.assign(FareBucket=df["Fare"].apply(fare_bucket))
mean_survival_by_farebucket = df_fare.groupby("FareBucket")["Survived"].mean()
delta_by_farebucket = series_to_deltas(mean_survival_by_farebucket, BASE_SURVIVAL_RATE)
print("delta_by_farebucket:\n", _rounded(delta_by_farebucket))

BASE_SURVIVAL_RATE: 0.384
delta_by_sex:
 {'female': 0.358, 'male': -0.195}
delta_by_pclass:
 {1: 0.246, 2: 0.089, 3: -0.141}
delta_by_embarked:
 {'C': 0.17, 'Q': 0.006, 'S': -0.045}
delta_by_agegroup:
 {'adult': -0.017, 'child': 0.207, 'elder': -0.157}
delta_by_fambucket:
 {'alone': -0.08, 'large': -0.223, 'small': 0.195}
delta_by_farebucket:
 {'high': 0.175, 'low': -0.18, 'mid': 0.004}


In [37]:
# Weights: hand-picked multipliers applied to each feature's delta in
# predict_survival. The six values add up to 1.0.
WEIGHTS: Dict[str, float] = dict(
    sex=0.35,
    pclass=0.30,
    age=0.12,
    family=0.12,
    embarked=0.07,
    fare=0.04,
)

def predict_survival(passenger: Dict[str, Any]) -> float:
    """Estimate survival probability using EDA-based deltas and manual weights.

    This is NOT machine learning. We simply start from the global base rate and
    add weighted deltas for groups (sex, pclass, age bucket, family bucket,
    embarked, fare bucket). All group means and deltas are computed from the
    current dataset.

    Missing values (key absent, None, or NaN) are handled per field:
        - age: passed on as NaN to map_age_to_group, which is documented to
          fall back to "adult";
        - sibsp / parch: treated as 0;
        - embarked: treated as "S", the same fill value used when the
          embarked deltas were computed;
        - fare: passed on to fare_bucket unchanged;
        - sex / pclass: a value with no known delta contributes nothing.

    Args:
        passenger: Dict with keys:
            - "sex": str ("male" | "female")
            - "pclass": int (1 | 2 | 3)
            - "age": float or None
            - "sibsp": int
            - "parch": int
            - "embarked": str ("C" | "Q" | "S") or None
            - "fare": float or None

    Returns:
        Survival probability in [0.0, 1.0], rounded to 3 decimals.
    """
    def _is_missing(value: Any) -> bool:
        # None and float NaN both mean "value not provided".
        return value is None or (isinstance(value, float) and bool(np.isnan(value)))

    probability = BASE_SURVIVAL_RATE

    # Sex
    sex_value = str(passenger.get("sex", "")).lower()
    if sex_value in delta_by_sex:
        probability += WEIGHTS["sex"] * delta_by_sex[sex_value]

    # Pclass
    pclass_value = passenger.get("pclass", None)
    if pclass_value in delta_by_pclass:
        probability += WEIGHTS["pclass"] * delta_by_pclass[pclass_value]

    # Age: normalise None to NaN so the numeric comparisons inside
    # map_age_to_group cannot raise TypeError on a missing age.
    age_value = passenger.get("age", None)
    if _is_missing(age_value):
        age_value = float("nan")
    age_group = map_age_to_group(age_value)
    if age_group in delta_by_agegroup:
        probability += WEIGHTS["age"] * delta_by_agegroup[age_group]

    # Family: a missing count is treated as 0 instead of crashing in int().
    sibsp_raw = passenger.get("sibsp", 0)
    parch_raw = passenger.get("parch", 0)
    sibsp_value = 0 if _is_missing(sibsp_raw) else int(sibsp_raw)
    parch_value = 0 if _is_missing(parch_raw) else int(parch_raw)
    family_size = sibsp_value + parch_value + 1
    family_bucket = map_family_size_to_group(family_size)
    if family_bucket in delta_by_fambucket:
        probability += WEIGHTS["family"] * delta_by_fambucket[family_bucket]

    # Embarked: an explicit None/NaN gets the same "S" default as an absent
    # key; otherwise str(None) would become "NONE" and be silently skipped.
    embarked_raw = passenger.get("embarked", "S")
    if _is_missing(embarked_raw):
        embarked_raw = "S"
    embarked_value = str(embarked_raw).upper()
    if embarked_value in delta_by_embarked:
        probability += WEIGHTS["embarked"] * delta_by_embarked[embarked_value]

    # Fare
    fare_value = passenger.get("fare", None)
    fare_bucket_value = fare_bucket(fare_value)
    if fare_bucket_value in delta_by_farebucket:
        probability += WEIGHTS["fare"] * delta_by_farebucket[fare_bucket_value]

    # Clip to a valid probability and round for presentation.
    probability = float(np.clip(probability, 0.0, 1.0))
    return round(probability, 3)

    

In [38]:
# Two hand-written passengers used to exercise predict_survival.
examples = [
    # Female, 1st class, age 8, sibsp=1 and parch=1 (family size 3), embarked "C".
    {"sex": "female", "pclass": 1, "age": 8,  "sibsp": 1, "parch": 1, "embarked": "C", "fare": 120.0},
    # Male, 3rd class, age 28, sibsp=0 and parch=0 (family size 1), embarked "S".
    {"sex": "male",   "pclass": 3, "age": 28, "sibsp": 0, "parch": 0, "embarked": "S", "fare": 7.25},
]

# Print the estimated survival probability for each example, one per line.
for sample in examples:
    print(predict_survival(sample))

0.65
0.251
