In [None]:
import pandas as pd
import numpy as np


In [None]:
# Load the cleaned survey export; the path is relative to the notebook's
# working directory.
survey_csv = "../data/processed/survey_cleaned.csv"
df = pd.read_csv(filepath_or_buffer=survey_csv)

# Report (rows, columns), then preview the first five rows as the cell output.
print("Shape for Data Frame {}".format(df.shape))
df.head(n=5)

In [None]:
# Summary statistics for the loaded frame. With no arguments pandas picks the
# columns itself (numeric ones only when the frame has any), so categorical
# survey answers may be absent from this table.
df.describe()

In [None]:
# Print the column names, dtypes, non-null counts and memory usage of the frame.
df.info()

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# -------------------------
# Features (X) and target (y)
# -------------------------
# The target column is removed from the feature matrix together with
# 'respondent_id' (presumably a row identifier with no predictive value -
# TODO confirm).
target_col = 'price_range'
drop_cols = ['respondent_id', target_col]

y = df[target_col]
X = df.drop(drop_cols, axis=1)

# -------------------------
# Column groups
# -------------------------
# Ordinal features that get a hand-written category order.
label_cols_explicit = ['age_group', 'consume_frequency(weekly)']

# Ordinal-encoded features whose category order is left to the encoder.
label_cols_auto = ['income_levels', 'health_concerns', 'preferable_consumption_size']

# Every remaining object/category column is treated as nominal (one-hot).
# Column order from X is preserved.
ordinal_cols = set(label_cols_explicit) | set(label_cols_auto)
all_cats = list(X.select_dtypes(include=['object', 'category']).columns)
nominal_cols = [col for col in all_cats if col not in ordinal_cols]

# Not handed to any transformer: these columns are expected to reach the model
# through the ColumnTransformer's remainder='passthrough'.
numeric_cols = ['cf_ab_score', 'zas_score', 'bsi']

# -------------------------
# Ordinal encoders
# -------------------------
# Hand-written orderings: the position in each list becomes the encoded value.
age_group_order = ["18-25", "26-35", "36-45", "46-55", "56-70", "70+"]
consume_freq_order = ["0-2 times", "3-4 times", "5-7 times"]

# Shared policy: a category the encoder does not know is encoded as -1
# instead of raising.
unknown_policy = {"handle_unknown": 'use_encoded_value', "unknown_value": -1}

# The category lists must be given in the same order as the columns this
# encoder is applied to (age group first, then weekly consumption frequency).
ord_explicit = OrdinalEncoder(
    categories=[age_group_order, consume_freq_order],
    **unknown_policy,
)

# NOTE(review): with categories='auto' the integer codes follow the sorted
# order of the values seen during fit, which may not match the intended
# low-to-high ranking of these columns - verify, or supply explicit orders.
ord_auto = OrdinalEncoder(categories='auto', **unknown_policy)

# -------------------------
# Preprocessor
# -------------------------
# (name, transformer, columns) triples. Any column not listed here - the
# numeric scores in particular - is forwarded unchanged by the remainder rule.
column_steps = [
    ("ord_explicit", ord_explicit, label_cols_explicit),
    ("ord_auto", ord_auto, label_cols_auto),
    # Categories unseen at fit time become an all-zero row rather than an error.
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False), nominal_cols),
]

preprocessor = ColumnTransformer(column_steps, remainder='passthrough')

# -------------------------
# Model pipeline
# -------------------------
# Baseline: the shared preprocessing step followed by a seeded random forest
# that trains on all available cores.
forest = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
clf = Pipeline([("prep", preprocessor), ("model", forest)])

# -------------------------
# Train / evaluate
# -------------------------
# 75/25 split, stratified on the target so both parts keep the same class mix;
# the fixed seed makes the split repeatable.
split = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split

# Pipeline.fit returns the pipeline itself, so fitting and predicting chain.
pred = clf.fit(X_train, y_train).predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode the target (price_range labels, presumably strings such as "50-100",
# "100-150", ...) as integers 0..K-1 so every model sees the same label coding.
# The fitted encoder is kept to map predictions back to the original labels.
le_y = LabelEncoder().fit(y)
y_enc = le_y.transform(y)

# Rebinds X_train / X_test / y_train / y_test: same 75/25 seeded split,
# now stratified on (and returning) the encoded target.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X, y_enc, stratify=y_enc, test_size=0.25, random_state=42
)



In [None]:
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Candidate classifiers, keyed by the name printed in the comparison report.
# Insertion order is the order in which they are trained.
models = {}
models["LogisticRegression"] = LogisticRegression(max_iter=1000, n_jobs=-1)
models["RandomForest"] = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
models["GradientBoosting"] = GradientBoostingClassifier(random_state=42)
models["SVM"] = SVC(kernel="rbf", random_state=42)
models["GaussianNB"] = GaussianNB()

# XGBoost hyper-parameters are collected in one place for readability.
xgb_params = {
    "n_estimators": 200,
    "learning_rate": 0.1,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "n_jobs": -1,
    "eval_metric": "mlogloss",
}
models["XGBoost"] = XGBClassifier(**xgb_params)

# Fit every candidate on the encoded train split and report test metrics.
#
# The integer codes 0..K-1 paired with le_y.classes_ are passed explicitly to
# classification_report, so the report always has one row per known class.
# Without `labels`, a class missing from both y_test and y_pred makes the
# report raise because target_names no longer matches the observed classes.
encoded_labels = le_y.transform(le_y.classes_)

for name, model in models.items():
    # Pipeline does not copy its steps. Passing `preprocessor` itself would
    # make every pipeline here (and the earlier `clf`) share and refit one
    # ColumnTransformer object, so each pipeline gets its own unfitted clone.
    pipe = Pipeline([("prep", clone(preprocessor)), ("model", model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    # Rows are labelled with the original price-range strings via target_names.
    # zero_division=0 reports 0 (without a warning) for classes a model never
    # predicts.
    print(classification_report(
        y_test,
        y_pred,
        labels=encoded_labels,
        target_names=le_y.classes_,
        zero_division=0,
    ))
