In [4]:
import pandas as pd
import numpy as np


In [6]:
# Load the cleaned survey export, report its dimensions, and preview the first rows.
csv_path = "../data/processed/survey_cleaned.csv"
df = pd.read_csv(csv_path)
print(f"Shape for Data Frame {df.shape}")
df.head()

Shape for Data Frame (29956, 20)


Unnamed: 0,respondent_id,gender,zone,occupation,income_levels,consume_frequency(weekly),current_brand,preferable_consumption_size,awareness_of_other_brands,reasons_for_choosing_brands,flavor_preference,purchase_channel,packaging_preference,health_concerns,typical_consumption_situations,price_range,age_group,cf_ab_score,zas_score,bsi
0,R00001,M,Urban,Working Professional,<10L,3-4 times,Newcomer,Medium (500 ml),0 to 1,Price,Traditional,Online,Simple,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",100-150,26-35,0.67,3,1
1,R00002,F,Metro,Working Professional,> 35L,5-7 times,Established,Medium (500 ml),2 to 4,Quality,Exotic,Retail Store,Premium,Medium (Moderately health-conscious),Social (eg. Parties),200-250,46-55,0.6,20,0
2,R00003,F,Rural,Working Professional,> 35L,3-4 times,Newcomer,Medium (500 ml),2 to 4,Availability,Traditional,Retail Store,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",200-250,36-45,0.5,5,0
3,R00004,F,Urban,Working Professional,16L - 25L,5-7 times,Newcomer,Medium (500 ml),0 to 1,Brand Reputation,Exotic,Online,Eco-Friendly,Low (Not very concerned),"Active (eg. Sports, gym)",150-200,26-35,0.75,9,0
4,R00005,M,Metro,Student,Not Reported,3-4 times,Established,Medium (500 ml),0 to 1,Availability,Traditional,Online,Premium,Medium (Moderately health-conscious),"Active (eg. Sports, gym)",50-100,18-25,0.67,0,0


In [4]:
# Summary statistics; only the numeric columns (cf_ab_score, zas_score, bsi) are reported.
numeric_summary = df.describe()
numeric_summary

Unnamed: 0,cf_ab_score,zas_score,bsi
count,29956.0,29956.0,29956.0
mean,0.53736,6.103652,0.305782
std,0.141876,5.517243,0.460745
min,0.25,0.0,0.0
25%,0.5,0.0,0.0
50%,0.5,6.0,0.0
75%,0.67,9.0,1.0
max,0.75,20.0,1.0


In [5]:
# Print each column's name, non-null count and dtype to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29956 entries, 0 to 29955
Data columns (total 20 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   respondent_id                   29956 non-null  object 
 1   gender                          29956 non-null  object 
 2   zone                            29956 non-null  object 
 3   occupation                      29956 non-null  object 
 4   income_levels                   29956 non-null  object 
 5   consume_frequency(weekly)       29956 non-null  object 
 6   current_brand                   29956 non-null  object 
 7   preferable_consumption_size     29956 non-null  object 
 8   awareness_of_other_brands       29956 non-null  object 
 9   reasons_for_choosing_brands     29956 non-null  object 
 10  flavor_preference               29956 non-null  object 
 11  purchase_channel                29956 non-null  object 
 12  packaging_preference            

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# -------------------------
# X / y
# -------------------------
# Target is the price band; respondent_id is removed from the feature frame.
drop_cols = ['respondent_id', 'price_range']
X = df.drop(columns=drop_cols)
y = df['price_range']

# -------------------------
# Column groups
# -------------------------
# Ordinal features encoded with a hand-written category order (see below).
label_cols_explicit = [
    'age_group',
    'consume_frequency(weekly)'
]

# Ordinal features whose category order is inferred by the encoder.
# NOTE(review): categories='auto' orders labels by sorted string value, so the
# integer codes follow alphabetical order rather than the real ranking
# (e.g. "<10L" vs "16L - 25L"). Confirm this is acceptable or supply explicit
# orders as done for age_group / consume_frequency(weekly).
label_cols_auto = [
    'income_levels',
    'health_concerns',
    'preferable_consumption_size'
]

# all remaining categorical features to one-hot
all_cats = X.select_dtypes(include=['object', 'category']).columns.tolist()
nominal_cols = [c for c in all_cats if c not in (label_cols_explicit + label_cols_auto)]

numeric_cols = ['cf_ab_score', 'zas_score', 'bsi']  # forwarded unchanged by the "num" transformer

# Every feature must belong to exactly one group. Failing here is preferable to
# silently feeding an unexpected column (e.g. a stray index column) to the model.
assigned_cols = label_cols_explicit + label_cols_auto + nominal_cols + numeric_cols
unassigned_cols = [c for c in X.columns if c not in assigned_cols]
assert not unassigned_cols, f"Columns not assigned to any encoder group: {unassigned_cols}"

# -------------------------
# Ordinal encoders
# -------------------------
age_group_order = ["18-25", "26-35", "36-45", "46-55", "56-70", "70+"]
consume_freq_order = ["0-2 times", "3-4 times", "5-7 times"]

# Categories unseen at fit time are encoded as -1 instead of raising.
ord_explicit = OrdinalEncoder(
    categories=[age_group_order, consume_freq_order],
    handle_unknown='use_encoded_value',
    unknown_value=-1
)

ord_auto = OrdinalEncoder(
    categories='auto',
    handle_unknown='use_encoded_value',
    unknown_value=-1
)

# -------------------------
# Preprocessor
# -------------------------
# Output column order: explicit ordinals, auto ordinals, one-hot block, numeric columns.
# Numeric columns are listed explicitly and anything else is dropped, so only the
# columns named above can reach the model.
preprocessor = ColumnTransformer(
    transformers=[
        ("ord_explicit", ord_explicit, label_cols_explicit),
        ("ord_auto", ord_auto, label_cols_auto),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False), nominal_cols),
        ("num", "passthrough", numeric_cols),
    ],
    remainder='drop'
)

# -------------------------
# Model pipeline
# -------------------------
clf = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1))
])

# -------------------------
# Train / evaluate
# -------------------------
# Stratified 75/25 split keeps the price-band proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

clf.fit(X_train, y_train)
pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))


Accuracy: 0.8943784216851381
              precision    recall  f1-score   support

     100-150       0.89      0.87      0.88      1948
     150-200       0.84      0.89      0.86      2199
     200-250       0.94      0.92      0.93      2428
      50-100       0.92      0.89      0.90       914

    accuracy                           0.89      7489
   macro avg       0.90      0.89      0.89      7489
weighted avg       0.90      0.89      0.89      7489



In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Map the string price bands ("50-100", "100-150", ...) to integer class ids 0..K-1
# so that every model receives the same numeric target.
le_y = LabelEncoder()
le_y.fit(y)
y_enc = le_y.transform(y)

# Redo the stratified 75/25 split, this time against the encoded target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    stratify=y_enc,
    test_size=0.25,
    random_state=42,
)



In [17]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.base import clone

# Candidate classifiers; each is trained on the same preprocessed features.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, n_jobs=-1),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "SVM": SVC(kernel="rbf", random_state=42),
    "GaussianNB": GaussianNB(),
    "XGBoost": XGBClassifier(
        n_estimators=200, learning_rate=0.1, max_depth=6,
        subsample=0.8, colsample_bytree=0.8,
        random_state=42, n_jobs=-1, eval_metric="mlogloss"
    )
}

# The held-out labels are the same for every model, so decode them once.
y_test_labels = le_y.inverse_transform(y_test)

fitted_pipes = {}  # model name -> fitted Pipeline, kept for later inspection
accuracies = {}    # model name -> test-set accuracy

for name, model in models.items():
    # Pipeline.fit fits its steps in place, so each pipeline gets its own
    # unfitted copy of the preprocessor instead of sharing (and refitting) the
    # single `preprocessor` instance that other pipelines also reference.
    pipe = Pipeline([("prep", clone(preprocessor)), ("model", model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    fitted_pipes[name] = pipe
    accuracies[name] = accuracy_score(y_test, y_pred)

    # Report per-class metrics against the original string labels.
    print(f"\n=== {name} ===")
    print("Accuracy:", accuracies[name])
    print(classification_report(
        y_test_labels,
        le_y.inverse_transform(y_pred),
        target_names=le_y.classes_
    ))



=== LogisticRegression ===
Accuracy: 0.8061156362665243
              precision    recall  f1-score   support

     100-150       0.75      0.77      0.76      1948
     150-200       0.75      0.77      0.76      2199
     200-250       0.90      0.89      0.90      2428
      50-100       0.80      0.75      0.78       914

    accuracy                           0.81      7489
   macro avg       0.80      0.80      0.80      7489
weighted avg       0.81      0.81      0.81      7489


=== RandomForest ===
Accuracy: 0.8943784216851381
              precision    recall  f1-score   support

     100-150       0.89      0.87      0.88      1948
     150-200       0.84      0.89      0.86      2199
     200-250       0.94      0.92      0.93      2428
      50-100       0.92      0.89      0.90       914

    accuracy                           0.89      7489
   macro avg       0.90      0.89      0.89      7489
weighted avg       0.90      0.89      0.89      7489


=== GradientBoosting 