In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from tensorflow import keras
from tensorflow.keras import layers
from keras import Sequential, Input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.metrics import Precision, Recall
from sklearn.preprocessing import LabelEncoder
tf.keras.backend.clear_session()
import shap
import lime
import lime.lime_tabular

In [None]:
# Shell escape: list what is available under the Kaggle input directory.
!ls /kaggle/input


In [None]:
# Load the three raw tables (hotels, reviews, users) from the Kaggle input directory.
df_hotels = pd.read_csv('/kaggle/input/hotels.csv')
df_reviews = pd.read_csv('/kaggle/input/reviews.csv')
df_users = pd.read_csv('/kaggle/input/users.csv')


In [None]:
# Preview the first rows of the hotels table.
df_hotels.head()

In [None]:
# Remove hotel columns that the rest of the notebook does not use.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the columns have already been dropped.
df_hotels = df_hotels.drop(columns=['star_rating', 'lat', 'lon'], errors='ignore')

In [None]:
# Column dtypes and non-null counts for the hotels table.
df_hotels.info()

In [None]:
# Missing-value matrix for hotels: white gaps mark null values.
msno.matrix(df_hotels) # if there is white -> null values

In [None]:
df_hotels.duplicated().sum() # number of fully duplicated rows (not a per-column count)


In [None]:
# Preview the first rows of the reviews table.
df_reviews.head()

In [None]:
# Column dtypes and non-null counts for the reviews table.
df_reviews.info()

In [None]:
# Missing-value matrix for reviews: white gaps mark null values.
msno.matrix(df_reviews) #if there is white -> null values

In [None]:
df_reviews.duplicated().sum() # number of fully duplicated rows (not a per-column count)


In [None]:
# Preview the first ten rows of the users table.
df_users.head(10)

In [None]:
# Column dtypes and non-null counts for the users table.
df_users.info()

In [None]:
# Missing-value matrix for users: white gaps mark null values.
msno.matrix(df_users) #if there is white -> null values

In [None]:
df_users.duplicated().sum() # number of fully duplicated rows (not a per-column count)


#  Data Engineering Questions

**1. Merge all dfs**

In [None]:
# Show every column of wide frames. Set before anything is displayed so the
# option already applies to the output of this cell.
pd.set_option('display.max_columns', None)

# Attach user and hotel attributes to every review. Left joins keep all
# reviews even when a user or hotel record is missing.
df_merged = (
    df_reviews
    .merge(df_users, on='user_id', how='left')
    .merge(df_hotels, on='hotel_id', how='left')
)

df_merged.info()

# `country_x` / `country_y` are pandas' default suffixes for the clashing
# `country` column; presumably _x is the user's country and _y the hotel's
# country -- TODO confirm against the raw tables.
# Kept as the last expression so the preview is actually rendered.
df_merged[['country_x', 'country_y', 'hotel_name', 'user_id']].head()


**2. Check correlations between features**

In [None]:
# Pairwise correlations between the numeric columns of the merged table.
corr = df_merged.corr(numeric_only=True)

# Annotated heatmap drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap of All Numeric Features")
plt.show()

**3. In this step, we analyze which city received the highest average overall score for each traveller type.**

In [None]:
# Mean overall score for every (traveller type, city) pair, returned as a
# flat table (group keys stay ordinary columns).
city_scores = df_merged.groupby(
    ['traveller_type', 'city'], as_index=False
)['score_overall'].mean()

# Row label of the highest-scoring city inside each traveller type ...
top_row_labels = city_scores.groupby('traveller_type')['score_overall'].idxmax()

# ... used to pull exactly those rows out of the score table.
best_city_per_type = city_scores.loc[top_row_labels]

print(best_city_per_type)

**4. This block creates bar charts to visualize how each traveller type rated different cities, highlighting the top-rated city for each group.**

In [None]:
# One bar chart per traveller type; the best-rated city is drawn in orange.
traveller_types = sorted(city_scores['traveller_type'].dropna().unique())

for ttype in traveller_types:
    df_t = city_scores.loc[city_scores['traveller_type'] == ttype]

    # Row label of this traveller type's top city, used to pick its colour.
    max_idx = df_t['score_overall'].idxmax()
    bar_colors = [
        'orange' if row_label == max_idx else 'skyblue'
        for row_label in df_t.index
    ]

    fig, ax = plt.subplots(figsize=(12, 6))
    bars = ax.bar(df_t['city'], df_t['score_overall'], color=bar_colors)

    # Print each score vertically, slightly above its bar.
    for bar in bars:
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width() / 2
        ax.text(
            x_center,
            height + 0.15,
            f"{height:.2f}",
            ha='center', va='bottom',
            rotation=90, fontsize=9
        )

    ax.set_title(f'Average Overall Score per City — {ttype}', fontsize=14)
    ax.set_xlabel('City')
    ax.set_ylabel('Average Overall Score')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    # Headroom above the tallest bar so the rotated labels are not clipped.
    ax.set_ylim(0, df_t['score_overall'].max() + 1)

    plt.tight_layout()
    plt.show()


**5. This section identifies which countries each age group rated highest in terms of value for money.**

In [None]:
# Mean value-for-money score for every (age group, country_y) pair.
value_for_money_scores = df_merged.groupby(
    ['age_group', 'country_y'], as_index=False
)['score_value_for_money'].mean()

# Order each age group's countries from best to worst score ...
ranked_value_scores = value_for_money_scores.sort_values(
    by=['age_group', 'score_value_for_money'],
    ascending=[True, False],
)

# ... and keep the first three rows of every age group.
top3_value_per_age = ranked_value_scores.groupby('age_group').head(3)

top3_value_per_age


**6. This section visualizes the top three countries that received the highest “value-for-money” scores for each age group.**

In [None]:
# --- Step 1: order the top-3 table and number the rows 1..3 inside each age group ---
df = top3_value_per_age.sort_values(
    by=['age_group', 'score_value_for_money'],
    ascending=[True, False],
).copy()
df['rank'] = df.groupby('age_group').cumcount() + 1

# Sorted list of the age groups that actually occur (NaN excluded).
age_groups = sorted(df['age_group'].dropna().unique())

# --- Step 2: helper that gathers bar heights and country names for one rank ---
def values_for_rank(r):
    """Collect score and country for rank `r`, one entry per age group.

    Reads the notebook-level `df` and `age_groups`. Age groups without a row
    at this rank contribute NaN as score and "" as country name.

    Returns a pair of NumPy arrays (scores, country names) ordered like
    `age_groups`.
    """
    rows_at_rank = df[df['rank'] == r]
    scores, countries = [], []
    for age in age_groups:
        match = rows_at_rank[rows_at_rank['age_group'] == age]
        if match.empty:
            scores.append(np.nan)
            countries.append("")
            continue
        scores.append(float(match['score_value_for_money'].iloc[0]))
        countries.append(str(match['country_y'].iloc[0]))
    return np.array(scores), np.array(countries)

# Scores (y*) and country names (c*) for ranks 1, 2 and 3.
(y1, c1), (y2, c2), (y3, c3) = (values_for_rank(rank) for rank in (1, 2, 3))

# --- Step 3: bar positions and layout ---
# One cluster per age group; the three ranks sit side by side inside it.
n_groups = len(age_groups)
x = np.arange(n_groups)
bar_w = 0.22
offsets = [-bar_w, 0, bar_w]

plt.figure(figsize=(14, 6))

# --- Step 4: draw bars ---
b1 = plt.bar(x + offsets[0], y1, width=bar_w, color='gold', label='Rank 1 (best)')
b2 = plt.bar(x + offsets[1], y2, width=bar_w, color='silver', label='Rank 2')
b3 = plt.bar(x + offsets[2], y3, width=bar_w, color='brown', label='Rank 3')

# --- Step 5: add country + score labels above bars ---
def label_bars(bars, names):
    """Annotate each bar with its country name and its height (two decimals).

    `bars` and `names` are walked in parallel; bars whose height is NaN are
    left without a label. Text is drawn on the current pyplot axes.
    """
    for patch, country in zip(bars, names):
        bar_height = patch.get_height()
        if not np.isnan(bar_height):
            x_center = patch.get_x() + patch.get_width() / 2
            plt.text(
                x_center,
                bar_height + 0.001,   # tiny gap between bar top and label
                f"{country}\n{bar_height:.2f}",
                ha='center', va='bottom',
                rotation=45, fontsize=9
            )

# Label every rank's bars with the matching country names.
for bar_container, country_names in ((b1, c1), (b2, c2), (b3, c3)):
    label_bars(bar_container, country_names)

# --- Step 6: labels, legend, and axis styling ---
plt.xticks(x, age_groups, rotation=0)
plt.xlabel('Age Group')
plt.ylabel('Average Value-for-Money Score')
plt.title('Top 3 Countries by Value-for-Money per Age Group')

# Legend placed just outside the right edge of the axes so it never covers bars.
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1.0), borderaxespad=0, frameon=False)

# --- Step 7: extra headroom for labels ---
# A rank's max is NaN when some age group lacks that rank; nanmax skips those.
rank_maxima = [y1.max(), y2.max(), y3.max()]
top_y = np.nanmax(rank_maxima)
plt.ylim(0, top_y + 1.5)

plt.tight_layout()
plt.show()


# Pre-processing

**This block prepares data to predict the hotel’s region (country_group) using review scores + user attributes.**

In [None]:
# --- Step 1: Map each hotel country to a coarse region label (the prediction target) ---
country_to_group = {
    'United States': 'North_America',
    'Canada': 'North_America',
    'Germany': 'Western_Europe',
    'France': 'Western_Europe',
    'United Kingdom': 'Western_Europe',
    'Netherlands': 'Western_Europe',
    'Spain': 'Western_Europe',
    'Italy': 'Western_Europe',
    'Russia': 'Eastern_Europe',
    'China': 'East_Asia',
    'Japan': 'East_Asia',
    'South Korea': 'East_Asia',
    'Thailand': 'Southeast_Asia',
    'Singapore': 'Southeast_Asia',
    'United Arab Emirates': 'Middle_East',
    'Turkey': 'Middle_East',
    'Egypt': 'Africa',
    'Nigeria': 'Africa',
    'South Africa': 'Africa',
    'Australia': 'Oceania',
    'New Zealand': 'Oceania',
    'Brazil': 'South_America',
    'Argentina': 'South_America',
    'India': 'South_Asia',
    'Mexico': 'North_America_Mexico'
}

# --- Step 2: Add `country_group` to hotels data ---
# Countries that are missing from the mapping become NaN here.
df_hotels['country_group'] = df_hotels['country'].map(country_to_group)

# --- Step 3: Merge all dataframes ---
df_merged_new = (
    df_reviews
      .merge(df_users,  on='user_id',  how='left', suffixes=('', '_user'))
      .merge(df_hotels, on='hotel_id', how='left', suffixes=('', '_hotel'))
)

# --- Step 4: Drop rows with a missing target BEFORE deriving y ---
# Taking y first would leave NaN labels in it and make it longer than the
# feature matrix whenever an unmapped country is present.
df_merged_new = df_merged_new.dropna(subset=['country_group'])

# --- Step 5: Define and encode the target ---
y = df_merged_new['country_group']
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# --- Step 6: Define features (exclude country, country_group, hotel_id, user_id) ---
categorical_cols = ['user_gender', 'age_group', 'traveller_type']
numeric_cols = [
    'score_overall', 'score_cleanliness', 'score_comfort',
    'score_facilities', 'score_location', 'score_staff', 'score_value_for_money'
]
# Model 2 additionally sees the hotel-level `comfort_base` column.
numeric_cols_with_base = ['comfort_base'] + numeric_cols

# NOTE(review): the encoder and scalers below are fitted on the full dataset
# before the train/test split, so test rows influence the preprocessing
# statistics. Fit them on the training split only for an unbiased estimate.

# --- Step 7: One-hot encode categorical features (shared by all models) ---
ohe_model = OneHotEncoder(
    drop='first',
    sparse_output=False,
    handle_unknown='ignore'
)
X_ohe = ohe_model.fit_transform(df_merged_new[categorical_cols])
ohe_feature_names = ohe_model.get_feature_names_out(categorical_cols)

# --- Step 8: Scale numeric features and combine (model / model1) ---
# `scaler` is fitted on the 7 score columns only; it is the one reused later
# for single-profile predictions.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_merged_new[numeric_cols])
X_all = np.hstack([X_scaled, X_ohe])

# --- Step 9: Features for model2 (WITH COMFORT_BASE) ---
# A dedicated scaler: sharing one scaler object between the 7- and 8-column
# matrices made the result depend on which fit happened last.
scaler_with_base = StandardScaler()
X_scaled_with_base = scaler_with_base.fit_transform(df_merged_new[numeric_cols_with_base])
X_all_with_base = np.hstack([X_scaled_with_base, X_ohe])  # reuse same OHE

# Features and labels must describe the same rows.
assert len(X_all) == len(X_all_with_base) == len(y_encoded), \
    "feature matrices and labels are misaligned"

# --- Step 10: Split the model2 data (separate names so nothing is overwritten) ---
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_all_with_base, y_encoded,
    test_size=0.2, random_state=42, stratify=y_encoded
)

y_train2 = y_train2.astype(np.int32).flatten()
y_test2 = y_test2.astype(np.int32).flatten()

# Mapping of original label -> encoded integer
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

print(y_encoded)

print("Label Mapping:")
for k, v in label_mapping.items():
    print(f"{k} → {v}")

y.head()

In [None]:
print(X_ohe)
print(ohe_feature_names)

In [None]:
df_merged_new['country_group'].value_counts()

In [None]:
X_all

In [None]:
feature_names = numeric_cols + list(ohe_feature_names)
X_df = pd.DataFrame(X_all, columns=feature_names)

X_df.head()

In [None]:
# y_ohe = OneHotEncoder(drop='first', sparse_output=False)  # drop='first to avoid dummy variable trap
# y_ohe.fit(df_merged_new['country_group'].values.reshape(-1, 1))
# y = y_ohe.transform(df_merged_new['country_group'].values.reshape(-1, 1))
# y



**This block splits the data into train/test sets and builds (but does not yet train) three neural-network classifiers to predict the hotel’s region class.**

In [None]:
# Hold out 20% of the rows, stratified by class, with the same seed as the
# model2 split.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_encoded,
    test_size=0.2, random_state=42, stratify=y_encoded
)

y_train = y_train.astype(np.int32).flatten()
y_test = y_test.astype(np.int32).flatten()

# Report shapes instead of dumping whole arrays.
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_test: ", X_test.shape, "y_test: ", y_test.shape)

# Output width follows the label encoder instead of a hard-coded 11, so the
# network stays valid if the country -> group mapping changes.
n_classes = len(le.classes_)


def build_classifier(n_features, n_classes):
    """Build and compile the 32-16-8 ReLU network with a softmax output.

    n_features: width of the input vector.
    n_classes:  number of target classes (size of the softmax layer).

    Returns a compiled Keras model that expects integer class labels
    (sparse categorical cross-entropy) and reports accuracy.
    """
    net = Sequential([
        Input(shape=(n_features,)),
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dense(8, activation='relu'),
        layers.Dense(n_classes, activation='softmax')  # multi-class classification
    ])
    net.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')]
    )
    return net


# model for training with class weights
model = build_classifier(X_train.shape[1], n_classes)
model.summary()

# model for training without class weights
model1 = build_classifier(X_train.shape[1], n_classes)
model1.summary()

# model2: same architecture, input additionally contains comfort_base
model2 = build_classifier(X_train2.shape[1], n_classes)
model2.summary()


# Model Training and Analysis

**This step trains all three models and compares the effect of class balancing and feature engineering.**

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# No clear_session() here: the three models already exist at this point, and
# resetting Keras' global state between building and fitting them serves no
# purpose.

# Balanced class weights derived from the TRAINING labels only, so the
# held-out labels do not influence training.
train_classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=train_classes,
    y=y_train
)

# Key each weight by its class id explicitly instead of relying on position.
class_weights_dict = {
    int(class_id): float(weight)
    for class_id, weight in zip(train_classes, class_weights)
}

# Same data, with class weights.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=15, class_weight=class_weights_dict, batch_size=32)

# Same data, without class weights (baseline).
history1 = model1.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=15, batch_size=32)

# Feature set that includes comfort_base.
history2 = model2.fit(X_train2, y_train2, validation_data=(X_test2, y_test2), epochs=15, batch_size=32)


**This plot compares how the training accuracy evolves over time (epochs) for all three models.**

In [None]:
# One training-accuracy curve per fitted model.
training_runs = [
    (history, 'Train Accuracy (model with class weights)'),
    (history1, 'Train Accuracy (model without class weights)'),
    (history2, 'Train Accuracy (model with base comfort)'),
]
for run, curve_label in training_runs:
    plt.plot(run.history['accuracy'], label=curve_label)

plt.xlabel('Epoch')
plt.ylabel('Score')
plt.title('Training Progress (Accuracy)')
plt.legend()
plt.show()

**This block evaluates the model trained with class weights on the test dataset using multiple performance metrics.**

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# --- After model.fit() ---
y_pred_probs = model.predict(X_test)           # predicted probabilities
y_pred = np.argmax(y_pred_probs, axis=1)       # predicted class indices
y_true = y_test                                # true labels

# --- Compute metrics ---
# zero_division=0 gives the same scores as the default but without an
# UndefinedMetricWarning when a class is never predicted.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='macro', zero_division=0)  # macro = treat all classes equally
recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

# --- Optional: per-class breakdown ---
# Explicit `labels` keeps the rows aligned with `target_names` even if a
# class is absent from both the true and the predicted labels.
print("\n Model with class weights Detailed classification report:")
print(classification_report(
    y_true, y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0
))


**This block evaluates the baseline model (without class weights) on the same test dataset, to compare it against the weighted version.**

In [None]:
# --- After model1.fit() ---
y_pred_probs1 = model1.predict(X_test)           # predicted probabilities
y_pred1 = np.argmax(y_pred_probs1, axis=1)       # predicted class indices

# --- Compute metrics ---
# zero_division=0 gives the same scores as the default but without an
# UndefinedMetricWarning when a class is never predicted.
accuracy1 = accuracy_score(y_true, y_pred1)
precision1 = precision_score(y_true, y_pred1, average='macro', zero_division=0)  # macro = treat all classes equally
recall1 = recall_score(y_true, y_pred1, average='macro', zero_division=0)
f1_1 = f1_score(y_true, y_pred1, average='macro', zero_division=0)

print(f"Accuracy:  {accuracy1:.4f}")
print(f"Precision: {precision1:.4f}")
print(f"Recall:    {recall1:.4f}")
print(f"F1 Score:  {f1_1:.4f}")

# --- Optional: per-class breakdown ---
# Explicit `labels` keeps the rows aligned with `target_names` even if a
# class is absent from both the true and the predicted labels.
print("\n Model without class weights Detailed classification report:")
print(classification_report(
    y_true, y_pred1,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0
))

**This block evaluates the third model (model2), which includes the engineered feature comfort_base, on its dedicated test set.**

In [None]:
# --- After model2.fit() ---
y_pred_probs2 = model2.predict(X_test2)           # predicted probabilities
y_pred2 = np.argmax(y_pred_probs2, axis=1)        # predicted class indices

# --- Compute metrics ---
# zero_division=0 gives the same scores as the default but without an
# UndefinedMetricWarning when a class is never predicted.
accuracy2 = accuracy_score(y_test2, y_pred2)
precision2 = precision_score(y_test2, y_pred2, average='macro', zero_division=0)  # macro = treat all classes equally
recall2 = recall_score(y_test2, y_pred2, average='macro', zero_division=0)
f1_2 = f1_score(y_test2, y_pred2, average='macro', zero_division=0)

print(f"Accuracy:  {accuracy2:.4f}")
print(f"Precision: {precision2:.4f}")
print(f"Recall:    {recall2:.4f}")
print(f"F1 Score:  {f1_2:.4f}")

# --- Optional: per-class breakdown ---
# Explicit `labels` keeps the rows aligned with `target_names` even if a
# class is absent from both the true and the predicted labels.
print("\nModel with comfort_base Detailed classification report:")
print(classification_report(
    y_test2, y_pred2,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0
))


In [None]:
# Overlap check between the splits: for every test row, find its closest
# training row and the distance to it. A minimum of 0 means at least one test
# sample has a feature vector identical to some training sample.
from sklearn.metrics import pairwise_distances_argmin_min
idx, dist = pairwise_distances_argmin_min(X_test, X_train)
print("Minimum distance between test and train samples:", dist.min())


In [None]:
# predicted_class = np.argmax(y_pred, axis=1)
# predicted_class[:100]

**This block explains the weighted model’s predictions using SHAP to show which features push the model toward or away from a specific class (here: Western_Europe).**

In [None]:
# Thin wrapper around the weighted model's predict method
def predict_proba_fn(X):
    """Return whatever the notebook-level `model` predicts for `X`."""
    return model.predict(X)
# shap.Explainer chooses an explanation algorithm for the given model.
# A small background (reference) set keeps the explanation cost manageable:
# 100 rows sampled from the training matrix with a fixed seed.
background = shap.sample(X_train, 100, random_state=42)
# Build the explainer for the class-weighted model against that background
explainer = shap.Explainer(model, background)
# NOTE(review): this explains every test row and can be slow; the next cell
# repeats the analysis on a 500-row sample.
shap_values = explainer(X_test)

# Feature names in the same column order as X_all: numeric columns first,
# then the one-hot encoded categorical columns
ohe_feature_names = ohe_model.get_feature_names_out(categorical_cols)
feature_names = numeric_cols + list(ohe_feature_names)

# --- Choose a class index to visualize ---
# Position of "Western_Europe" among the label encoder's classes
class_index = np.where(le.classes_ == "Western_Europe")[0][0]

# Keep only that class: indexing is [sample, feature, class]
shap_values_class = shap_values[:, :, class_index]

# --- Global SHAP Summary Plot ---
shap.summary_plot(
    shap_values_class.values,
    X_test,
    feature_names=feature_names,
    show=True
)

# --- Local SHAP Force Plot for one test instance ---
i = 0  # index of instance to explain
shap.initjs()
shap.plots.force(shap_values[i, :, class_index], feature_names=feature_names)

**Since computing SHAP values for the entire test set can be computationally expensive, we take a random sample of 500 test instances to generate interpretable visualizations more efficiently.**

In [None]:
# --- Sample up to 500 random instances from X_test ---
# Capped at the test-set size: sampling without replacement raises when more
# rows are requested than exist. A seeded generator makes the sample (and
# therefore the plots) reproducible across runs.
sample_size = min(500, len(X_test))
rng = np.random.default_rng(42)
sample_idx = rng.choice(len(X_test), size=sample_size, replace=False)

X_test_sample = X_test.iloc[sample_idx] if isinstance(X_test, pd.DataFrame) else X_test[sample_idx]

# --- Compute SHAP values on the sample only ---
shap_values_sample = explainer(X_test_sample)

# --- Choose the target class ---
class_index = np.where(le.classes_ == "Western_Europe")[0][0]

# Extract SHAP values for that class: indexing is [sample, feature, class]
shap_values_class = shap_values_sample[:, :, class_index]

# --- Global SHAP Summary Plot ---
shap.summary_plot(
    shap_values_class.values,
    X_test_sample,
    feature_names=feature_names,
    show=True
)

# --- Local Force Plot (first instance of the random sample) ---
i = 0
shap.initjs()
shap.plots.force(shap_values_sample[i, :, class_index], feature_names=feature_names)

**This section uses LIME to explain how individual feature values influence one specific prediction made by the model.**

In [None]:
# --- Step 1: Define your predict function ---
# LIME expects a function that returns probabilities for each class
def predict_fn(x):
    """Return the weighted model's class probabilities for `x`, cast to float."""
    return model.predict(x).astype(float)

# --- Step 2: Create the LIME explainer ---
explainer_lime = lime.lime_tabular.LimeTabularExplainer(
    X_train,                            # numpy array or DataFrame values
    feature_names=feature_names,        # all feature names
    class_names=le.classes_,            # your encoded country_group classes
    discretize_continuous=True,         # discretize continuous vars for interpretability
    mode='classification'               # since this is a multi-class classification task
)

# --- Step 3: Pick one test instance to explain ---
# The index variable and the row that was actually explained used to disagree
# (i = 60 vs. X_test[70]). One variable now drives both; 70 is kept because
# that is the row the cell explained before.
i = 70
exp = explainer_lime.explain_instance(
    X_test[i],                          # the test sample
    predict_fn,                         # your model’s predict function
    num_features=10,                    # number of top features to show
    top_labels=1                        # only explain the top predicted label
)

# --- Step 4: Display explanation ---
exp.show_in_notebook(show_table=True, show_all=False)


**This helper takes one user/hotel-review profile, applies the same preprocessing used during training, and returns the predicted region (country_group) together with the full class–probability distribution.**

In [None]:
def predict_country_group(
    user_gender,
    age_group,
    traveller_type,
    score_overall,
    score_cleanliness,
    score_comfort,
    score_facilities,
    score_location,
    score_staff,
    score_value_for_money,
    model,
    ohe_model,
    scaler,
    le
):
    """Predict the country group for a single review profile.

    Parameters
    ----------
    user_gender, age_group, traveller_type :
        Categorical attributes; passed through `ohe_model.transform`.
    score_overall ... score_value_for_money :
        The seven review scores; passed through `scaler.transform`.
    model :
        Object whose `predict` returns one row of class probabilities.
    ohe_model :
        Fitted encoder for the three categorical columns.
    scaler :
        Fitted scaler for the seven numeric columns.
    le :
        Fitted label encoder; supplies `classes_` and `inverse_transform`.

    Returns
    -------
    dict
        "predicted_country_group": decoded label of the most probable class.
        "probabilities": mapping of every entry of `le.classes_` to its
        predicted probability.

    Warns
    -----
    UserWarning
        When a categorical value is not among the categories the encoder
        knows. The encoder built in this notebook uses
        handle_unknown='ignore', which would otherwise encode such a value
        without any signal.
    """
    import warnings

    # --- Step 1: Define column order (must match the training matrices) ---
    categorical_cols = ['user_gender', 'age_group', 'traveller_type']
    numeric_cols = [
        'score_overall', 'score_cleanliness', 'score_comfort',
        'score_facilities', 'score_location', 'score_staff', 'score_value_for_money'
    ]

    # --- Step 2: Create single-row DataFrame ---
    data = pd.DataFrame([{
        'user_gender': user_gender,
        'age_group': age_group,
        'traveller_type': traveller_type,
        'score_overall': score_overall,
        'score_cleanliness': score_cleanliness,
        'score_comfort': score_comfort,
        'score_facilities': score_facilities,
        'score_location': score_location,
        'score_staff': score_staff,
        'score_value_for_money': score_value_for_money
    }])

    # --- Step 3: Flag category values the encoder has never seen ---
    # A typo such as 'Solo' vs. 'Solo traveller' should not pass unnoticed.
    known_categories = getattr(ohe_model, 'categories_', None)
    if known_categories is not None:
        for col, seen in zip(categorical_cols, known_categories):
            value = data.at[0, col]
            if value not in list(seen):
                warnings.warn(
                    f"Unknown value {value!r} for '{col}'; "
                    f"expected one of {list(seen)}.",
                    UserWarning,
                    stacklevel=2,
                )

    # --- Step 4: Preprocess categorical and numeric features ---
    X_cat = ohe_model.transform(data[categorical_cols])
    X_num = scaler.transform(data[numeric_cols])

    # --- Step 5: Combine features (numeric first, then one-hot) ---
    X_input = np.hstack([X_num, X_cat])

    # --- Step 6: Predict probabilities ---
    y_proba = model.predict(X_input)
    y_pred = np.argmax(y_proba, axis=1)

    # --- Step 7: Decode label back to country_group ---
    predicted_group = le.inverse_transform(y_pred)[0]

    # --- Step 8: Return both prediction and probabilities ---
    return {
        "predicted_country_group": predicted_group,
        "probabilities": dict(zip(le.classes_, y_proba[0]))
    }


**This step demonstrates how to use the trained model and preprocessing pipeline to predict the most likely country group for a single user review profile.**

In [None]:
# One hand-written review profile, keyed by the helper's parameter names.
example_profile = {
    'user_gender': 'Female',
    'age_group': '25-34',
    'traveller_type': 'Solo',
    'score_overall': 8.7,
    'score_cleanliness': 8.6,
    'score_comfort': 8.7,
    'score_facilities': 8.5,
    'score_location': 9.0,
    'score_staff': 8.8,
    'score_value_for_money': 8.7,
}

# Fitted objects from the earlier cells: classifier, encoder, scaler, label encoder.
result = predict_country_group(
    **example_profile,
    model=model,
    ohe_model=ohe_model,
    scaler=scaler,
    le=le,
)

print("Predicted Country Group:", result["predicted_country_group"])
print("Class Probabilities:", result["probabilities"])


**This cell allows you to manually input traveler details and receive a predicted country group with its probability distribution.**

In [None]:
print("Please enter the following details:\n")

# --- Get categorical inputs ---
# Surrounding whitespace is stripped so a stray space does not turn a valid
# label into an unknown category.
user_gender = input("User gender (Male/Female): ").strip()
age_group = input("Age group (e.g., 18-24, 25-34, 35-44, etc.): ").strip()
traveller_type = input("Traveller type (e.g., Solo traveller, Couple, Family, Friends): ").strip()

# --- Get numeric inputs ---
# comfort_base is no longer requested: predict_country_group has no such
# parameter, and `model` / `scaler` were fitted on the seven score columns.
# Passing it positionally shifted every argument by one and raised
# "TypeError: got multiple values for argument 'model'".
score_overall = float(input("Score overall: "))
score_cleanliness = float(input("Score cleanliness: "))
score_comfort = float(input("Score comfort: "))
score_facilities = float(input("Score facilities: "))
score_location = float(input("Score location: "))
score_staff = float(input("Score staff: "))
score_value_for_money = float(input("Score value for money: "))

# Keyword arguments throughout, so a value can never land in the wrong slot.
result = predict_country_group(
    user_gender=user_gender,
    age_group=age_group,
    traveller_type=traveller_type,
    score_overall=score_overall,
    score_cleanliness=score_cleanliness,
    score_comfort=score_comfort,
    score_facilities=score_facilities,
    score_location=score_location,
    score_staff=score_staff,
    score_value_for_money=score_value_for_money,
    model=model,      # your trained classifier
    ohe_model=ohe_model, # your fitted OneHotEncoder
    scaler=scaler,       # your fitted StandardScaler
    le=le                # your fitted LabelEncoder for country_group
)

print("Predicted Country Group:", result["predicted_country_group"])
print("Class Probabilities:", result["probabilities"])

In [None]:
# Show all columns, then display the first row of the merged table.
pd.set_option('display.max_columns', None)
df_merged.head(1)