In [3]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor

Feature Selection

In [4]:
# Read the Excel workbook (path is relative to the working directory) into a DataFrame.
source_file = 'df_cleaned.xlsx'
df = pd.read_excel(source_file)

In [7]:
# Display the column labels of the loaded DataFrame.
df.columns

Index(['Unnamed: 0', 'TransactionId', 'UserId', 'VisitYear', 'VisitMonth',
       'VisitModeId', 'AttractionId', 'Rating', 'AttractionCityId',
       'AttractionTypeId', 'Attraction', 'AttractionAddress', 'VisitMode',
       'ContinentId_x', 'RegionId_x', 'CountryId_x', 'CityId', 'Continent',
       'Region', 'ContinentId_y', 'Country', 'RegionId_y', 'CityName',
       'CountryId_y', 'VisitDate', 'UserAvgRating', 'UserVisitCount',
       'AttrVisitCount', 'VisitMode_Region', 'VisitDay', 'VisitDayOfWeek',
       'UserAvgRating_BUSINESS', 'UserAvgRating_COUPLES',
       'UserAvgRating_FAMILY', 'UserAvgRating_FRIENDS', 'UserAvgRating_SOLO'],
      dtype='object')

In [9]:
# Model inputs, in the order they are handed to the preprocessing step.
# NOTE(review): UserAvgRating looks like it is derived from Rating — if it was
# computed over every row (including rows later held out for testing) it leaks
# the target into the features; confirm how it was built upstream.
feature_columns = [
    'Continent', 'Region', 'Country', 'CityName',
    'VisitYear', 'VisitMonth', 'VisitMode', 'Attraction',
    'UserAvgRating', 'UserVisitCount', 'AttrVisitCount',
]
X = df[feature_columns]

# Target column to predict.
y = df['Rating']

Preprocessing

In [11]:
# Label-valued columns (routed to the categorical transformer).
categorical_features = [
    'Continent',
    'Region',
    'Country',
    'CityName',
    'VisitMode',
    'Attraction',
]

# Numeric columns (routed to the numerical transformer).
numerical_features = [
    'VisitYear',
    'VisitMonth',
    'UserAvgRating',
    'UserVisitCount',
    'AttrVisitCount',
]

In [12]:
# Numeric columns are standardized.
numerical_transformer = StandardScaler()

# Categorical columns are one-hot encoded; handle_unknown='ignore' means a
# category not seen during fit does not raise at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [13]:
# Each entry is (step name, transformer, columns it is applied to).
column_steps = [
    ('cat', categorical_transformer, categorical_features),
    ('num', numerical_transformer, numerical_features),
]

# Apply the matching transformer to each column group and concatenate the results.
preprocessor = ColumnTransformer(transformers=column_steps)

In [14]:
# Regressor hyperparameters: 100 boosting rounds and a fixed seed.
xgb_params = {'n_estimators': 100, 'random_state': 42}
xgb_model = XGBRegressor(**xgb_params)

Model Pipeline

In [15]:
# Chain preprocessing and the regressor so they are fitted and applied as a single estimator.
pipeline_steps = [
    ('preprocessing', preprocessor),
    ('model', xgb_model),
]
pipeline = Pipeline(steps=pipeline_steps)

In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the whole pipeline on the training rows only.
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline to disk.
joblib.dump(pipeline, filename='pipeline_xgb.pkl')

print("✅ Pipeline trained and saved!")

✅ Pipeline trained and saved!


Model Performance Metrics

In [17]:
# Score the fitted pipeline on both splits and report RMSE and R².
# Comparing the two splits shows how much performance drops on held-out rows.

# Predictions for the rows the pipeline was fitted on.
y_train_pred = pipeline.predict(X_train)

# Predictions for the held-out rows.
y_test_pred = pipeline.predict(X_test)

# Mean squared error per split.
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

# Square root puts the error back on the same scale as the target.
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)

# Coefficient of determination per split.
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("🏋️‍♂️ Training Performance:")
print(f"RMSE: {train_rmse:.3f}")
print(f"R²: {train_r2:.3f}")

print("\n🧪 Testing Performance:")
print(f"RMSE: {test_rmse:.3f}")
print(f"R²: {test_r2:.3f}")

🏋️‍♂️ Training Performance:
RMSE: 0.465
R²: 0.771

🧪 Testing Performance:
RMSE: 0.503
R²: 0.728
