In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
import joblib

In [16]:
# Read the workbook into a DataFrame; the relative path is resolved against
# the kernel's current working directory.
source_workbook = 'df_cleaned.xlsx'
df = pd.read_excel(source_workbook)

Data Cleaning and Feature Selection

In [17]:
# Names of the input columns, one per line so additions and removals
# show up as single-line diffs. Order is kept exactly as before.
features = [
    'Continent',
    'Region',
    'CityName',
    'Country',
    'AttractionTypeId',
    'Attraction',
    'UserId',
    'UserAvgRating',
    'AttrVisitCount',
    'VisitMode_Region',
    'VisitMonth',
    'VisitYear',
]

# Name of the column used as the prediction label.
target = 'VisitMode'

In [18]:
# Split the frame into the model inputs (a DataFrame with one column per
# entry in `features`) and the label vector (the single `target` column).
X = df.loc[:, features]
y = df.loc[:, target]

In [19]:
# Columns handed to the categorical branch of the preprocessing step.
categorical_features = [
    'Continent',
    'Region',
    'CityName',
    'Country',
    'AttractionTypeId',
    'Attraction',
    'UserId',
    'VisitMode_Region',
]

# Columns handed to the numeric branch of the preprocessing step.
numeric_features = [
    'UserAvgRating',
    'AttrVisitCount',
    'VisitMonth',
    'VisitYear',
]

Preprocessing

In [20]:
# Numeric columns are standardized; categorical columns are one-hot encoded.
# handle_unknown='ignore' makes categories that were not seen during fit
# encode as all zeros at transform time instead of raising.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# (name, transformer, columns) triples, in the order their outputs are
# concatenated by the ColumnTransformer.
column_branches = [
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numeric_features),
]

preprocessor = ColumnTransformer(transformers=column_branches)


Pipeline Building

In [21]:
# K-nearest-neighbours classifier voting over the 5 closest training samples.
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Chain preprocessing and the classifier so both are fitted (and later
# persisted) as a single estimator.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', knn_classifier),
]
pipeline = Pipeline(steps=pipeline_steps)

Model Training

In [22]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible
# across kernel restarts.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit preprocessing + classifier on the training portion only.
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline; dump() returns the list of written file
# names, which is what the cell displays.
model_file = 'knn_pipeline_model.pkl'
joblib.dump(pipeline, model_file)

['knn_pipeline_model.pkl']