In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from imblearn.over_sampling import ADASYN
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv('tourism_data.csv ')
df

Unnamed: 0,TransactionId,UserId,VisitYear,VisitMonth,VisitMode,AttractionId,Rating,AttractionCityId,AttractionTypeId,Attraction,...,AttractionType,ContenentId,RegionId,CountryId,CityId,CityName,Contenent,Country,Region,Overall_Avg_Rating
0,3,70456,2022,10,2,640,5,1,63,Sacred Monkey Forest Sanctuary,...,Nature & Wildlife Areas,1,1,1,4341.0,Douala,Africa,Cameroon,Central Africa,4.267086
1,8,7567,2022,10,4,640,5,1,63,Sacred Monkey Forest Sanctuary,...,Nature & Wildlife Areas,1,1,1,464.0,Douala,Africa,Cameroon,Central Africa,4.267086
2,9,79069,2022,10,3,640,5,1,63,Sacred Monkey Forest Sanctuary,...,Nature & Wildlife Areas,1,1,1,774.0,Douala,Africa,Cameroon,Central Africa,4.267086
3,10,31019,2022,10,3,640,3,1,63,Sacred Monkey Forest Sanctuary,...,Nature & Wildlife Areas,1,1,1,583.0,Douala,Africa,Cameroon,Central Africa,4.267086
4,15,43611,2022,10,2,640,3,1,63,Sacred Monkey Forest Sanctuary,...,Nature & Wildlife Areas,1,1,1,1396.0,Douala,Africa,Cameroon,Central Africa,4.267086
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52925,211227,87100,2018,9,2,1297,4,3,44,Yogyakarta Palace,...,Historic Sites,1,1,2,7460.0,N'Djamena,Africa,Chad,Central Africa,3.538847
52926,211238,88112,2016,2,2,1297,5,3,44,Yogyakarta Palace,...,Historic Sites,1,1,2,6164.0,N'Djamena,Africa,Chad,Central Africa,3.538847
52927,211239,88112,2016,2,2,1297,4,3,44,Yogyakarta Palace,...,Historic Sites,1,1,2,6164.0,N'Djamena,Africa,Chad,Central Africa,3.538847
52928,211240,88112,2016,2,2,1297,4,3,44,Yogyakarta Palace,...,Historic Sites,1,1,2,6164.0,N'Djamena,Africa,Chad,Central Africa,3.538847


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52930 entries, 0 to 52929
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   TransactionId       52930 non-null  int64  
 1   UserId              52930 non-null  int64  
 2   VisitYear           52930 non-null  int64  
 3   VisitMonth          52930 non-null  int64  
 4   VisitMode           52930 non-null  int64  
 5   AttractionId        52930 non-null  int64  
 6   Rating              52930 non-null  int64  
 7   AttractionCityId    52930 non-null  int64  
 8   AttractionTypeId    52930 non-null  int64  
 9   Attraction          52930 non-null  object 
 10  AttractionAddress   52930 non-null  object 
 11  AttractionType      52930 non-null  object 
 12  ContenentId         52930 non-null  int64  
 13  RegionId            52930 non-null  int64  
 14  CountryId           52930 non-null  int64  
 15  CityId              52922 non-null  float64
 16  City

In [4]:
# Drop unnecessary columns
drop_cols = ['TransactionId', 'AttractionId', 'AttractionAddress', 'ContenentId', 'CountryId', 'RegionId','AttractionTypeId','CityId','AttractionCityId','UserId']
df.drop(columns=drop_cols, inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52930 entries, 0 to 52929
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   VisitYear           52930 non-null  int64  
 1   VisitMonth          52930 non-null  int64  
 2   VisitMode           52930 non-null  int64  
 3   Rating              52930 non-null  int64  
 4   Attraction          52930 non-null  object 
 5   AttractionType      52930 non-null  object 
 6   CityName            52930 non-null  object 
 7   Contenent           52930 non-null  object 
 8   Country             52930 non-null  object 
 9   Region              52930 non-null  object 
 10  Overall_Avg_Rating  52930 non-null  float64
dtypes: float64(1), int64(4), object(6)
memory usage: 4.4+ MB


In [6]:
# Label Encoding for Categorical Columns
label_encoders = {}
categorical_cols = ['Attraction', 'AttractionType', 'CityName', 'Contenent', 'Country', 'Region', 'VisitMode']

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [7]:
# Handling outliers using IQR method
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
    df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])

In [8]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52930 entries, 0 to 52929
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   VisitYear           52930 non-null  float64
 1   VisitMonth          52930 non-null  float64
 2   VisitMode           52930 non-null  float64
 3   Rating              52930 non-null  float64
 4   Attraction          52930 non-null  int32  
 5   AttractionType      52930 non-null  int32  
 6   CityName            52930 non-null  int32  
 7   Contenent           52930 non-null  int32  
 8   Country             52930 non-null  int32  
 9   Region              52930 non-null  int32  
 10  Overall_Avg_Rating  52930 non-null  float64
dtypes: float64(5), int32(6)
memory usage: 3.2 MB


In [9]:
# Splitting data into features and target
X = df.drop(columns=['VisitMode'])
y = df['VisitMode']

In [10]:
# Scaling numerical features
scaler = MinMaxScaler()
X_SCALED = scaler.fit_transform(X)

In [11]:
from imblearn.over_sampling import ADASYN
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_SCALED, y)

In [12]:
# Proceed with train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42)

In [13]:
# Initialize XGBoost Classifier
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')


In [15]:
# Hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [16]:
# Best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'colsample_bytree': 1, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}


In [17]:
# Train final model with best parameters
best_model = XGBClassifier(**best_params, use_label_encoder=False, eval_metric='mlogloss')
best_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [18]:
# Predictions
y_pred = best_model.predict(X_test)


In [19]:
# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5162342135476464
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.90      0.87      4322
         1.0       0.36      0.50      0.42      4324
         2.0       0.43      0.38      0.40      4216
         3.0       0.40      0.23      0.29      4263
         4.0       0.53      0.56      0.54      4650

    accuracy                           0.52     21775
   macro avg       0.51      0.51      0.51     21775
weighted avg       0.51      0.52      0.51     21775



In [20]:
import joblib

joblib.dump(best_model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')

['label_encoders.pkl']