In [1]:
import pandas as pd
import numpy as nu
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

# Load the file
Tourism_df=pd.read_csv("F:/DS/4th project Tourism Analysis/DATA/Final_Tourim_Table.csv")

In [2]:
Tourism_df.head(2)

Unnamed: 0,TransactionId,UserId,VisitYear,VisitMonth,VisitModeId,AttractionId,Rating,ContinentId,RegionId,CountryId,...,Continent,Region,Country,CityName,Attraction,AttractionAddress,AttractionTypeId,AttractionCityId,AttractionType,VisitModeName
0,5661,14,2018,12,4,640,4,5,20,155,...,Europe,Southern Europe,Portugal,Lagos,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",63,1,Nature & Wildlife Areas,Friends
1,67652,14,2018,12,4,748,5,5,20,155,...,Europe,Southern Europe,Portugal,Lagos,Tegalalang Rice Terrace,"Jalan Raya Ceking, Tegalalang 80517 Indonesia",72,1,Points of Interest & Landmarks,Friends


In [3]:
Tourism_df.isnull().sum()

TransactionId         0
UserId                0
VisitYear             0
VisitMonth            0
VisitModeId           0
AttractionId          0
Rating                0
ContinentId           0
RegionId              0
CountryId             0
CityId                0
Continent             0
Region               23
Country               0
CityName              0
Attraction            0
AttractionAddress     0
AttractionTypeId      0
AttractionCityId      0
AttractionType        0
VisitModeName         0
dtype: int64

In [4]:
Tourism_df['Region'].fillna(Tourism_df['Region'].mode()[0], inplace=True)
print("After Handling missing Values")
Tourism_df.isnull().sum()

After Handling missing Values


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Tourism_df['Region'].fillna(Tourism_df['Region'].mode()[0], inplace=True)


TransactionId        0
UserId               0
VisitYear            0
VisitMonth           0
VisitModeId          0
AttractionId         0
Rating               0
ContinentId          0
RegionId             0
CountryId            0
CityId               0
Continent            0
Region               0
Country              0
CityName             0
Attraction           0
AttractionAddress    0
AttractionTypeId     0
AttractionCityId     0
AttractionType       0
VisitModeName        0
dtype: int64

In [5]:
# Counting unique value in each column
Tourism_df.nunique()

TransactionId        52922
UserId               33526
VisitYear               10
VisitMonth              12
VisitModeId              5
AttractionId            30
Rating                   5
ContinentId              5
RegionId                22
CountryId              153
CityId                5545
Continent                5
Region                  22
Country                153
CityName              5543
Attraction              30
AttractionAddress       25
AttractionTypeId        17
AttractionCityId         3
AttractionType          17
VisitModeName            5
dtype: int64

In [6]:
# IMPORTING SMOTE FOR HANDLING IMBALANCED DATA AND ENCODERS FOR FEATURE TRANSFORMATION  
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import OneHotEncoder, LabelEncoder,StandardScaler
from category_encoders import TargetEncoder
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.metrics import classification_report,confusion_matrix

In [7]:
# FEATURE SELECTION, ENCODING CATEGORICAL VARIABLES, AND BALANCING DATA USING SMOTE  
#  Step 1: Feature Selection
selected_features = ["UserId", "VisitYear", "VisitMonth", "AttractionId", 
                     "ContinentId", "RegionId", "CountryId","Attraction", "AttractionType", "AttractionTypeId"]

X = Tourism_df[selected_features].copy()
y = Tourism_df["VisitModeName"]

#  Step 2: Encoding Categorical Features
categorical_features = ["Attraction", "AttractionType"]

# One-Hot Encoding for categorical features
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_features = ohe.fit_transform(X[categorical_features])
encoded_df = pd.DataFrame(encoded_features, columns=ohe.get_feature_names_out(categorical_features))

# Label Encoding for target variable (VisitModeName)
label_encoder = LabelEncoder()
y = pd.Series(label_encoder.fit_transform(y))  # Convert to Pandas Series

# Target Encoding for Attraction
target_enc = TargetEncoder()
X["Attraction"] = target_enc.fit_transform(X["Attraction"], y)

# Convert boolean columns to integers
bool_cols = X.select_dtypes(include=["bool"]).columns
X[bool_cols] = X[bool_cols].astype(int)

# Drop original categorical columns and merge encoded ones
X = X.drop(columns=categorical_features)
X = pd.concat([X.reset_index(drop=True), encoded_df], axis=1)

# Step 3: Handle Class Imbalance using SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

y_smote = pd.DataFrame(y_smote, columns=['VisitModeName'])
balanced_tourism_data = pd.concat([X_smote, y_smote], axis=1)



print("Before SMOTE:")
print(pd.Series(y).value_counts())
print("\nAfter SMOTE:")
print(y_smote['VisitModeName'].value_counts())

Before SMOTE:
1    21617
2    15215
3    10944
4     4523
0      623
Name: count, dtype: int64

After SMOTE:
VisitModeName
3    21617
2    21617
1    21617
4    21617
0    21617
Name: count, dtype: int64


In [8]:
#  Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

In [9]:
# Feature Scaling (Standardization)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:


# Train Decision Tree Classifier with max depth of 5
dt_model = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)

# Predict using the trained Decision Tree model
dt_pred = dt_model.predict(X_test)
print(dt_pred)


[0 1 1 ... 1 2 0]


In [11]:
#  Step 8: Model Evaluation
accuracy = accuracy_score(y_test, dt_pred)
precision = precision_score(y_test, dt_pred, average='weighted')
recall = recall_score(y_test, dt_pred, average='weighted')
f1 = f1_score(y_test, dt_pred, average='weighted')

print(f" Accuracy: {accuracy:.2f}")
print(f" Precision: {precision:.2f}")
print(f" Recall: {recall:.2f}")
print(f" F1 Score: {f1:.2f}")



 Accuracy: 0.38
 Precision: 0.43
 Recall: 0.38
 F1 Score: 0.32


In [12]:
# Creates and trains a Random Forest classifier with 100 trees, max depth of 5, and fixed random state.  
rf_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rf_model.fit(X_train, y_train)

# Makes predictions on the test data (X_test) using the trained Random Forest model.  
rf_pred = rf_model.predict(X_test)
rf_pred

# Computes evaluation metrics for the model: accuracy, precision, and recall using the test labels (y_test) and predictions (rf_pred).
# Prints the results formatted to two decimal places. 
accuracy = accuracy_score(y_test, rf_pred)
precision = precision_score(y_test, rf_pred, average='weighted')
recall = recall_score(y_test, rf_pred, average='weighted')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, rf_pred, target_names=label_encoder.classes_))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, rf_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

  return fit_method(estimator, *args, **kwargs)


Accuracy: 0.43
Precision: 0.46
Recall: 0.43

Classification Report:
              precision    recall  f1-score   support

    Business       0.42      0.86      0.57      4353
     Couples       0.40      0.67      0.50      4395
      Family       0.62      0.26      0.36      4219
     Friends       0.42      0.07      0.12      4393
        Solo       0.43      0.29      0.35      4257

    accuracy                           0.43     21617
   macro avg       0.46      0.43      0.38     21617
weighted avg       0.46      0.43      0.38     21617


Confusion Matrix:
[[3763  149   14   58  369]
 [ 774 2927  293  126  275]
 [1067 1499 1085  163  405]
 [1546 1595  314  302  636]
 [1793 1102   32   76 1254]]


In [13]:
# Creates an XGBoost classifier with 50 trees, max depth of 3, and a learning rate of 0.2 for optimized training.  
# Trains the model on the training data (X_train, y_train) and makes predictions on the test data (X_test).  

xgb_model = XGBClassifier(n_estimators=300, max_depth=7, learning_rate=0.5,subsample=0.8, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

# Computes accuracy, precision, and recall for the XGBoost model using test labels (y_test) and predictions (xgb_pred).  
# Prints the results formatted to two decimal places.  
accuracy = accuracy_score(y_test, xgb_pred)
precision = precision_score(y_test, xgb_pred, average='weighted')
recall = recall_score(y_test, xgb_pred, average='weighted')
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Accuracy: 0.68
Precision: 0.68
Recall: 0.68


In [14]:
import joblib

# Saveing model
joblib.dump(target_enc,'Target Encoder Model(class).pkl')
print("Target encoded model saved")
joblib.dump(ohe,'One-Hot Endcoder Model(class).pkl')
print("One-Hot Encoded model saved")
joblib.dump(label_encoder,'label_encoder(class).pkl')
print("Lable encoded has been saved")
joblib.dump(xgb_model,'XGBoost model(class).pkl')
print("Best XGBoost Model has been saved")

Target encoded model saved
One-Hot Encoded model saved
Lable encoded has been saved
Best XGBoost Model has been saved
