# **Import Statement** 

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Read Raw Data

In [2]:
df = pd.read_csv(r'/kaggle/input/tourism-final-master-analytics/Tourism_Final_Master_Analytical.csv')

# **Feature Engineering** 

In [3]:
target = 'User_Rating'
drop_columns = ["UserId", "AttractionId", "Attraction_Name","Destination_Address"]

x = df.drop(columns= drop_columns + [target])
y = df[target]

In [4]:
numerical_features = x.select_dtypes(include=["int64", "float64"]).columns.tolist()

low_cardinality_features = [
    "Traveler_Group_Type",
    "Traveler_Home_Continent",
    "Attraction_Category"
]

high_cardinality_features = [
    "Traveler_Home_Country",
    "Traveler_Home_Region",
    "Destination_Country_Name",
    "Destination_Region_Name",
    "Destination_City_Name"
]


In [6]:
neumerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

Low Cardinality encoding - One-Hot

In [7]:
low_cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("one-hot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

Hign Cardinality encoding - Target

In [8]:
high_cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ('target', TargetEncoder())
])

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", neumerical_transformer, numerical_features),
        ("low_cat", low_cat_transformer, low_cardinality_features),
        ("high_cat", high_cat_transformer, high_cardinality_features)
    ]
)

In [11]:
random_forest_selector = RandomForestRegressor(
    n_estimators=100,
    random_state= 42,
    n_jobs=-1
)

random_forest_model = RandomForestRegressor(
    n_estimators=300,
    random_state = 42,
    n_jobs=-1
)

In [13]:
pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("feature_selection", SelectFromModel(random_forest_selector, threshold="median")),
    ("model", random_forest_model)
])

# **Train Test Split**

In [14]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size = 0.2,
    random_state = 42
)

In [15]:
pipeline.fit(x_train, y_train)

# **Evalution**

In [19]:
y_pred = pipeline.predict(x_test)
print(y_pred)

rmse = mean_squared_error(y_test, y_pred)
rmse = rmse ** 0.5

r2 = r2_score(y_test, y_pred)


print(f"RMSE: {rmse:.3f}")
print(f"R2 Score : {r2:.3f}")

[4.09       4.33       3.71       ... 4.78       4.26666667 3.93333333]
RMSE: 0.910
R2 Score : 0.121


# **Base Line** 

In [21]:
baseline_pred = y_test.mean()
baseline_rsme = mean_squared_error(
    y_test,
    [baseline_pred] * len(y_test)
)
baseline_rsme = baseline_rsme ** 0.5

print(f"baseline_rsme : {baseline_rsme}")

baseline_rsme : 0.970472149544387


# **Saving Model**

In [23]:
import joblib
pipeline.fit(x_train, y_train)
joblib.dump(pipeline, "tourism_rating_model.pkl")

['tourism_rating_model.pkl']