# Tugas Week 1
- Target = site_review_rating
- Fitur = ["site_review_rating", "hotel_star_rating", "site_review_count", "room_count", "property_type", "city", "hotel_facilities"]

## Model prediksi review rating hotel

In [282]:
import pandas as pd

In [283]:
data = pd.read_csv("../data/Week1_Travel.csv")
data.shape

(4000, 36)

### Menentukan fitur yang akan digunakan

In [330]:
# Mendefinisikan fitur inti
fitur_inti = ["site_review_rating", "hotel_star_rating", "site_review_count", "room_count", "property_type"]

In [331]:
df = data[fitur_inti]
df.head()

Unnamed: 0,site_review_rating,hotel_star_rating,site_review_count,room_count,property_type
0,4.0,2,87.0,17,Resort
1,4.5,0,8.0,18,Guest House
2,2.5,0,2.0,15,Resort
3,5.0,2,1.0,24,Cottage
4,2.8,2,121.0,20,Hotel


### Melakukan EDA 

In [332]:
df.describe()

Unnamed: 0,site_review_rating,hotel_star_rating,site_review_count,room_count
count,2416.0,4000.0,2416.0,4000.0
mean,3.750993,1.306,47.765728,22.20025
std,0.842863,1.479159,93.233924,96.132138
min,0.0,0.0,0.0,0.0
25%,3.4,0.0,4.0,7.0
50%,3.9,1.0,17.0,14.0
75%,4.3,3.0,55.0,26.0
max,5.0,5.0,2094.0,5874.0


In [333]:
df_bersih = df.dropna(subset=["site_review_rating"])
df_bersih = df.dropna()
df_bersih["site_review_rating"].isna().sum()

np.int64(0)

In [334]:
df_bersih.isna().sum()

site_review_rating    0
hotel_star_rating     0
site_review_count     0
room_count            0
property_type         0
dtype: int64

In [335]:
df_bersih.dtypes

site_review_rating    float64
hotel_star_rating       int64
site_review_count     float64
room_count              int64
property_type          object
dtype: object

In [336]:
df_bersih.duplicated().sum()

np.int64(44)

### Cek distribusi data miring (skewed) dan cek distribusi kategori

In [337]:
df_bersih["room_count"].skew() # Miring ke kanan

np.float64(44.740851327912814)

In [338]:
df_bersih["hotel_star_rating"].skew() # Normal

np.float64(0.07350529038429585)

In [339]:
df_bersih['site_review_rating'].skew() # Miring ke kiri

np.float64(-1.289586252147197)

In [340]:
df_bersih["site_review_count"].skew() # Miring ke kanan

np.float64(7.789998704307302)

In [341]:
df_bersih["property_type"].value_counts()

property_type
Hotel                1595
Resort                360
Guest House           122
Service Apartment      85
Homestay               60
Lodge                  52
BnB                    34
Cottage                29
Tent                   20
Houseboat              18
Villa                  13
Bungalow                9
Palace                  8
Hostel                  6
Beach Hut               2
Motel                   2
Farm Stay               1
Name: count, dtype: int64

### Membersihkan outlier pada fitur numerik

In [343]:
num_cols = ["site_review_rating", "hotel_star_rating", "site_review_count", "room_count"]

In [344]:
def remove_outlier(df, column, x):
    df_proses = df_bersih.copy()
    for col in column:
        Q1 = df_proses[col].quantile(0.25)
        Q3 = df_proses[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - x * IQR
        upper_bound = Q1 + x * IQR
        df_proses = df_proses[(df_proses[col] >= lower_bound) & (df_proses[col] <= upper_bound)]
    return df_proses

In [345]:
df_no_outlier = remove_outlier(df_bersih, num_cols, 1.5)
df_no_outlier.head()

Unnamed: 0,site_review_rating,hotel_star_rating,site_review_count,room_count,property_type
0,4.0,2,87.0,17,Resort
1,4.5,0,8.0,18,Guest House
2,2.5,0,2.0,15,Resort
5,3.9,3,16.0,15,Resort
8,3.0,1,1.0,2,Homestay


### Normalisasi data menggunakan StandardScaler( )

In [346]:
from sklearn.preprocessing import StandardScaler

In [347]:
std_scaler = StandardScaler()

In [348]:
df_std_scaled = df_no_outlier.copy()
df_std_scaled[num_cols] = std_scaler.fit_transform(df_no_outlier[num_cols])

In [349]:
df_std_scaled.head()

Unnamed: 0,site_review_rating,hotel_star_rating,site_review_count,room_count,property_type
0,0.419535,0.314092,2.874234,-0.020211,Resort
1,1.261818,-1.124558,-0.626256,0.0821,Guest House
2,-2.107315,-1.124558,-0.892116,-0.224834,Resort
5,0.251078,1.033417,-0.271776,-0.224834,Resort
8,-1.265031,-0.405233,-0.936426,-1.554878,Homestay


In [350]:
df_std_scaled.sample(10)

Unnamed: 0,site_review_rating,hotel_star_rating,site_review_count,room_count,property_type
848,0.419535,-0.405233,-0.759186,1.9237,Resort
2696,-0.591205,1.033417,-0.227466,0.900589,Resort
2724,1.430275,-1.124558,0.082704,-0.327145,Resort
2819,0.756448,1.033417,2.032344,-0.020211,Hotel
1750,-1.433488,-1.124558,-0.847806,0.286722,Guest House
1415,0.082622,0.314092,0.127014,0.798278,Resort
2443,0.082622,0.314092,-0.537636,1.0029,Hotel
1501,-0.085835,-1.124558,0.525804,0.695967,Guest House
2824,0.587992,1.033417,1.810794,0.798278,Hotel
1356,-0.085835,-1.124558,-0.847806,-0.634078,Guest House


In [359]:
df_std_scaled.to_csv("procesed_data.csv", index=False)

In [360]:
df_onehot = pd.get_dummies(df_std_scaled, columns=['property_type'])
df_onehot

Unnamed: 0,site_review_rating,hotel_star_rating,site_review_count,room_count,property_type_Beach Hut,property_type_BnB,property_type_Bungalow,property_type_Cottage,property_type_Guest House,property_type_Homestay,property_type_Hostel,property_type_Hotel,property_type_Houseboat,property_type_Lodge,property_type_Motel,property_type_Palace,property_type_Resort,property_type_Service Apartment,property_type_Tent,property_type_Villa
0,0.419535,0.314092,2.874234,-0.020211,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,1.261818,-1.124558,-0.626256,0.082100,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
2,-2.107315,-1.124558,-0.892116,-0.224834,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
5,0.251078,1.033417,-0.271776,-0.224834,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
8,-1.265031,-0.405233,-0.936426,-1.554878,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3984,0.756448,-1.124558,0.437184,-0.941011,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
3988,-0.422748,-1.124558,-0.847806,0.286722,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
3989,1.598732,1.033417,1.146144,1.923700,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
3990,-1.265031,-1.124558,-0.892116,-1.145634,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False


In [361]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y = df_onehot['site_review_rating']
# Features (X) are all columns except the target
X = df_onehot.drop('site_review_rating', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 5. Make predictions on the test data
y_pred = model.predict(X_test)

# 6. Evaluate the model's performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Model Evaluation Results:")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R²): {r2:.4f}")

Model Evaluation Results:
Mean Absolute Error (MAE): 0.8057
Mean Squared Error (MSE): 1.0567
R-squared (R²): -0.1277


In [357]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [352]:
X = df_std_scaled.drop(["site_review_rating", "property_type"], axis=1)
y = df_std_scaled["site_review_rating"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [353]:
model = LinearRegression()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.05272004538187214

In [354]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate_regression_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    print("📊 Regression Evaluation Metrics")
    print(f"MAE  (Mean Absolute Error):      {mae:.4f}")
    print(f"MSE  (Mean Squared Error):       {mse:.4f}")
    print(f"RMSE (Root Mean Squared Error):  {rmse:.4f}")
    print(f"R²   (R-squared Score):          {r2:.4f}")
    
    return {"MAE": mae, "MSE": mse, "RMSE": rmse, "R2": r2}


In [355]:
# Misalnya kamu sudah punya y_test dan y_pred hasil dari model.predict()
y_pred = model.predict(X_test)

# Evaluasi
metrics_result = evaluate_regression_model(y_test, y_pred)


📊 Regression Evaluation Metrics
MAE  (Mean Absolute Error):      0.7624
MSE  (Mean Squared Error):       0.8877
RMSE (Root Mean Squared Error):  0.9422
R²   (R-squared Score):          0.0527
