<a href="https://colab.research.google.com/github/IlyaZutler/Project-3-Berlin-Airbnb-Ratings/blob/main/8%20Fine%20Tuning%20with%20Cross-Validation%202.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

!pip install openpyxl -q
import openpyxl
import pickle

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import  make_scorer, mean_squared_error, mean_absolute_error, r2_score

# CatBoost
!pip install catboost -q
from catboost import CatBoostRegressor, Pool
# XGBoost
import xgboost as xgb
from xgboost import XGBRegressor

# LightGBM
import lightgbm as lgb

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate, KFold

!pip -qq install category_encoders
from category_encoders import TargetEncoder

# Set display options
%matplotlib inline
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', 1000)

In [None]:
# import pickle

!wget https://github.com/IlyaZutler/Project-3-Berlin-Airbnb-Ratings/raw/main/apartments5.pkl -q

with open('apartments5.pkl', 'rb') as f:
    apartments_initial = pickle.load(f)

# **CatBoost**

In [None]:
# Work on an independent (deep) copy so the loaded frame stays untouched
apartments = apartments_initial.copy(deep=True)

In [None]:
# Build the working frame from the untouched original instead of dropping in place:
# an in-place drop raises KeyError when this cell is executed a second time,
# whereas deriving from `apartments_initial` makes the cell safe to re-run.
# NOTE(review): 'Longitude_Bi2n' looks misspelled; presumably it matches the actual
# column name in the pickle - verify before "fixing" it.
apartments = apartments_initial.drop(columns=[
    'Host Since', 'First Review', 'Last Review', 'Reviews',
    'Latitude_Bin', 'Longitude_Bin', 'Latitude_Bin2', 'Longitude_Bi2n',
    'Listing ID', 'Price Log',
])

# Encode the 't'/'f' flag columns as 1/0; missing values are treated as 0
for flag_col in ['Is Superhost', 'Instant Bookable', 'Is Exact Location']:
    apartments[flag_col] = apartments[flag_col].replace({'t': 1, 'f': 0}).fillna(0).astype(int)

# Convert non-numeric columns to categorical
cat_features = apartments.select_dtypes(exclude=['number']).columns
apartments[cat_features] = apartments[cat_features].astype('category')

# Fill NaN values in categorical columns with 'missing'
# (the level has to be registered as a category before it can be used as a fill value)
for col in cat_features:
    if 'missing' not in apartments[col].cat.categories:
        apartments[col] = apartments[col].cat.add_categories('missing')
    apartments[col] = apartments[col].fillna('missing')

num_features = apartments.select_dtypes(include=['number']).columns.tolist()

# Cast the categorical columns to plain strings
apartments[cat_features] = apartments[cat_features].astype(str)

In [None]:
# Separate the target from the predictors
y = apartments['Price']
X = apartments.drop(columns=['Price'])

# Column groups by dtype
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_features = X.select_dtypes(include=['number']).columns.tolist()

# CatBoost regressor; the categorical columns are passed by name
model = CatBoostRegressor(cat_features=cat_features, random_state=42, verbose=0)

# 5-fold shuffled cross-validation with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold metric containers
train_mse_scores, train_rmse_scores, train_r2_scores = [], [], []
test_mse_scores, test_rmse_scores, test_r2_scores = [], [], []

for train_index, test_index in kf.split(X):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Refit on this fold's training rows, then predict on both partitions
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # MSE, its square root, and R² for train and test
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # Record this fold's values
    for fold_scores, fold_value in (
        (train_mse_scores, train_mse),
        (test_mse_scores, test_mse),
        (train_rmse_scores, train_rmse),
        (test_rmse_scores, test_rmse),
        (train_r2_scores, train_r2),
        (test_r2_scores, test_r2),
    ):
        fold_scores.append(fold_value)

# Report per-fold values and their means, one group per metric
for lead, metric_name, train_scores, test_scores in (
    ("", "MSE", train_mse_scores, test_mse_scores),
    ("\n", "RMSE", train_rmse_scores, test_rmse_scores),
    ("\n", "R²", train_r2_scores, test_r2_scores),
):
    print(f"{lead}Training {metric_name} scores for each fold:", train_scores)
    print(f"Mean Training {metric_name}:", np.mean(train_scores))
    print(f"Test {metric_name} scores for each fold:", test_scores)
    print(f"Mean Test {metric_name}:", np.mean(test_scores))


Training MSE scores for each fold: [491.39279332989184, 480.2388728077732, 478.97591852466076, 496.48328908184527, 486.7510437956254]
Mean Training MSE: 486.76838350795924
Test MSE scores for each fold: [660.4415224159895, 687.9739324004692, 713.9858506488222, 696.7097544975894, 618.5787381225125]
Mean Test MSE: 675.5379596170766

Training RMSE scores for each fold: [22.16738129166122, 21.91435312318785, 21.885518465977924, 22.281904969769645, 22.062435128417384]
Mean Training RMSE: 22.062318595802804
Test RMSE scores for each fold: [25.699056839035737, 26.229257183543517, 26.720513667383383, 26.395260076339262, 24.871243196159547]
Mean Test RMSE: 25.983066192492295

Training R² scores for each fold: [0.7139917843789786, 0.7182132595062057, 0.717158946787825, 0.7054971198105272, 0.7209388288418102]
Mean Training R²: 0.7151599878650694
Test R² scores for each fold: [0.6053629883792218, 0.6020569162566827, 0.5971143247448181, 0.6133812076144818, 0.6054259910057711]
Mean Test R²: 0.604668

In [None]:
# Feature importances of `model` in its current fitted state,
# evaluated against a Pool built from the full feature matrix and target
full_pool = Pool(X, y, cat_features=cat_features)
feature_importance = model.get_feature_importance(full_pool)

# One row per input column, most important first
importance_df = (
    pd.DataFrame({'feature': X.columns, 'importance': feature_importance})
    .sort_values(by='importance', ascending=False)
)

# Display the ranked table
importance_df

Unnamed: 0,feature,importance
10,Room Type,12.496155
11,Accomodates,10.877113
13,Bedrooms,8.969312
9,Property Type,6.999996
12,Bathrooms,5.295177
15,Guests Included,4.810831
5,Postal Code,4.437565
35,Distance_from_center,4.303051
3,neighbourhood,4.088789
31,First Review Years,3.534994


# **XGBRegressor**

In [None]:
# Start the XGBoost section from a fresh, independent (deep) copy of the loaded data
apartments = apartments_initial.copy(deep=True)

In [None]:
# Build the working frame from the untouched original instead of dropping in place:
# an in-place drop raises KeyError when this cell is executed a second time,
# whereas deriving from `apartments_initial` makes the cell safe to re-run.
# NOTE(review): 'Longitude_Bi2n' looks misspelled; presumably it matches the actual
# column name in the pickle - verify before "fixing" it.
apartments = apartments_initial.drop(columns=[
    'Host Since', 'First Review', 'Last Review', 'Reviews',
    'Latitude_Bin', 'Longitude_Bin', 'Latitude_Bin2', 'Longitude_Bi2n',
    'Listing ID', 'Price Log',
])

# Encode the 't'/'f' flag columns as 1/0; missing values are treated as 0
for flag_col in ['Is Superhost', 'Instant Bookable', 'Is Exact Location']:
    apartments[flag_col] = apartments[flag_col].replace({'t': 1, 'f': 0}).fillna(0).astype(int)


In [None]:
# Separate the target from the predictors
y = apartments['Price']
X = apartments.drop(columns=['Price'])

# Identify categorical and numerical features
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_features = X.select_dtypes(include=['number']).columns.tolist()

# Numeric branch: mean imputation followed by standardisation
numeric_branch = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode
categorical_branch = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

# Route each column group through its branch
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_branch, num_features),
        ('cat', categorical_branch, cat_features)
    ])

# XGBoost regressor with hand-picked hyper-parameters
model = XGBRegressor(
    colsample_bytree=0.32,
    n_estimators=80,        # 150, 200
    gamma=0.1,              # 0.2 was worse
    subsample=1,            # tried 0.7
    learning_rate=0.2,      # 0.4
    reg_alpha=0,            # L1 regularization
    min_child_weight=1,
    max_depth=6,
    random_state=42,
)

# Preprocessing and model chained into a single estimator
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# 5-fold shuffled cross-validation with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold metric containers
train_mse_scores, train_rmse_scores, train_r2_scores = [], [], []
test_mse_scores, test_rmse_scores, test_r2_scores = [], [], []

for train_index, test_index in kf.split(X):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Refit the whole pipeline on this fold's training rows, then predict on both partitions
    pipeline.fit(X_train, y_train)
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)

    # MSE, its square root, and R² for train and test
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # Record this fold's values
    for fold_scores, fold_value in (
        (train_mse_scores, train_mse),
        (test_mse_scores, test_mse),
        (train_rmse_scores, train_rmse),
        (test_rmse_scores, test_rmse),
        (train_r2_scores, train_r2),
        (test_r2_scores, test_r2),
    ):
        fold_scores.append(fold_value)

# Report per-fold values and their means, one group per metric
for lead, metric_name, train_scores, test_scores in (
    ("", "MSE", train_mse_scores, test_mse_scores),
    ("\n", "RMSE", train_rmse_scores, test_rmse_scores),
    ("\n", "R²", train_r2_scores, test_r2_scores),
):
    print(f"{lead}Training {metric_name} scores for each fold:", train_scores)
    print(f"Mean Training {metric_name}:", np.mean(train_scores))
    print(f"Test {metric_name} scores for each fold:", test_scores)
    print(f"Mean Test {metric_name}:", np.mean(test_scores))


Training MSE scores for each fold: [452.1233974610468, 453.11668404146803, 444.2306604225464, 445.7405575621153, 454.06377890356816]
Mean Training MSE: 449.855015678149
Test MSE scores for each fold: [707.0434247866888, 700.9645459761849, 732.944632446368, 716.397652902389, 638.5291374477396]
Mean Test MSE: 699.175878711874

Training RMSE scores for each fold: [21.263193491595914, 21.286537624552004, 21.076780124643005, 21.11256871065469, 21.308772346232622]
Mean Training RMSE: 21.209570459535644
Test RMSE scores for each fold: [26.590288166672597, 26.475735041282327, 27.072950198424405, 26.765605782466217, 25.269134085831663]
Mean Test RMSE: 26.434742654935445

Training R² scores for each fold: [0.7368479800607572, 0.7341275754857322, 0.7376764404981584, 0.7355965831960474, 0.7396788943003354]
Mean Training R²: 0.7367854947082062
Test R² scores for each fold: [0.5775167145408663, 0.5945427873303137, 0.5864163233774538, 0.6024559816409178, 0.5927001914306302]
Mean Test R²: 0.5907263996

In [None]:
# Fitted steps of the pipeline (they reflect the pipeline's most recent fit)
fitted_preprocessor = pipeline.named_steps['preprocessor']
fitted_model = pipeline.named_steps['model']

# Get feature names after One-Hot Encoding.
# The ColumnTransformer lists the 'num' branch before the 'cat' branch, so the
# transformed matrix holds the numeric columns first and the one-hot columns after them.
ohe_feature_names = fitted_preprocessor.named_transformers_['cat'].get_feature_names_out(cat_features)
all_feature_names = np.concatenate([num_features, ohe_feature_names])

# Read the importances from the fitted XGBoost model itself. Previously this cell
# relied on a `feature_importance` variable left over from another cell, which does
# not describe this model and does not match the number of encoded features.
feature_importance = fitted_model.feature_importances_


def _source_column(encoded_name, candidate_columns):
    """Return the original column that a one-hot feature name was derived from.

    One-hot names have the form '<column>_<category>', so the source column is the
    candidate that is a prefix of the name followed by '_'. `candidate_columns` must
    be ordered longest first so that the most specific column wins. Falls back to
    the encoded name itself when no candidate matches.
    """
    for column in candidate_columns:
        if encoded_name.startswith(f"{column}_"):
            return column
    return encoded_name


# Map every encoded feature back to its source column. A prefix match is used
# rather than a plain substring test, which could attribute a dummy column to the
# wrong source whenever some column name also occurs inside a category label.
cat_by_length = sorted(cat_features, key=len, reverse=True)
source_columns = list(num_features) + [
    _source_column(name, cat_by_length) for name in ohe_feature_names
]

# Create a DataFrame for feature importances (one row per encoded feature)
importance_df = (
    pd.DataFrame({'feature': source_columns, 'importance': feature_importance})
    .sort_values(by='importance', ascending=False)
)

# Total importance per original column, largest first
XGB_imp = importance_df.groupby('feature')['importance'].sum().sort_values(ascending=False)
XGB_imp

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
Bin-Bin,0.274697
Bin2-Bin2,0.150687
Postal Code,0.109488
neighbourhood,0.0955
Property Type,0.079756
Room Type,0.069173
Bedrooms,0.045129
Bathrooms,0.036468
Neighborhood Group,0.02811
Accomodates,0.027937
