<a href="https://colab.research.google.com/github/IlyaZutler/Project-3-Berlin-Airbnb-Ratings/blob/main/8%20Fine%20Tuning%20with%20Cross-Validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

# Load the California housing dataset as a feature frame and a target series
data = fetch_california_housing()
X, y = pd.DataFrame(data.data, columns=data.feature_names), pd.Series(data.target, name='target')

# Identify categorical and numerical features by dtype
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_features = X.select_dtypes(include=['number']).columns.tolist()

# Preprocessor for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        # Pipeline for numerical features: imputing missing values with mean and standard scaling
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), num_features),

        # Pipeline for categorical features: imputing missing values with 'missing' and One-Hot Encoding
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
        ]), cat_features)
    ])

# Define the model
model = XGBRegressor(random_state=42)

# Create a pipeline that combines preprocessing and model, so the imputer and
# scaler are re-fitted on the training part of every fold (no test-fold leakage)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Define cross-validation strategy: 5 shuffled folds with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Scoring metrics.
# The built-in scorer names are used instead of
# make_scorer(mean_squared_error, squared=False): the `squared` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6, so the old RMSE scorer
# fails on current releases. Built-in error scorers are negated ("neg_*") so
# that "greater is better" holds; the sign is flipped back when the scores are
# extracted below.
scoring = {
    'mse': 'neg_mean_squared_error',        # -MSE
    'rmse': 'neg_root_mean_squared_error',  # -RMSE
    'r2': 'r2'                              # R^2
}

# Perform cross-validation
cv_results = cross_validate(pipeline, X, y, cv=kf, scoring=scoring)

# Extract the scores, undoing the negation of the error metrics so the reported
# MSE / RMSE values are the usual non-negative quantities
mse_scores = -cv_results['test_mse']
rmse_scores = -cv_results['test_rmse']
r2_scores = cv_results['test_r2']

# Output results
print("Cross-validated MSE scores for each fold:", mse_scores)
print("Mean cross-validated MSE:", np.mean(mse_scores))
print("\nCross-validated RMSE scores for each fold:", rmse_scores)
print("Mean cross-validated RMSE:", np.mean(rmse_scores))
print("\nCross-validated R² scores for each fold:", r2_scores)
print("Mean cross-validated R²:", np.mean(r2_scores))


Cross-validated MSE scores for each fold: [0.22124934 0.22509293 0.22831866 0.20121234 0.2240755 ]
Mean cross-validated MSE: 0.2199897533755703

Cross-validated RMSE scores for each fold: [0.47037149 0.4744396  0.47782702 0.44856698 0.47336614]
Mean cross-validated RMSE: 0.46891424500555756

Cross-validated R² scores for each fold: [0.83116009 0.83522882 0.82451226 0.8490521  0.83352332]
Mean cross-validated R²: 0.8346953181878028


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error, r2_score

# Fetch the California housing data and wrap it in pandas containers
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name='target')

# Partition the column names by dtype so each group gets its own preprocessing
num_features = X.select_dtypes(include=['number']).columns.tolist()
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Numerical branch: mean imputation followed by standardisation
numeric_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode
categorical_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
])

# Route each column group to its branch
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_steps, num_features),
        ('cat', categorical_steps, cat_features),
    ])

# Gradient-boosted regressor with a fixed seed
model = XGBRegressor(random_state=42)

# Preprocessing and model chained into one estimator
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Five shuffled folds with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold metric accumulators, one list per (split, metric) pair
train_mse_scores, train_rmse_scores, train_r2_scores = [], [], []
test_mse_scores, test_rmse_scores, test_r2_scores = [], [], []

# Manual cross-validation: refit on every training split, score both splits
for train_index, test_index in kf.split(X):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Refit the whole pipeline on this fold's training rows only
    pipeline.fit(X_train, y_train)

    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)

    # Training-split metrics (RMSE is the square root of the fold's MSE)
    train_mse = mean_squared_error(y_train, y_train_pred)
    train_rmse = np.sqrt(train_mse)
    train_r2 = r2_score(y_train, y_train_pred)
    train_mse_scores.append(train_mse)
    train_rmse_scores.append(train_rmse)
    train_r2_scores.append(train_r2)

    # Held-out-split metrics
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(y_test, y_test_pred)
    test_mse_scores.append(test_mse)
    test_rmse_scores.append(test_rmse)
    test_r2_scores.append(test_r2)

# Report: (caption, value) pairs printed in order, one line each
report = [
    ("Training MSE scores for each fold:", train_mse_scores),
    ("Mean Training MSE:", np.mean(train_mse_scores)),
    ("Test MSE scores for each fold:", test_mse_scores),
    ("Mean Test MSE:", np.mean(test_mse_scores)),
    ("\nTraining RMSE scores for each fold:", train_rmse_scores),
    ("Mean Training RMSE:", np.mean(train_rmse_scores)),
    ("Test RMSE scores for each fold:", test_rmse_scores),
    ("Mean Test RMSE:", np.mean(test_rmse_scores)),
    ("\nTraining R² scores for each fold:", train_r2_scores),
    ("Mean Training R²:", np.mean(train_r2_scores)),
    ("Test R² scores for each fold:", test_r2_scores),
    ("Mean Test R²:", np.mean(test_r2_scores)),
]
for caption, value in report:
    print(caption, value)


Training MSE scores for each fold: [0.0762031714704277, 0.07277610191850484, 0.07266859230986127, 0.07662383155648843, 0.07174400696554749]
Mean Training MSE: 0.07400314084416595
Test MSE scores for each fold: [0.22124933519936377, 0.2250929324507143, 0.2283186604243863, 0.20121233856656748, 0.22407550023681974]
Mean Test MSE: 0.2199897533755703

Training RMSE scores for each fold: [0.27604921928965437, 0.269770461538147, 0.2695711266249805, 0.27681010017065566, 0.2678507176871988]
Mean Training RMSE: 0.2720103250621272
Test RMSE scores for each fold: [0.47037148638003534, 0.474439598316492, 0.4778270193536426, 0.4485669833665508, 0.4733661376110671]
Mean Test RMSE: 0.46891424500555756

Training R² scores for each fold: [0.9429949031197769, 0.9449770869110796, 0.9457354475167609, 0.9424380734055212, 0.945973361336383]
Mean Training R²: 0.9444237744579043
Test R² scores for each fold: [0.8311600890461159, 0.8352288168082878, 0.8245122636548919, 0.8490520970341059, 0.8335233243956125]
Me

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
!pip install catboost -q
from catboost import CatBoostRegressor
from sklearn.datasets import fetch_california_housing

# Load dataset
data = fetch_california_housing()
X, y = pd.DataFrame(data.data, columns=data.feature_names), pd.Series(data.target, name='target')

# Convert some features to categorical for demonstration purposes
# In real scenarios, you would use your actual categorical features
X['MedInc_cat'] = pd.qcut(X['MedInc'], 4, labels=False)
cat_features = ['MedInc_cat']  # Example of categorical features

# Define the model
model = CatBoostRegressor(cat_features=cat_features, random_state=42, verbose=0)

# Define cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store results
train_mse_scores, test_mse_scores = [], []
train_rmse_scores, test_rmse_scores = [], []
train_r2_scores, test_r2_scores = [], []

# Cross-validation loop
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Predict on both training and test data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calculate and store the metrics
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)

    train_rmse = np.sqrt(train_mse)
    test_rmse = np.sqrt(test_mse)

    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    train_mse_scores.append(train_mse)
    test_mse_scores.append(test_mse)

    train_rmse_scores.append(train_rmse)
    test_rmse_scores.append(test_rmse)

    train_r2_scores.append(train_r2)
    test_r2_scores.append(test_r2)

# Output results
print("Training MSE scores for each fold:", train_mse_scores)
print("Mean Training MSE:", np.mean(train_mse_scores))
print("Test MSE scores for each fold:", test_mse_scores)
print("Mean Test MSE:", np.mean(test_mse_scores))

print("\nTraining RMSE scores for each fold:", train_rmse_scores)
print("Mean Training RMSE:", np.mean(train_rmse_scores))
print("Test RMSE scores for each fold:", test_rmse_scores)
print("Mean Test RMSE:", np.mean(test_rmse_scores))

print("\nTraining R² scores for each fold:", train_r2_scores)
print("Mean Training R²:", np.mean(train_r2_scores))
print("Test R² scores for each fold:", test_r2_scores)
print("Mean Test R²:", np.mean(test_r2_scores))


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hTraining MSE scores for each fold: [0.11823804450125669, 0.11759904057109931, 0.11545677354299586, 0.12286598692104243, 0.1168219211887177]
Mean Training MSE: 0.1181963533450224
Test MSE scores for each fold: [0.19949041398454545, 0.1980969006953708, 0.20157876012545273, 0.1846163184784215, 0.20823236199596035]
Mean Test MSE: 0.19840295105595018

Training RMSE scores for each fold: [0.3438575933453509, 0.34292716511104704, 0.339789307576027, 0.3505224485265422, 0.34179221932150194]
Mean Training RMSE: 0.3437777467760938
Test RMSE scores for each fold: [0.44664349764050687, 0.445080779966256, 0.44897523330964784, 0.42967001114625336, 0.45632484262415557]
Mean Test RMSE: 0.445338872937364

Training R² scores for each fold: [0.9115499912711909, 0.9110883707960906, 0.913783796433619, 0.9076996963419489, 0.9120275547603124]
Mean Training R²: 0.9112298819206324
Test R² 