In [4]:
# Load the cleaned datasets
%store -r train_df_cleaned
%store -r test_df_cleaned

print("Train shape:", train_df_cleaned.shape)
print("Test shape:", test_df_cleaned.shape)

Train shape: (54110, 34)
Test shape: (20000, 32)


In [5]:
# Group the model inputs by the kind of preprocessing each group will receive.
numeric_features = [
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'review_scores_rating',
    'number_of_reviews',
    'month',
]
binary_features = ['has_wifi', 'host_verified', 'host_has_pic', 'instant_bookable']
categorical_features = ['room_type', 'property_type', 'cancellation_policy', 'city']

# Full feature list, in a fixed order: numeric, then binary, then categorical.
feature_cols = [*numeric_features, *binary_features, *categorical_features]

# Select the same feature columns from both frames; the regression target is
# the 'log_price' column of the training frame (no target is taken from test).
X_train, X_test = train_df_cleaned[feature_cols], test_df_cleaned[feature_cols]
y_train = train_df_cleaned['log_price']

# Report the resulting dimensions and the exact column order used.
print("Features shape:", X_train.shape)
print("Target shape:", y_train.shape)
print("\nFeature columns:", feature_cols)

Features shape: (54110, 15)
Target shape: (54110,)

Feature columns: ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'review_scores_rating', 'number_of_reviews', 'month', 'has_wifi', 'host_verified', 'host_has_pic', 'instant_bookable', 'room_type', 'property_type', 'cancellation_policy', 'city']


In [7]:
from pathlib import Path

import joblib
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler

# Build one preprocessing sub-pipeline per feature type, combine them in a
# ColumnTransformer, fit a linear regression on top, report training-set
# metrics and persist the fitted pipeline to disk.

# Where the fitted pipeline is written; defined once so the dump call and the
# confirmation message cannot drift apart.
MODEL_PATH = '../models/airbnb_price_model.joblib'

# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Binary columns: a missing value is filled with 0, then min-max scaled.
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', MinMaxScaler())
])

# Categorical columns: missing values become the literal category 'missing',
# then the column is one-hot encoded with the first category dropped.
#
# The former `sparse=False` argument is intentionally not passed: scikit-learn
# renamed it to `sparse_output` in 1.2 and removed the old name in 1.4, so
# either spelling fails on some versions. Dense output is instead requested on
# the ColumnTransformer below via `sparse_threshold=0`.
#
# NOTE(review): with drop='first' and handle_unknown='ignore', a category not
# seen during fit is encoded as all zeros, i.e. identically to the dropped
# first category (scikit-learn warns about this at transform time). Confirm
# this is acceptable for unseen categories in the test data.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

# Route each feature group to its transformer. sparse_threshold=0 makes the
# combined output always dense, matching what the regressor received before.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('bin', binary_transformer, binary_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough',
    sparse_threshold=0
)

# Preprocessing and estimator live in a single pipeline so the saved artifact
# applies the exact same transformations at prediction time.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Fit the model
print("Training the model...")
pipe.fit(X_train, y_train)

# Evaluate on the data the model was fitted on. These are in-sample figures
# (on the log-price target) and therefore an optimistic estimate of accuracy.
y_pred_train = pipe.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
train_r2 = r2_score(y_train, y_pred_train)

print("\nModel Performance on Training Data:")
print(f"RMSE (log price): {train_rmse:.4f}")
print(f"R² Score: {train_r2:.4f}")

# Save the trained model. The target directory is created first so a missing
# '../models' folder does not raise FileNotFoundError after training finished.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipe, MODEL_PATH)
print(f"\nModel saved to '{MODEL_PATH}'")

Training the model...





Model Performance on Training Data:
RMSE (log price): 0.4698
R² Score: 0.5619

Model saved to '../models/airbnb_price_model.joblib'
