In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Import various models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [2]:
# Load the datasets
train_df = pd.read_csv("train.csv")
validation_df = pd.read_csv("validation.csv")

In [3]:
# Basic exploration
print(f"Training set shape: {train_df.shape}")
print(f"Validation set shape: {validation_df.shape}")

Training set shape: (1200, 8)
Validation set shape: (300, 7)


In [7]:
train_df.dtypes

house_id                  int64
city                     object
sale_price                int64
sale_date        datetime64[ns]
months_listed           float64
bedrooms                  int64
house_type               object
area                    float64
dtype: object

In [6]:
train_df['sale_date'] = pd.to_datetime(train_df['sale_date'])

In [8]:
train_df.head()

Unnamed: 0,house_id,city,sale_price,sale_date,months_listed,bedrooms,house_type,area
0,1634561,Teasdale,401869,2021-12-14,7.0,6,Detached,519.7
1,1009770,Silvertown,372387,2022-09-11,8.1,6,Detached,507.8
2,1946667,Silvertown,325473,2020-08-19,5.4,5,Detached,466.8
3,1798290,Silvertown,349469,2022-12-10,6.4,5,Detached,499.4
4,1533461,Poppleton,199995,2020-04-07,4.3,4,Detached,335.0


In [27]:
# Define features
numeric_features = ['house_id', 'bedrooms', 'area'] 
                   
categorical_features = ['city', 'house_type']

In [28]:
# Prepare the data
X = train_df[numeric_features + categorical_features]
y = train_df['sale_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
# Create preprocessor
# numeric_transformer = Pipeline(steps=[
#    ('imputer', SimpleImputer(strategy='median')),
#    ('scaler', StandardScaler())
#])

categorical_transformer = Pipeline(steps=[
#    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
#        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [30]:
# Function to evaluate models
def evaluate_model(model_name, model, X_train, X_test, y_train, y_test):
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    # Fit
    pipeline.fit(X_train, y_train)
    
    # Predict
    y_pred = pipeline.predict(X_test)
    
    # Evaluate
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Cross-validation score
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
    cv_rmse = np.sqrt(-cv_scores.mean())
    
    print(f"{model_name}:")
    print(f"  RMSE: ${rmse:.2f}")
    print(f"  MAE: ${mae:.2f}")
    print(f"  R²: {r2:.4f}")
    print(f"  CV RMSE: ${cv_rmse:.2f}")
    print("-----------------------------")
    
    return pipeline, rmse, r2, cv_rmse

In [31]:
# Define models to evaluate
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.01),
    "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "SVR": SVR(kernel='rbf', C=1.0, epsilon=0.1),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
   # "LightGBM": LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}

In [32]:
# Evaluate all models
results = {}
best_rmse = float('inf')
best_model_name = None
best_pipeline = None

for name, model in models.items():
    try:
        print(f"Training {name}...")
        pipeline, rmse, r2, cv_rmse = evaluate_model(name, model, X_train, X_test, y_train, y_test)
        results[name] = {'rmse': rmse, 'r2': r2, 'cv_rmse': cv_rmse, 'pipeline': pipeline}
        
        if cv_rmse < best_rmse:
            best_rmse = cv_rmse
            best_model_name = name
            best_pipeline = pipeline
            
    except Exception as e:
        print(f"Error with {name}: {e}")

print(f"\nBest model: {best_model_name} with CV RMSE: ${best_rmse:.2f}")

Training Linear Regression...
Linear Regression:
  RMSE: $68653.76
  MAE: $54007.45
  R²: 0.6773
  CV RMSE: $67382.04
-----------------------------
Training Ridge Regression...
Ridge Regression:
  RMSE: $68654.27
  MAE: $53976.07
  R²: 0.6773
  CV RMSE: $67382.45
-----------------------------
Training Lasso Regression...
Lasso Regression:
  RMSE: $68653.76
  MAE: $54007.45
  R²: 0.6773
  CV RMSE: $67382.04
-----------------------------
Training ElasticNet...
ElasticNet:
  RMSE: $70508.99
  MAE: $55281.67
  R²: 0.6597
  CV RMSE: $69192.60
-----------------------------
Training Random Forest...
Random Forest:
  RMSE: $67629.07
  MAE: $51731.41
  R²: 0.6869
  CV RMSE: $66337.37
-----------------------------
Training Gradient Boosting...
Gradient Boosting:
  RMSE: $67600.80
  MAE: $51682.30
  R²: 0.6872
  CV RMSE: $66337.41
-----------------------------
Training SVR...
SVR:
  RMSE: $120880.21
  MAE: $101785.91
  R²: -0.0003
  CV RMSE: $117539.41
-----------------------------
Training KNN..

In [33]:
# Create results dataframe for easy comparison
comparison_df = pd.DataFrame({
    'Model': list(results.keys()), 
    'RMSE': [results[model]['rmse'] for model in results],
    'R²': [results[model]['r2'] for model in results],
    'CV RMSE': [results[model]['cv_rmse'] for model in results]
}).sort_values('CV RMSE')

print("\nModel Comparison:")
comparison_df


Model Comparison:


Unnamed: 0,Model,RMSE,R²,CV RMSE
4,Random Forest,67629.07161,0.686902,66337.368339
5,Gradient Boosting,67600.798839,0.687164,66337.413281
8,XGBoost,67605.549595,0.68712,66339.156703
0,Linear Regression,68653.763193,0.677343,67382.044577
2,Lasso Regression,68653.7634,0.677343,67382.044775
1,Ridge Regression,68654.272047,0.677338,67382.449397
3,ElasticNet,70508.989339,0.659669,69192.597369
7,KNN,80665.226532,0.554564,72540.566142
6,SVR,120880.213152,-0.000284,117539.408618


In [34]:
# Hyperparameter tuning for the best model
if best_model_name == "Random Forest":
    print("\nPerforming hyperparameter tuning for Random Forest...")
    param_grid = {
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [None, 10, 20, 30],
        'model__min_samples_split': [2, 5, 10]
    }
    
elif best_model_name == "Gradient Boosting":
    print("\nPerforming hyperparameter tuning for Gradient Boosting...")
    param_grid = {
        'model__n_estimators': [100, 200, 300],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 7]
    }
    
elif best_model_name == "XGBoost":
    print("\nPerforming hyperparameter tuning for XGBoost...")
    param_grid = {
        'model__n_estimators': [100, 200, 300],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 7],
        'model__colsample_bytree': [0.7, 0.8, 0.9]
    }
    
elif best_model_name == "LightGBM":
    print("\nPerforming hyperparameter tuning for LightGBM...")
    param_grid = {
        'model__n_estimators': [100, 200, 300],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__num_leaves': [31, 50, 70],
        'model__feature_fraction': [0.7, 0.8, 0.9]
    }
    
else:
    # Default to Ridge if best model is not one of the above
    print(f"\nPerforming hyperparameter tuning for {best_model_name}...")
    param_grid = {
        'model__alpha': [0.01, 0.1, 1.0, 10.0, 100.0]
    }

# GridSearch with cross-validation
grid_search = GridSearchCV(
    best_pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

grid_search.fit(X, y)


Performing hyperparameter tuning for Random Forest...


In [35]:
# Best parameters and score
print("Best parameters:", grid_search.best_params_)
best_cv_rmse = np.sqrt(-grid_search.best_score_)
print(f"Best CV RMSE: ${best_cv_rmse:.2f}")

Best parameters: {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Best CV RMSE: $66337.37


In [36]:
# Final model with best parameters
final_model = grid_search.best_estimator_
final_model

In [37]:
# Make predictions on validation set
validation_features = validation_df[numeric_features + categorical_features]
validation_predictions = final_model.predict(validation_features)

In [41]:
# Create the result dataframe
base_result = pd.DataFrame({
    'house_id': validation_df['house_id'],
    'price': validation_predictions
})

print("\nFinal predictions:")
base_result.head()


Final predictions:


Unnamed: 0,house_id,price
0,1331375,73922.377409
1,1630115,331841.76776
2,1645745,291001.132192
3,1336775,90475.083759
4,1888274,291001.132192


In [43]:
# Feature importance (if applicable)
if best_model_name in ["Random Forest", "Gradient Boosting", "XGBoost", "LightGBM"]:
    try:
        # Get feature names after preprocessing
        feature_names = []
        for name, transformer, features in preprocessor.transformers_:
            if hasattr(transformer, 'get_feature_names_out'):
                if name == 'cat':
                    feature_names.extend(transformer.get_feature_names_out(features))
                else:
                    feature_names.extend(features)
            else:
                feature_names.extend(features)
        
        # Get importances
        importances = final_model.named_steps['model'].feature_importances_
        
        # Create feature importance dataframe
        if len(importances) == len(feature_names):
            importance_df = pd.DataFrame({
                'Feature': feature_names,
                'Importance': importances
            }).sort_values('Importance', ascending=False)
            
            print("\nFeature Importance:")
            print(importance_df.head(10))
            
            # Plot feature importance
            plt.figure(figsize=(12, 6))
            plt.bar(importance_df['Feature'][:10], importance_df['Importance'][:10])
            plt.xticks(rotation=45, ha='right')
            plt.title(f'Top 10 Feature Importance - {best_model_name}')
            plt.tight_layout()
            plt.savefig('feature_importance.png')
    except Exception as e:
        print("Could not calculate feature importance:", e)