In [1]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor

# Build a reproducible synthetic real estate dataset
np.random.seed(0)  # Fixed seed so every fresh run draws the same samples

num_samples = 1000

# Feature columns. The order of the draws below fixes the random stream,
# so it must stay as-is to reproduce the same data.
size = np.random.randint(1000, 5000, num_samples)  # House size in square feet (upper bound exclusive)
num_bedrooms = np.random.randint(2, 6, num_samples)  # 2 to 5 bedrooms
num_bathrooms = np.random.randint(1, 4, num_samples)  # 1 to 3 bathrooms
year_built = np.random.randint(1950, 2020, num_samples)  # Construction year, 1950 to 2019
garage_spaces = np.random.randint(0, 3, num_samples)  # 0 to 2 garage spaces
has_pool = np.random.choice([0, 1], size=num_samples, p=[0.8, 0.2])  # 1 with probability 0.2

# Target variable: a linear combination of the features plus Gaussian noise
price = (
    50000 * size
    + 30000 * num_bedrooms
    + 20000 * num_bathrooms
    + 10000 * (2022 - year_built)
    + 15000 * garage_spaces
    + 50000 * has_pool
    + np.random.normal(0, 50000, num_samples)
)

# Assemble features and target into one table
data = pd.DataFrame(
    {
        'Size': size,
        'Bedrooms': num_bedrooms,
        'Bathrooms': num_bathrooms,
        'YearBuilt': year_built,
        'GarageSpaces': garage_spaces,
        'HasPool': has_pool,
        'Price': price,
    }
)

# Features (X) are every column except the target (y)
X, y = data.drop(columns=['Price']), data['Price']

# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: the scaler statistics come from the training
# split only and are then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The scaler returns a plain array here; re-attach the column names for display
df = pd.DataFrame(X_train_scaled, columns=X.columns)

# Show the first few rows of the preprocessed features
print("Preprocessed Features:", df.head(), sep="\n")

Preprocessed Features:
       Size  Bedrooms  Bathrooms  YearBuilt  GarageSpaces   HasPool
0  0.134649  0.436544  -1.299886   0.346774      1.302937 -0.507801
1  0.659080  0.436544  -0.052993  -1.594237     -1.158337  1.969276
2 -0.482067 -1.345268  -1.299886   1.462855      1.302937 -0.507801
3 -0.988751  0.436544  -0.052993  -1.545712      0.072300 -0.507801
4  0.422155  1.327450   1.193900   0.152673      1.302937 -0.507801


In [8]:
def run_ml_pipeline(X_train, X_test, y_train, y_test, models, use_cross_validation=True):
    """
    Evaluate several estimators on already-preprocessed data and summarize the scores.

    Each entry of `models` is instantiated, wrapped in a single-step Pipeline and
    scored either with 5-fold cross-validation on the training data or with one
    fit followed by scoring on the training and test sets. Per-model scores and
    a summary table are printed, and the summary table is returned.

    Scores come from the estimator's default `score` method because no explicit
    `scoring` is passed (in scikit-learn this is R^2 for regressors and
    accuracy for classifiers).

    Parameters:
    - X_train: Preprocessed training features
    - X_test: Preprocessed test features (only used when use_cross_validation is False)
    - y_train: Training target variable
    - y_test: Test target variable (only used when use_cross_validation is False)
    - models: Iterable of tuples (model_name, model_class, model_parameters).
      model_class is called as model_class(**model_parameters); model_parameters
      may be None or an empty dict to use the estimator's defaults.
    - use_cross_validation: If True, report the mean and standard deviation of
      the 5-fold cross-validation scores; otherwise fit once and report the
      training and test scores.

    Returns:
    - pandas.DataFrame with columns 'Model Name', 'Training Score' and
      'Test Score', one row per model. In cross-validation mode
      'Training Score' holds the mean cross-validation score and
      'Test Score' is NaN.
    """

    columns = ['Model Name', 'Training Score', 'Test Score']
    rows = []

    for model_name, model, params in models:
        # Create a pipeline for each model; None means "no constructor arguments"
        pipeline = Pipeline([
            ('model', model(**(params or {})))
        ])

        print(f"Model: {model_name}")
        if use_cross_validation:
            # Evaluate the model using cross-validation on the training data only
            scores = cross_val_score(pipeline, X_train, y_train, cv=5)
            mean_score = scores.mean()
            std_score = scores.std()
            # Labelled as a generic score: it is not an accuracy for regressors
            print(f"Mean CV Score: {mean_score:.2f}")
            print(f"Standard Deviation: {std_score}")
            # Cross-validation doesn't provide test scores
            train_score, test_score = mean_score, np.nan
        else:
            # Fit the model and compute scores on training and test sets
            train_score = pipeline.fit(X_train, y_train).score(X_train, y_train)
            test_score = pipeline.score(X_test, y_test)
            print(f"Training Score: {train_score:.2f}")
            print(f"Test Score: {test_score:.2f}")
        print("-" * 40)

        rows.append((model_name, train_score, test_score))

    # Explicit columns keep the table shape stable even when `models` is empty
    result = pd.DataFrame(rows, columns=columns)
    print(result)
    return result

# Candidate regressors to compare, one tuple per model:
# (display name, estimator class, constructor keyword arguments)
models = [
    ('Random Forest', RandomForestRegressor, dict(n_estimators=100)),
    ('Gradient Boosting', GradientBoostingRegressor, dict(n_estimators=100)),
    ('Ridge Regression', Ridge, dict(alpha=1.0)),
    ('Lasso Regression', Lasso, dict(alpha=1.0)),
    ('ElasticNet Regression', ElasticNet, dict(alpha=1.0, l1_ratio=0.5)),
    ('Decision Tree', DecisionTreeRegressor, dict(max_depth=5)),
    ('Extra Trees', ExtraTreesRegressor, dict(n_estimators=100)),
    ('KNN', KNeighborsRegressor, dict(n_neighbors=5)),
    ('Gaussian Process', GaussianProcessRegressor, dict()),
]

# Score every candidate with cross-validation on the scaled training split
run_ml_pipeline(
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    models=models,
    use_cross_validation=True,
)

Model: Random Forest
Mean Accuracy: 1.00
Standard Deviation: 9.284923499238645e-06
----------------------------------------
Model: Gradient Boosting
Mean Accuracy: 1.00
Standard Deviation: 1.5149822224095716e-05
----------------------------------------
Model: Ridge Regression
Mean Accuracy: 1.00
Standard Deviation: 2.4336335721415574e-07
----------------------------------------
Model: Lasso Regression
Mean Accuracy: 1.00
Standard Deviation: 7.474631241926018e-08
----------------------------------------
Model: ElasticNet Regression
Mean Accuracy: 0.89
Standard Deviation: 0.0018306667191352543
----------------------------------------
Model: Decision Tree
Mean Accuracy: 1.00
Standard Deviation: 4.54122054475033e-05
----------------------------------------
Model: Extra Trees
Mean Accuracy: 1.00
Standard Deviation: 1.3361107792594644e-06
----------------------------------------
Model: KNN
Mean Accuracy: 0.92
Standard Deviation: 0.008744242001102652
----------------------------------------
M