In [23]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.datasets import load_iris

from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor

# Generate synthetic real estate data
np.random.seed(0)  # For reproducibility

num_samples = 1000

# Features
size = np.random.randint(1000, 5000, num_samples)  # Size of the house (in square feet)
num_bedrooms = np.random.randint(2, 6, num_samples)  # Number of bedrooms
num_bathrooms = np.random.randint(1, 4, num_samples)  # Number of bathrooms
year_built = np.random.randint(1950, 2020, num_samples)  # Year the house was built
garage_spaces = np.random.randint(0, 3, num_samples)  # Number of garage spaces
has_pool = np.random.choice([0, 1], size=num_samples, p=[0.8, 0.2])  # Whether the house has a pool

# Target variable (price)
price = 50000 * size + 30000 * num_bedrooms + 20000 * num_bathrooms + 10000 * (2022 - year_built) + \
        15000 * garage_spaces + 50000 * has_pool + np.random.normal(0, 50000, num_samples)

# Create DataFrame
data = pd.DataFrame({
    'Size': size,
    'Bedrooms': num_bedrooms,
    'Bathrooms': num_bathrooms,
    'YearBuilt': year_built,
    'GarageSpaces': garage_spaces,
    'HasPool': has_pool,
    'Price': price
})

# Separate features (X) and target variable (y)
X = data.drop(columns=['Price'])
y = data['Price']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

df = pd.DataFrame(X_scaled, columns=X.columns)

# Display the first few rows of the preprocessed features
print("Preprocessed Features:")
print(df.head())

Preprocessed Features:
       Size  Bedrooms  Bathrooms  YearBuilt  GarageSpaces   HasPool
0  0.643067  0.446879  -1.258811  -0.864921      0.055101 -0.517134
1  0.533564  0.446879  -0.021042   0.211643      0.055101 -0.517134
2 -0.302164 -1.326450  -1.258811   0.700990     -1.169361 -0.517134
3  1.109112  1.333543  -0.021042   1.288206      1.279563 -0.517134
4 -1.018751  0.446879  -1.258811  -1.354268      0.055101 -0.517134


In [24]:
def run_ml_pipeline(X_train, X_test, y_train, y_test, models):
    """
    Run a pipeline of machine learning models on preprocessed data.

    Parameters:
    - X: Preprocessed features
    - y: Target variable
    - models: List of tuples (model_name, model_instance, model_parameters)
    """

    score_test = []
    score_traning = []
    model_nm = []

    for model_name, model, params in models:
        # Create a pipeline for each model
        pipeline = Pipeline([
            ('model', model(**params))
        ])
        model.fit(X_train, y_train_encoded)
        train_score = model.score(X_train_scaled,y_train_encoded)
        test_score = model.score(X_test_scaled,y_test_encoded)
        model_nm.append(model_name)
        score_traning.append(train_score)
        score_test.append(test_score)
    result = pd.DataFrame({'Model Name':model_nm, 'Training Score':score_traning,
            'Test_score':score_test})
    print(result)
        
        # # Evaluate the model using cross-validation
        # scores = cross_val_score(pipeline, X, y, cv=5)
        
        # # Print the results
        # print(f"Model: {model_name}")
        # print(f"Mean Accuracy: {scores.mean():.2f}")
        # print(f"Standard Deviation: {scores.std():.2f}")
        # print("-" * 40)

# Define the list of models to run in the pipeline
models = [
    ('Random Forest', RandomForestRegressor, {'n_estimators': 100}),
    ('Gradient Boosting', GradientBoostingRegressor, {'n_estimators': 100}),
    ('Ridge Regression', Ridge, {'alpha': 1.0}),
    ('Lasso Regression', Lasso, {'alpha': 1.0}),
    ('ElasticNet Regression', ElasticNet, {'alpha': 1.0, 'l1_ratio': 0.5}),
    ('Decision Tree', DecisionTreeRegressor, {'max_depth': 5}),
    ('Extra Trees', ExtraTreesRegressor, {'n_estimators': 100}),
    ('KNN', KNeighborsRegressor, {'n_neighbors': 5}),
    ('Gaussian Process', GaussianProcessRegressor, {})
]

run_ml_pipeline(df, y, models)

Model: Random Forest
Mean Accuracy: 1.00
Standard Deviation: 0.00
----------------------------------------
Model: Gradient Boosting
Mean Accuracy: 1.00
Standard Deviation: 0.00
----------------------------------------
Model: Ridge Regression
Mean Accuracy: 1.00
Standard Deviation: 0.00
----------------------------------------
Model: Lasso Regression
Mean Accuracy: 1.00
Standard Deviation: 0.00
----------------------------------------
Model: ElasticNet Regression
Mean Accuracy: 0.89
Standard Deviation: 0.00
----------------------------------------
Model: Decision Tree
Mean Accuracy: 1.00
Standard Deviation: 0.00
----------------------------------------
Model: Extra Trees
Mean Accuracy: 1.00
Standard Deviation: 0.00
----------------------------------------
Model: KNN
Mean Accuracy: 0.93
Standard Deviation: 0.01
----------------------------------------
Model: Gaussian Process
Mean Accuracy: 0.96
Standard Deviation: 0.02
----------------------------------------
