## 1. Data Preprocessing

This function loads the dataset from an Excel file, one-hot encodes the categorical features, and splits the data into train and test sets.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

def preprocess_data(file_path, target_column):
    """Load an Excel dataset, one-hot encode its features, and split it.

    Args:
        file_path: Path of the Excel file, passed to ``pd.read_excel``.
        target_column: Name of the column to predict. It is removed from the
            feature matrix and returned separately as the target.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, encoded_columns)`` where
        ``encoded_columns`` is the column index of the encoded feature frame
        (needed later to align new data with the training layout).

    Raises:
        KeyError: If ``target_column`` is not a column of the loaded dataset.
    """
    df = pd.read_excel(file_path)

    # Target and features
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # Select categorical columns from the features only. Selecting them from
    # the full frame would include an object-typed target, which no longer
    # exists in X and would make get_dummies raise a KeyError.
    categorical_features = X.select_dtypes(include=['object']).columns

    # One-hot encode categorical features; drop_first avoids a redundant
    # (perfectly collinear) indicator column per feature.
    X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)

    # 80/20 split with a fixed seed so the split is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X_encoded, y, test_size=0.2, random_state=42
    )

    return X_train, X_test, y_train, y_test, X_encoded.columns

## 2. Model Training and Evaluation

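This function trains each candidate model, tunes its hyperparameters with a randomized search when a parameter grid is supplied, and evaluates it on both the train and test sets. It saves and returns the model with the highest test-set R², along with a table of all metrics.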

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import joblib

def train_and_evaluate_models(models, X_train, X_test, y_train, y_test):
    """Fit every candidate model, score it, and persist the best one.

    Args:
        models: Mapping of model name to a ``(estimator, param_distributions)``
            pair. When ``param_distributions`` is empty the estimator is fitted
            as-is; otherwise it is tuned with ``RandomizedSearchCV``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.

    Returns:
        A tuple ``(best_model, results_df)``: the fitted model with the highest
        ``R2_test`` and a DataFrame with one row of metrics per model name.

    Side effects:
        Prints the tuned parameters, the metrics table, and the winner's name,
        and dumps the winner with joblib to ``<model_name>_model.pkl`` (name
        lower-cased, spaces replaced by underscores).
    """
    metrics_by_model = {}
    fitted_models = {}

    for model_name, (estimator, param_distributions) in models.items():
        if not param_distributions:
            # Nothing to tune: fit the estimator with its current settings.
            estimator.fit(X_train, y_train)
            fitted = estimator
        else:
            tuner = RandomizedSearchCV(
                estimator,
                param_distributions,
                n_iter=10,
                scoring='neg_mean_absolute_percentage_error',
                cv=5,
                n_jobs=-1,
                verbose=1,
                random_state=42,
            )
            tuner.fit(X_train, y_train)
            fitted = tuner.best_estimator_
            print(f"Best Params for {model_name}: {tuner.best_params_}")

        train_predictions = fitted.predict(X_train)
        test_predictions = fitted.predict(X_test)

        # Train metrics first, then test metrics, so the column order of the
        # resulting table is MSE/MAE/R2/MAPE per split.
        scores = {}
        for split, actual, predicted in (
            ('train', y_train, train_predictions),
            ('test', y_test, test_predictions),
        ):
            scores[f'MSE_{split}'] = mean_squared_error(actual, predicted)
            scores[f'MAE_{split}'] = mean_absolute_error(actual, predicted)
            scores[f'R2_{split}'] = r2_score(actual, predicted)
            # Reported as a percentage rather than a fraction.
            scores[f'MAPE_{split}'] = mean_absolute_percentage_error(actual, predicted) * 100

        metrics_by_model[model_name] = scores
        fitted_models[model_name] = fitted

    # One row per model, one column per metric.
    results_df = pd.DataFrame(metrics_by_model).T
    print(results_df)

    best_model_name = results_df['R2_test'].idxmax()
    print(f"The best model is: {best_model_name}")

    winner = fitted_models[best_model_name]
    output_path = f'{best_model_name.lower().replace(" ", "_")}_model.pkl'
    joblib.dump(winner, output_path)

    return winner, results_df

## 3. Dynamic Model Deployment

This function loads a saved model and its list of encoded columns, aligns new data to the column layout used in training, and returns the model's predictions.

In [None]:
def deploy_model(model_file, encoded_columns_file, new_data):
    """Load a saved model and predict on new, un-encoded data.

    Args:
        model_file: Path of the joblib file containing the fitted model.
        encoded_columns_file: Path of the joblib file containing the encoded
            feature columns the model was trained on.
        new_data: DataFrame of raw (not yet one-hot encoded) feature rows.

    Returns:
        The result of ``model.predict`` on the aligned feature frame.
    """
    # NOTE(security): joblib.load unpickles its input, which can execute
    # arbitrary code. Only pass files produced by a trusted training run.
    model = joblib.load(model_file)
    encoded_columns = joblib.load(encoded_columns_file)

    # Encode, then align to the training layout in one step: columns the
    # model expects but the new data lacks are added as 0, columns the model
    # never saw are dropped, and the order matches training.
    new_data_encoded = pd.get_dummies(new_data).reindex(
        columns=encoded_columns, fill_value=0
    )

    # Predict using the model
    predictions = model.predict(new_data_encoded)
    return predictions

## 4. Main Execution Workflow

This part brings everything together: data preprocessing, model training, evaluation, and deployment.

In [None]:
# Main code to run everything: preprocess, train/evaluate, persist, and predict.
if __name__ == "__main__":
    # Step 1: Preprocess the data
    X_train, X_test, y_train, y_test, encoded_columns = preprocess_data('modified_dataset/cleaned_car_data.xlsx', 'price')

    # Step 2: Define models and parameter grids.
    # Each entry maps a display name to (estimator, param_distributions);
    # an empty dict means the estimator is fitted without tuning.
    from sklearn.linear_model import LinearRegression, Ridge, Lasso
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
    from xgboost import XGBRegressor
    from scipy.stats import uniform

    models = {
        'Linear Regression': (LinearRegression(), {}),
        'Ridge Regression': (Ridge(), {'alpha': [0.1, 1.0, 10.0]}),
        'Lasso Regression': (Lasso(), {'alpha': [0.1, 1.0, 10.0]}),
        # Seeded like the other randomized estimators so runs are reproducible.
        'Decision Tree': (DecisionTreeRegressor(random_state=42), {'max_depth': [3, 5, 10]}),
        'Random Forest': (RandomForestRegressor(random_state=42), {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }),
        'XGBoost': (XGBRegressor(objective='reg:squarederror', random_state=42), {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 4, 5],
            'learning_rate': [0.01, 0.1, 0.2],
            # uniform(loc, scale) samples from [0.5, 1.0]
            'subsample': uniform(0.5, 0.5),
            'colsample_bytree': uniform(0.5, 0.5),
            'alpha': [0, 0.1, 0.5, 1],
            'lambda': [0, 0.1, 0.5, 1]
        }),
        'AdaBoost': (AdaBoostRegressor(random_state=42), {'n_estimators': [50, 100, 150]})
    }

    # Step 3: Train and evaluate models
    best_model, results_df = train_and_evaluate_models(models, X_train, X_test, y_train, y_test)

    # Step 4: Save the deployment artifacts.
    # train_and_evaluate_models stores the winner under a name derived from
    # the model (e.g. 'random_forest_model.pkl'), so also persist it under a
    # fixed name that the deployment step below can rely on.
    model_path = 'best_model.pkl'
    encoded_columns_path = 'encoded_columns.pkl'
    joblib.dump(best_model, model_path)
    joblib.dump(encoded_columns, encoded_columns_path)

    # Deployment example: Predict on new data
    # Assume new_data is a DataFrame with similar structure
    new_data = pd.DataFrame({
        'make': ['Ford'],
        'year': [2020],
        'fuel': ['Diesel'],
        'mileage': [25.0],
        'transmission': ['Manual'],
        'engine': [1500]
    })

    predictions = deploy_model(model_path, encoded_columns_path, new_data)
    print(f"Predictions: {predictions}")