In [11]:
# Import Required Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import os

In [12]:
# Load and Inspect Data
# The CSV location is written once and reused for both the existence check and the read.
csv_path = r'C:\\Users\\pklba\\OneDrive\\Desktop\\2025-Y2-S1-MLB-B11G2-03_Vehicle_Price_Prediction-main\\results\\outputs\\final_output.csv'
csv_found = os.path.exists(csv_path)
print("File exists:", csv_found)
data = pd.read_csv(csv_path)

# Show the first rows, then the header before and after stripping surrounding whitespace.
print(data.head())
print("Columns in the dataset:", data.columns)
data.columns = data.columns.str.strip()
print("Cleaned Columns:", data.columns)
print(data.dtypes)

# Preprocess Data
# Target is the 'selling_price' column; every other column is used as a feature.
target_column = 'selling_price'
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a pipeline and perform RandomizedSearchCV for SVR hyperparameters
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

# Two-step estimator: standardise the features, then fit the support vector regressor.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svr', SVR()),
])

# Candidate values for each hyperparameter, given as plain sequences (no scipy
# distributions). Keys follow the '<step>__<parameter>' convention so the search
# forwards them to the 'svr' step of the pipeline.
param_dist = dict(
    svr__C=np.logspace(-2, 2, num=50),               # 0.01 to 100
    svr__epsilon=np.linspace(0.01, 0.5, num=20),     # 0.01 to 0.5
    svr__kernel=['rbf', 'linear', 'poly', 'sigmoid'],
    svr__gamma=['scale', 'auto', *np.logspace(-3, 1, num=10)],
    svr__degree=[2, 3, 4],
    svr__coef0=np.linspace(0.0, 1.0, num=10),
)

# 40 sampled candidates x 5 folds, ranked by negative MSE (higher is better).
# Raise or lower n_iter depending on the available time budget.
rs = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=40,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

# Run the randomized search using the training split only
rs.fit(X_train, y_train)
print("Best Parameters:", rs.best_params_)

# The winning estimator returned by the search (scaler + tuned SVR in one pipeline)
best_pipeline = rs.best_estimator_
print("Best pipeline:", best_pipeline, sep="\n")

# Score the winner on the hold-out test set
y_pred = best_pipeline.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
for metric_label, metric_value in (("Mean Squared Error", mse), ("R^2 Score", r2)):
    print(f"Test {metric_label}: {metric_value}")

# Alias kept so the save cell, which expects the name `best_svr`, keeps working
best_svr = best_pipeline

File exists: True
    car_age      year  owner  km_driven  brand_name_Mahindra  brand_name_Tata  \
0  0.684211  0.315789      0   0.421683                  0.0              0.0   
1  0.684211  0.315789      0   0.301201                  0.0              0.0   
2  0.421053  0.578947      0   0.602407                  0.0              0.0   
3  0.157895  0.842105      0   0.277104                  0.0              0.0   
4  0.315789  0.684211      2   0.849397                  0.0              0.0   

   brand_name_Toyota  brand_popularity  brand_name_Honda  \
0                0.0          1.000000               0.0   
1                0.0          1.000000               0.0   
2                0.0          0.578947               0.0   
3                0.0          0.026316               0.0   
4                0.0          0.188259               1.0   

   brand_name_Chevrolet  selling_price  
0                   0.0   60000.000000  
1                   0.0  504355.832295  
2          

In [13]:
# Save and clearly display the trained pipeline (scaler + SVR)
from joblib import dump
import os

# Destination folder for this model's artefacts (models/results/outputs/IT24103116_Support_Vector_Regression);
# it is created on demand.
base_out_dir = r'C:\\Users\\pklba\\OneDrive\\Desktop\\2025-Y2-S1-MLB-B11G2-03_Vehicle_Price_Prediction-main\\models\\results\\outputs\\IT24103116_Support_Vector_Regression'
os.makedirs(base_out_dir, exist_ok=True)
model_path = os.path.join(base_out_dir, 'svr_pipeline.joblib')

# Persist the pipeline. This is best-effort: a failure is reported, not raised,
# so the description below is still printed.
try:
    dump(best_svr, model_path)
    print('Pipeline saved to:', model_path)
except Exception as e:
    print('Failed to save pipeline:', e)

# Describe the estimator: its class, its repr, and every parameter it exposes
print('\nModel type:', type(best_svr))
print('\nModel repr:', best_svr, sep='\n')

print('\nModel parameters:')
for param_name, param_value in best_svr.get_params().items():
    print(f" - {param_name}: {param_value}")

Pipeline saved to: C:\Users\pklba\OneDrive\Desktop\2025-Y2-S1-MLB-B11G2-03_Vehicle_Price_Prediction-main\models\results\outputs\IT24103116_Support_Vector_Regression\svr_pipeline.joblib

Model type: <class 'sklearn.pipeline.Pipeline'>

Model repr:
Pipeline(steps=[('scaler', StandardScaler()),
                ('svr',
                 SVR(C=np.float64(100.0), coef0=np.float64(0.1111111111111111),
                     degree=2, epsilon=np.float64(0.16473684210526315),
                     gamma=np.float64(10.0), kernel='poly'))])

Model parameters:
 - memory: None
 - steps: [('scaler', StandardScaler()), ('svr', SVR(C=np.float64(100.0), coef0=np.float64(0.1111111111111111), degree=2,
    epsilon=np.float64(0.16473684210526315), gamma=np.float64(10.0),
    kernel='poly'))]
 - transform_input: None
 - verbose: False
 - scaler: StandardScaler()
 - svr: SVR(C=np.float64(100.0), coef0=np.float64(0.1111111111111111), degree=2,
    epsilon=np.float64(0.16473684210526315), gamma=np.float64(10.0),


In [14]:
# Train multiple SVR-only variations, compare, and save best pipelines
import os
import time

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Ensure output dir exists inside the model-specific folder
base_out_dir = r'C:\Users\pklba\OneDrive\Desktop\2025-Y2-S1-MLB-B11G2-03_Vehicle_Price_Prediction-main\models\results\outputs\IT24103116_Support_Vector_Regression'
os.makedirs(base_out_dir, exist_ok=True)

# Define SVR pipeline
pipe = Pipeline([('scaler', StandardScaler()), ('svr', SVR())])

# Parameter distributions for SVR (reduced search space for speed).
# 'degree' and 'coef0' are intentionally not searched: only the 'rbf' and 'linear'
# kernels are sampled here, and SVR ignores both parameters for those kernels.
# ('gamma' is likewise ignored by the 'linear' kernel but is needed for 'rbf'.)
param_dist = {
    'svr__C': np.logspace(-2, 2, 20),
    'svr__epsilon': np.linspace(0.01, 0.5, 10),
    'svr__kernel': ['rbf', 'linear'],
    'svr__gamma': ['scale', 'auto'] + list(np.logspace(-3, 0, 5)),
}

rs_svr = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=20,                 # reduced iterations for speed
    cv=3,                      # fewer folds
    scoring='neg_mean_squared_error',
    n_jobs=2,                  # limit parallel jobs to reduce overhead
    random_state=42,
    verbose=1
)

# Fit randomized search on the training data
start = time.time()
rs_svr.fit(X_train, y_train)
end = time.time()
print(f"SVR RandomizedSearch finished in {end - start:.1f}s")
print("Best Parameters (SVR search):", rs_svr.best_params_)

# One row per sampled candidate, tagged with its kernel and ordered best-first
# (scores are negative MSE, so larger is better).
cv = pd.DataFrame(rs_svr.cv_results_)
cv['kernel'] = cv['params'].apply(lambda p: p['svr__kernel'])
ranked = cv.sort_values('mean_test_score', ascending=False)

# Candidate index -> (fitted pipeline, test MSE, test R^2). A candidate can appear in
# both the per-kernel and the overall ranking; the cache makes sure it is fit once.
_fitted_variants = {}


def fit_and_score_variant(idx, params):
    """Fit one search candidate on the full training split and score it on the test split.

    Parameters
    ----------
    idx : hashable
        Row label of the candidate in the cv-results frame; used as the cache key.
    params : dict
        The candidate's parameter dict as stored in ``cv_results_['params']``
        (keys already carry the ``svr__`` prefix).

    Returns
    -------
    tuple
        ``(fitted_pipeline, mse, r2)`` with both metrics as Python floats.
    """
    if idx not in _fitted_variants:
        model = Pipeline([('scaler', StandardScaler()), ('svr', SVR())])
        model.set_params(**params)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        _fitted_variants[idx] = (
            model,
            float(mean_squared_error(y_test, preds)),
            float(r2_score(y_test, preds)),
        )
    return _fitted_variants[idx]


def save_variant(idx, params, filename):
    """Fit/score a candidate, dump it under ``base_out_dir`` and return its summary row.

    Saving is best-effort: when the dump fails the error is printed and the row's
    ``path`` is left empty instead of aborting the whole comparison.
    """
    model, mse, r2 = fit_and_score_variant(idx, params)
    filepath = os.path.join(base_out_dir, filename)
    try:
        dump(model, filepath)
    except Exception as e:
        print('Failed to save variant', filename, e)
        filepath = ''
    return {
        'kernel': params['svr__kernel'],
        'idx': int(idx),
        'params': params,
        'mse': mse,
        'r2': r2,
        'path': filepath,
    }


variants = []

# For each searched kernel, keep its top-N configurations
per_kernel_topN = 2
for kernel in sorted(param_dist['svr__kernel']):
    subset = ranked[ranked['kernel'] == kernel]
    if subset.empty:
        print(f'No candidates found for kernel {kernel}')
        continue
    for idx, row in subset.head(per_kernel_topN).iterrows():
        variants.append(save_variant(idx, row['params'], f"svr_{kernel}_variant_{idx}.joblib"))

# Also save overall top N (regardless of kernel)
topN = 6
for idx, row in ranked.head(topN).iterrows():
    variants.append(save_variant(idx, row['params'], f"svr_top_variant_{idx}.joblib"))

# Write summary
summary_df = pd.DataFrame(variants)
summary_csv = os.path.join(base_out_dir, 'svr_variations_comparison.csv')
summary_df.to_csv(summary_csv, index=False)
print('\nSaved SVR variants summary to:', summary_csv)
print(summary_df)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


KeyboardInterrupt: 

In [None]:
# Create multiple SVR variations from the existing RandomizedSearchCV results (rs)
# - Saves best per kernel and top-N overall SVR variants
# - Writes a CSV summary
from joblib import dump
import pandas as pd
import os
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

base_out_dir = r'C:\Users\pklba\OneDrive\Desktop\2025-Y2-S1-MLB-B11G2-03_Vehicle_Price_Prediction-main\models\results\outputs\IT24103116_Support_Vector_Regression'
os.makedirs(base_out_dir, exist_ok=True)

# Hyperparameter names that mark a parameter dict as belonging to an SVR search
SVR_PARAM_NAMES = {'C', 'epsilon', 'kernel', 'gamma', 'degree', 'coef0'}


def is_svr_param_dict(d):
    """Return True when at least one key of `d` names an SVR hyperparameter.

    Keys may be bare ('C') or step-prefixed ('svr__C'); only the part after the
    last '__' is compared.
    """
    return any(key.split('__')[-1] in SVR_PARAM_NAMES for key in d)


def kernel_of(d):
    """Return the value stored under the (optionally prefixed) 'kernel' key, or None."""
    for key, value in d.items():
        if key.split('__')[-1] == 'kernel':
            return value
    return None


def build_and_save_variant(param_dict, save_name):
    """Fit a scaler+SVR pipeline with `param_dict`, score it on the test split and save it.

    Parameters
    ----------
    param_dict : dict
        Hyperparameters; bare keys get the 'svr__' prefix, prefixed keys are used as-is.
    save_name : str
        File name (inside `base_out_dir`) for the dumped pipeline.

    Returns
    -------
    dict or None
        Summary row with keys 'save_name', 'save_path', 'mse', 'r2', 'params'.
        None when the parameters cannot be applied: in that case nothing is fit or
        saved, so a default-parameter model never ends up stored under the variant's name.
        'save_path' is '' when the fit succeeded but the dump failed.
    """
    pipe = Pipeline([('scaler', StandardScaler()), ('svr', SVR())])
    mapped = {(k if '__' in k else 'svr__' + k): v for k, v in param_dict.items()}
    try:
        pipe.set_params(**mapped)
    except Exception as e:
        print('Failed to set params for', save_name, e)
        return None
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    save_path = os.path.join(base_out_dir, save_name)
    try:
        dump(pipe, save_path)
    except Exception as e:
        # Best-effort save: keep the metrics even if the file could not be written.
        print('Failed to save', save_name, e)
        save_path = ''
    return {'save_name': save_name, 'save_path': save_path, 'mse': mse, 'r2': r2, 'params': param_dict}


def save_svr_variants(search, topN=5, kernels=('linear', 'rbf', 'poly', 'sigmoid')):
    """Refit and save the best candidate per kernel plus the top-N overall from `search`.

    Writes 'svr_variations_comparison.csv' into `base_out_dir` and returns the summary
    DataFrame, or None (after printing the reason) when `search` holds no usable rows.
    """
    cv = pd.DataFrame(search.cv_results_)
    if cv.empty:
        print('cv results are empty')
        return None

    # Keep only rows whose params look like SVR hyperparameters, best score first
    svr_rows = cv[cv['params'].apply(is_svr_param_dict)]
    if svr_rows.empty:
        print('No SVR-like parameter rows found in rs.cv_results_. Make sure rs is the SVR search.')
        return None
    svr_rows = svr_rows.sort_values('mean_test_score', ascending=False)
    row_kernels = svr_rows['params'].apply(kernel_of)

    results = []

    # Save best per kernel (matched on the kernel entry only, not on any value)
    for k in kernels:
        matches = svr_rows[row_kernels == k]
        if matches.empty:
            print('No SVR candidates found for kernel', k)
            continue
        print('Building variant for kernel', k)
        res = build_and_save_variant(matches.iloc[0]['params'], f"svr_{k}_variant.joblib")
        if res is not None:
            results.append(res)

    # Save top-N overall hyperparameter variants
    for i, (_, row) in enumerate(svr_rows.head(topN).iterrows(), start=1):
        print('Building top', i, 'variant')
        res = build_and_save_variant(row['params'], f"svr_top{i}_variant.joblib")
        if res is not None:
            results.append(res)

    # write summary CSV
    summary = pd.DataFrame(results)
    summary_csv = os.path.join(base_out_dir, 'svr_variations_comparison.csv')
    summary.to_csv(summary_csv, index=False)
    print('\nSaved SVR variations and summary to:', base_out_dir)
    print(summary)
    return summary


if 'rs' not in globals():
    print('RandomizedSearchCV object `rs` not found in the notebook. Run the SVR tuning cell first.')
else:
    svr_summary = save_svr_variants(rs)
    # Only replace an existing summary_df when a new summary was actually produced
    if svr_summary is not None:
        summary_df = svr_summary

No SVR-like parameter rows found in rs.cv_results_. Make sure rs is the SVR search.
