In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC, SVR
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor,XGBRFRegressor
from lightgbm import LGBMRegressor,LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler,OneHotEncoder,MinMaxScaler

from sklearn.metrics import accuracy_score,cohen_kappa_score,confusion_matrix,mean_squared_error,r2_score,\
root_mean_squared_error,recall_score,roc_auc_score,roc_curve,mean_absolute_error

from sklearn.linear_model import LogisticRegression,LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV

from sklearn.ensemble import AdaBoostRegressor,BaggingRegressor,GradientBoostingRegressor,\
RandomForestRegressor,VotingRegressor,HistGradientBoostingRegressor

In [2]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, message="X does not have valid feature names")

In [3]:
df = pd.read_csv('cars_2010_2020.csv')
cars_val = pd.read_csv('cars_validation.csv')

In [4]:
df.head()

Unnamed: 0,Make,Model,Year,Engine Size (L),Fuel Type,Price (USD)
0,Volkswagen,Jetta,2010,4.2,Petrol,54073.09
1,Honda,Pilot,2017,4.2,Hybrid,44924.91
2,Nissan,Murano,2011,4.2,Hybrid,76963.44
3,Toyota,RAV4,2010,2.4,Petrol,30871.25
4,Nissan,Altima,2010,3.6,Petrol,72037.65


In [5]:
df = df.rename(columns={'Engine Size (L)': 'Engine_Size_Liter', 'Fuel Type': 'Fuel_Type', 
                        'Price (USD)': 'Price_USD'})

In [6]:
df.head()

Unnamed: 0,Make,Model,Year,Engine_Size_Liter,Fuel_Type,Price_USD
0,Volkswagen,Jetta,2010,4.2,Petrol,54073.09
1,Honda,Pilot,2017,4.2,Hybrid,44924.91
2,Nissan,Murano,2011,4.2,Hybrid,76963.44
3,Toyota,RAV4,2010,2.4,Petrol,30871.25
4,Nissan,Altima,2010,3.6,Petrol,72037.65


In [22]:
#m = df[['Make','Model','Year','Engine_Size_Liter','Fuel_Type','Price_USD']]
#m.to_csv('cars_validation.csv', index=False)

In [7]:
# Basic exploration
print(f"Training set shape: {df.shape}")
print(f"Validation set shape: {cars_val.shape}")

Training set shape: (10000, 6)
Validation set shape: (10000, 6)


In [8]:
# Define features
numeric_features = ['Year', 'Engine_Size_Liter']
categorical_features = ['Make', 'Model', 'Fuel_Type']

In [9]:
X = df[numeric_features + categorical_features]
y = df['Price_USD']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Create preprocessor
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [11]:
# Function to evaluate models
def evaluate_model(model_name, model, X_train, X_test, y_train, y_test):
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    # Fit
    pipeline.fit(X_train, y_train)
    
    # Predict
    pred = pipeline.predict(X_test)
    
    # Evaluate
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    mae = mean_absolute_error(y_test, pred)
    r2 = r2_score(y_test, pred)
    
    # Cross-validation score
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
    cv_rmse = np.sqrt(-cv_scores.mean())
    
    print(f"{model_name}:")
    print(f"  RMSE: ${rmse:.2f}")
    print(f"  MAE: ${mae:.2f}")
    print(f"  R²: {r2:.4f}")
    print(f"  CV RMSE: ${cv_rmse:.2f}")
    print("-----------------------------")
    
    return pipeline, rmse, r2, cv_rmse

# Define models to evaluate
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.01, max_iter=10000),
    "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "SVR": SVR(kernel='rbf', C=1.0, epsilon=0.1),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    "LightGBM": LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42, force_col_wise= True)
}

In [12]:
# Evaluate all models
results = {}
best_rmse = float('inf')
best_model_name = None
best_pipeline = None

for name, model in models.items():
    try:
        print(f"Training {name}...")
        pipeline, rmse, r2, cv_rmse = evaluate_model(name, model, X_train, X_test, y_train, y_test)
        results[name] = {'rmse': rmse, 'r2': r2, 'cv_rmse': cv_rmse, 'pipeline': pipeline}
        
        if cv_rmse < best_rmse:
            best_rmse = cv_rmse
            best_model_name = name
            best_pipeline = pipeline
            
    except Exception as e:
        print(f"Error with {name}: {e}")

print(f"\nBest model: {best_model_name} with CV RMSE: ${best_rmse:.2f}")

Training Linear Regression...
Linear Regression:
  RMSE: $18665.05
  MAE: $16088.53
  R²: -0.0101
  CV RMSE: $18694.35
-----------------------------
Training Ridge Regression...
Ridge Regression:
  RMSE: $18664.09
  MAE: $16088.00
  R²: -0.0100
  CV RMSE: $18693.70
-----------------------------
Training Lasso Regression...
Lasso Regression:
  RMSE: $18665.02
  MAE: $16088.51
  R²: -0.0101
  CV RMSE: $18694.32
-----------------------------
Training ElasticNet...
ElasticNet:
  RMSE: $18588.25
  MAE: $16040.73
  R²: -0.0018
  CV RMSE: $18648.92
-----------------------------
Training Random Forest...
Random Forest:
  RMSE: $20149.07
  MAE: $17055.63
  R²: -0.1771
  CV RMSE: $19940.66
-----------------------------
Training Gradient Boosting...
Gradient Boosting:
  RMSE: $18706.19
  MAE: $16147.59
  R²: -0.0145
  CV RMSE: $18722.42
-----------------------------
Training SVR...
SVR:
  RMSE: $18573.47
  MAE: $16031.64
  R²: -0.0002
  CV RMSE: $18643.07
-----------------------------
Training KN

In [13]:
# Create results dataframe for easy comparison
comparison_df = pd.DataFrame({
    'Model': list(results.keys()), 
    'RMSE': [results[model]['rmse'] for model in results],
    'R²': [results[model]['r2'] for model in results],
    'CV RMSE': [results[model]['cv_rmse'] for model in results]
}).sort_values('CV RMSE')

print("\nModel Comparison:")
print(comparison_df)


Model Comparison:
               Model          RMSE        R²       CV RMSE
6                SVR  18573.466266 -0.000188  18643.071660
3         ElasticNet  18588.248381 -0.001781  18648.921099
1   Ridge Regression  18664.089267 -0.009972  18693.701041
2   Lasso Regression  18665.021717 -0.010073  18694.322084
0  Linear Regression  18665.048141 -0.010076  18694.348770
5  Gradient Boosting  18706.190487 -0.014534  18722.419814
9           LightGBM  18887.037826 -0.034245  18962.309215
8            XGBoost  19148.978298 -0.063132  19033.733267
4      Random Forest  20149.065039 -0.177079  19940.664663
7                KNN  20196.285216 -0.182603  20416.229588


In [17]:
# Hyperparameter tuning for the best model
if best_model_name == "Random Forest":
    print("\nPerforming hyperparameter tuning for Random Forest...")
    param_grid = {
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [None, 10, 20, 30],
        'model__min_samples_split': [2, 5, 10]
    }
    
elif best_model_name == "Gradient Boosting":
    print("\nPerforming hyperparameter tuning for Gradient Boosting...")
    param_grid = {
        'model__n_estimators': [100, 200, 300],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 7]
    }
    
elif best_model_name == "XGBoost":
    print("\nPerforming hyperparameter tuning for XGBoost...")
    param_grid = {
        'model__n_estimators': [100, 200, 300],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 7],
        'model__colsample_bytree': [0.7, 0.8, 0.9]
    }
    
elif best_model_name == "LightGBM":
    print("\nPerforming hyperparameter tuning for LightGBM...")
    param_grid = {
        'model__n_estimators': [100, 200, 300],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__num_leaves': [31, 50, 70],
        'model__feature_fraction': [0.7, 0.8, 0.9]
    }
    
else:
    # Default to Ridge if best model is not one of the above
    print(f"\nPerforming hyperparameter tuning for {best_model_name}...")
    param_grid = {
    'model__C': [0.1, 1, 10, 100],
    'model__kernel': ['linear', 'rbf', 'poly'],
    'model__gamma': ['scale', 'auto', 0.1, 0.01]
}

# GridSearch with cross-validation
grid_search = GridSearchCV(
    best_pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

grid_search.fit(X, y)

# Best parameters and score
print("Best parameters:", grid_search.best_params_)
best_cv_rmse = np.sqrt(-grid_search.best_score_)
print(f"Best CV RMSE: ${best_cv_rmse:.2f}")


Performing hyperparameter tuning for SVR...
Best parameters: {'model__C': 1, 'model__gamma': 0.1, 'model__kernel': 'poly'}
Best CV RMSE: $18643.05


In [20]:
cars_val.head(2)

Unnamed: 0,Make,Model,Year,Engine_Size_Liter,Fuel_Type,Price_USD
0,Volkswagen,Jetta,2010,4.2,Petrol,54073.09
1,Honda,Pilot,2017,4.2,Hybrid,44924.91


In [25]:
# Final model with best parameters
final_model = grid_search.best_estimator_

# Make predictions on validation set
make_features = cars_val[numeric_features + categorical_features]
make_predictions = final_model.predict(make_features)

# Create the result dataframe
base_result = pd.DataFrame({
    'Make': cars_val['Make'],
    'Model': cars_val['Model'],
    'Year': cars_val['Year'],
    'Engine_Size_Liter': cars_val['Engine_Size_Liter'],
    'Fuel_Type': cars_val['Fuel_Type'],
    'Price_USD': make_predictions
})

print("\nFinal predictions:")
display(base_result.head())


Final predictions:


Unnamed: 0,Make,Model,Year,Engine_Size_Liter,Fuel_Type,Price_USD
0,Volkswagen,Jetta,2010,4.2,Petrol,47270.070663
1,Honda,Pilot,2017,4.2,Hybrid,47270.708717
2,Nissan,Murano,2011,4.2,Hybrid,47269.969198
3,Toyota,RAV4,2010,2.4,Petrol,47270.953127
4,Nissan,Altima,2010,3.6,Petrol,47270.882048


In [27]:
# Feature importance (if applicable)
if best_model_name in ["SVR", "Random Forest", "Gradient Boosting", "XGBoost", "LightGBM"]:
    try:
        # Get feature names after preprocessing
        feature_names = []
        for name, transformer, features in preprocessor.transformers_:
            if hasattr(transformer, 'get_feature_names_out'):
                if name == 'cat':
                    feature_names.extend(transformer.get_feature_names_out(features))
                else:
                    feature_names.extend(features)
            else:
                feature_names.extend(features)
        
        # Get importances
        importances = final_model.named_steps['model'].feature_importances_
        
        # Create feature importance dataframe
        if len(importances) == len(feature_names):
            importance_df = pd.DataFrame({
                'Feature': feature_names,
                'Importance': importances
            }).sort_values('Importance', ascending=False)
            
            print("\nFeature Importance:")
            print(importance_df.head(10))
            
            # Plot feature importance
            plt.figure(figsize=(12, 6))
            plt.bar(importance_df['Feature'][:10], importance_df['Importance'][:10])
            plt.xticks(rotation=45, ha='right')
            plt.title(f'Top 10 Feature Importance - {best_model_name}')
            plt.tight_layout()
            plt.savefig('feature_importance.png')
    except Exception as e:
        print("Could not calculate feature importance:", e)

Could not calculate feature importance: 'SVR' object has no attribute 'feature_importances_'
