In [2]:
# lasso_regression_car_data.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Read the car dataset from disk into a DataFrame
data = pd.read_csv('Car Data P.csv')

# Quick overview of what was loaded: dimensions, column names, leading rows
print("Dataset Shape:", data.shape)
print("\nColumns:", list(data.columns))
print("\nFirst few rows:")
print(data.head(5))

# Column the model is trained to predict
target = 'city_mpg'
print(f"\nTarget variable: {target}")

# Columns that must not be used as predictors:
#   * the target itself,
#   * the non-numeric descriptive columns (the scaler and Lasso need numbers),
#   * 'mpg_diff', which the original list already treated as target-related,
#   * 'city_mpg_scaled', whose name indicates a pre-scaled copy of the target.
#     Leaving it in the feature matrix leaks the answer to the model and
#     makes every reported metric and coefficient meaningless.
exclude_cols = ['city_mpg', 'class', 'make', 'model', 'drive', 'fuel_type',
                'transmission', 'size_category', 'drivetrain_type',
                'fuel_efficiency_category', 'mpg_diff', 'city_mpg_scaled']

# NOTE(review): 'combination_mpg' / 'combination_mpg_scaled' are presumably
# computed from city and highway mpg; if so they are also target-derived and
# should be added to exclude_cols - confirm how the dataset was built.

# Build the exclusion set once; the target is added explicitly so changing
# `target` above can never put the target into the feature matrix.
excluded = set(exclude_cols) | {target}
feature_cols = [col for col in data.columns if col not in excluded]

# Report missing values for the modelling columns.
# NOTE(review): missing values are only reported here, not handled; the
# downstream scaler/Lasso presumably require complete data - confirm.
print("\nMissing values in each column:")
print(data[feature_cols + [target]].isnull().sum())

# Feature matrix and target vector
X = data[feature_cols]
y = data[target]

# Hold out 20% of the rows for evaluation; the seed is fixed so the split
# is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Standardise the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Pick the regularisation strength (alpha) by 5-fold cross-validation
from sklearn.linear_model import LassoCV

lasso_cv = LassoCV(cv=5, random_state=42, max_iter=10000)
lasso_cv.fit(X_train_scaled, y_train)

print(f"\nOptimal alpha from cross-validation: {lasso_cv.alpha_:.6f}")

# Fit a plain Lasso on the training split using the selected alpha
lasso_model = Lasso(
    alpha=lasso_cv.alpha_,
    max_iter=10000,
    random_state=42,
).fit(X_train_scaled, y_train)

# Predictions for both splits, in (train, test) order
y_pred_train, y_pred_test = (
    lasso_model.predict(features) for features in (X_train_scaled, X_test_scaled)
)

# Score each split with MSE, MAE and R²; every pair is (train, test)
train_mse, test_mse = (
    mean_squared_error(actual, predicted)
    for actual, predicted in ((y_train, y_pred_train), (y_test, y_pred_test))
)
train_mae, test_mae = (
    mean_absolute_error(actual, predicted)
    for actual, predicted in ((y_train, y_pred_train), (y_test, y_pred_test))
)
train_r2, test_r2 = (
    r2_score(actual, predicted)
    for actual, predicted in ((y_train, y_pred_train), (y_test, y_pred_test))
)

# Emit the results banner and the six metric lines in one go
print("\n".join([
    "\n" + "="*60,
    "LASSO REGRESSION RESULTS",
    "="*60,
    f"Training MSE: {train_mse:.4f}",
    f"Test MSE: {test_mse:.4f}",
    f"Training MAE: {train_mae:.4f}",
    f"Test MAE: {test_mae:.4f}",
    f"Training R²: {train_r2:.4f}",
    f"Test R²: {test_r2:.4f}",
]))

# Tabulate the fitted coefficients next to their feature names, add the
# magnitude column, and order the table from largest to smallest magnitude
coefficients = (
    pd.DataFrame({'Feature': feature_cols, 'Coefficient': lasso_model.coef_})
    .assign(Absolute_Value=lambda frame: frame['Coefficient'].abs())
    .sort_values('Absolute_Value', ascending=False)
)

print("\n".join([
    "\n" + "="*60,
    "TOP 20 MOST IMPORTANT FEATURES (by absolute coefficient value)",
    "="*60,
    coefficients.head(20).to_string(index=False),
]))

# Count how many coefficients Lasso kept versus drove exactly to zero
is_zero = lasso_model.coef_ == 0
non_zero_coeff = np.sum(~is_zero)
zero_coeff = np.sum(is_zero)
print(f"\nNumber of non-zero coefficients: {non_zero_coeff}")
print(f"Number of zero coefficients (features eliminated): {zero_coeff}")
print(f"Percentage of features eliminated: {(zero_coeff/len(feature_cols))*100:.2f}%")

# Horizontal bar chart of the 15 largest-magnitude coefficients.
# Bar length is the magnitude; colour encodes the sign (red = negative,
# blue = otherwise).
top_features = coefficients.head(15)
colors = np.where(top_features['Coefficient'] < 0, 'red', 'blue').tolist()

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(top_features['Feature'], top_features['Absolute_Value'], color=colors)
ax.set_xlabel('Absolute Coefficient Value')
ax.set_title('Top 15 Most Important Features in LASSO Regression')
ax.invert_yaxis()  # put the largest coefficient at the top
fig.tight_layout()
fig.savefig('lasso_feature_importance.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# Side-by-side scatter plots of predicted vs actual values (train | test),
# each with a dashed red y = x reference line spanning the actual range
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

panels = (
    (axes[0], y_train, y_pred_train, f'Training Set (R² = {train_r2:.3f})'),
    (axes[1], y_test, y_pred_test, f'Test Set (R² = {test_r2:.3f})'),
)
for ax, actual, predicted, title in panels:
    lowest, highest = actual.min(), actual.max()
    ax.scatter(actual, predicted, alpha=0.6, edgecolors='w', linewidth=0.5)
    ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
    ax.set_xlabel('Actual Values')
    ax.set_ylabel('Predicted Values')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('lasso_predictions_vs_actual.png', dpi=300, bbox_inches='tight')
plt.close()

# Residuals are actual minus predicted, computed per split
residuals_train = y_train - y_pred_train
residuals_test = y_test - y_pred_test

# Side-by-side residual-vs-predicted scatter plots (train | test), each with
# a dashed red zero line for reference
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

panels = (
    (axes[0], y_pred_train, residuals_train, 'Residuals vs Predicted (Training)'),
    (axes[1], y_pred_test, residuals_test, 'Residuals vs Predicted (Test)'),
)
for ax, predicted, residuals, title in panels:
    ax.scatter(predicted, residuals, alpha=0.6, edgecolors='w', linewidth=0.5)
    ax.axhline(y=0, color='r', linestyle='--', linewidth=2)
    ax.set_xlabel('Predicted Values')
    ax.set_ylabel('Residuals')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('lasso_residuals_analysis.png', dpi=300, bbox_inches='tight')
plt.close()

# Write a plain-text report of the run: dataset and split sizes, the selected
# alpha, the error metrics, how many coefficients Lasso zeroed, the 30
# largest-magnitude coefficients, and the list of eliminated features.
#
# The file is opened as UTF-8 explicitly because the report contains the
# non-ASCII character '²'. Without an explicit encoding, open() uses the
# locale-dependent platform default, which may fail to encode it or produce
# files that differ between machines.
with open('lasso_regression_results.txt', 'w', encoding='utf-8') as f:
    f.write("="*60 + "\n")
    f.write("LASSO REGRESSION ANALYSIS RESULTS\n")
    f.write("="*60 + "\n\n")

    # Run description
    f.write("Dataset: Car Data P.csv\n")
    f.write(f"Target variable: {target}\n")
    f.write(f"Number of features: {len(feature_cols)}\n")
    f.write(f"Training samples: {X_train.shape[0]}\n")
    f.write(f"Test samples: {X_test.shape[0]}\n\n")

    f.write(f"Optimal alpha (regularization strength): {lasso_cv.alpha_:.6f}\n\n")

    # Error metrics for both splits
    f.write("PERFORMANCE METRICS:\n")
    f.write("-"*40 + "\n")
    f.write(f"Training MSE: {train_mse:.4f}\n")
    f.write(f"Test MSE: {test_mse:.4f}\n")
    f.write(f"Training MAE: {train_mae:.4f}\n")
    f.write(f"Test MAE: {test_mae:.4f}\n")
    f.write(f"Training R²: {train_r2:.4f}\n")
    f.write(f"Test R²: {test_r2:.4f}\n\n")

    # How many coefficients were kept versus set exactly to zero
    f.write("FEATURE ELIMINATION SUMMARY:\n")
    f.write("-"*40 + "\n")
    f.write(f"Total features: {len(feature_cols)}\n")
    f.write(f"Non-zero coefficients: {non_zero_coeff}\n")
    f.write(f"Zero coefficients (eliminated): {zero_coeff}\n")
    f.write(f"Percentage eliminated: {(zero_coeff/len(feature_cols))*100:.2f}%\n\n")

    # `coefficients` is already sorted by magnitude, so head(30) is the top 30
    f.write("TOP 30 FEATURES BY IMPORTANCE:\n")
    f.write("-"*80 + "\n")
    f.write(f"{'Feature':<30} {'Coefficient':>15} {'Absolute Value':>15}\n")
    f.write("-"*80 + "\n")
    for _, row in coefficients.head(30).iterrows():
        f.write(f"{row['Feature']:<30} {row['Coefficient']:>15.6f} {row['Absolute_Value']:>15.6f}\n")

    # Features whose coefficient is exactly zero, one name per line
    f.write("\n" + "="*60 + "\n")
    f.write("FEATURES ELIMINATED BY LASSO (Zero Coefficients):\n")
    f.write("="*60 + "\n")
    zero_features = coefficients[coefficients['Coefficient'] == 0]
    for _, row in zero_features.iterrows():
        f.write(f"{row['Feature']}\n")

    f.write(f"\nTotal eliminated features: {len(zero_features)}")

# Announce completion and list the report/plot files written so far
for line in (
    "\n" + "="*60,
    "ANALYSIS COMPLETE",
    "="*60,
    "Files created:",
    "1. lasso_regression_results.txt - Detailed results and coefficients",
    "2. lasso_feature_importance.png - Feature importance visualization",
    "3. lasso_predictions_vs_actual.png - Prediction vs actual plots",
    "4. lasso_residuals_analysis.png - Residual analysis plots",
):
    print(line)

# Persist the fitted estimators and the scaler with joblib, in the same
# order as the original script
import joblib

for artifact, filename in (
    (lasso_model, 'lasso_model.pkl'),
    (scaler, 'scaler.pkl'),
    (lasso_cv, 'lasso_cv_model.pkl'),
):
    joblib.dump(artifact, filename)

# List the saved model files, continuing the numbering above
for line in (
    "\n5. lasso_model.pkl - Trained LASSO model",
    "6. scaler.pkl - Fitted scaler for preprocessing",
    "7. lasso_cv_model.pkl - Cross-validated LASSO model",
):
    print(line)

# Show actual value, prediction and residual for the first ten test rows
first_ten = slice(None, 10)
sample_data = pd.DataFrame({
    'Actual': y_test.values[first_ten],
    'Predicted': y_pred_test[first_ten],
    'Residual': residuals_test[first_ten]
})
print("\n".join([
    "\n" + "="*60,
    "SAMPLE PREDICTIONS (First 10 test samples):",
    "="*60,
    sample_data.to_string(index=False),
]))

Dataset Shape: (550, 41)

Columns: ['city_mpg', 'class', 'combination_mpg', 'cylinders', 'displacement', 'drive', 'fuel_type', 'highway_mpg', 'make', 'model', 'transmission', 'year', 'mpg_diff', 'engine_efficiency', 'engine_power_density', 'size_category', 'drivetrain_type', 'vehicle_age', 'fuel_efficiency_category', 'is_high_performance', 'is_luxury', 'is_electric', 'is_diesel', 'is_hybrid', 'is_automatic', 'is_manual', 'make_encoded', 'class_encoded', 'drive_encoded', 'fuel_type_encoded', 'transmission_encoded', 'size_category_encoded', 'drivetrain_type_encoded', 'city_mpg_scaled', 'combination_mpg_scaled', 'cylinders_scaled', 'displacement_scaled', 'highway_mpg_scaled', 'year_scaled', 'vehicle_age_scaled', 'engine_efficiency_scaled']

First few rows:
   city_mpg                        class  combination_mpg  cylinders  \
0        25                  midsize car               29        4.0   
1        26                  midsize car               30        4.0   
2        25  small s