In [1]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
df_train = pd.read_csv("data/final_train.csv")

df_train.head()

Unnamed: 0,manufacturer_Acer,manufacturer_Apple,manufacturer_Asus,manufacturer_Chuwi,manufacturer_Dell,manufacturer_Fujitsu,manufacturer_Google,manufacturer_HP,manufacturer_Huawei,manufacturer_LG,...,gpu_provider_Nvidia,screen_size,total_pixels,ram,ssd,hdd,hybrid,clock_speed,weight_kg,price
0,0,1,0,0,0,0,0,0,0,0,...,0,13.3,4096000.0,8.0,128.0,0.0,0.0,2.3,1.37,11912523.48
1,0,1,0,0,0,0,0,0,0,0,...,0,13.3,1296000.0,8.0,0.0,0.0,128.0,1.8,1.34,7993374.48
2,0,0,0,0,0,0,0,1,0,0,...,0,15.6,2073600.0,8.0,256.0,0.0,0.0,2.5,1.86,5112900.0
3,0,1,0,0,0,0,0,0,0,0,...,0,15.4,5184000.0,16.0,512.0,0.0,0.0,2.7,1.83,22563005.4
4,0,1,0,0,0,0,0,0,0,0,...,0,13.3,4096000.0,8.0,256.0,0.0,0.0,3.1,1.37,16037611.2


In [3]:
df_test = pd.read_csv("data/final_test.csv")

df_test.head()

Unnamed: 0,manufacturer_Acer,manufacturer_Apple,manufacturer_Asus,manufacturer_Chuwi,manufacturer_Dell,manufacturer_Fujitsu,manufacturer_Google,manufacturer_HP,manufacturer_Huawei,manufacturer_LG,...,gpu_provider_Nvidia,screen_size,total_pixels,ram,ssd,hdd,hybrid,clock_speed,weight_kg,price
0,0,0,0,0,0,0,0,1,0,0,...,0,15.6,1049088.0,6.0,0.0,1024.0,0.0,2.7,2.04,5148468.0
1,0,0,1,0,0,0,0,0,0,0,...,1,17.3,2073600.0,16.0,256.0,1024.0,0.0,2.8,2.99,15552108.0
2,0,0,0,0,1,0,0,0,0,0,...,0,15.6,2073600.0,12.0,512.0,0.0,0.0,2.7,2.19,11550708.0
3,0,0,0,0,0,0,0,0,0,0,...,0,13.3,2073600.0,4.0,128.0,0.0,0.0,2.3,1.2,10625940.0
4,0,0,0,0,0,0,0,0,0,0,...,0,15.6,2073600.0,6.0,256.0,0.0,0.0,3.6,2.2,4881708.0


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

class LaptopPricePredictionPipeline:
    def __init__(self, df, test_df=None):
        self.df = df.copy()
        self.test_df = test_df.copy() if test_df is not None else None
        self.models = {}
        self.best_models = {}
        self.results = {}
        self.X_train = None
        self.X_val = None
        self.y_train = None
        self.y_val = None
        self.scaler = None
        
    def prepare_data(self):
        """Prepare features and target, apply scaling and create stratified split"""
        print("=== Data Preparation ===")
        
        # Separate features and target
        X = self.df.drop('price', axis=1)
        y = self.df['price']
        
        print(f"Dataset shape: {X.shape}")
        print(f"Target range: ${y.min():,.2f} - ${y.max():,.2f}")
        
        # Check for missing values
        missing_info = X.isnull().sum()
        if missing_info.sum() > 0:
            print("\n⚠️  Missing values found:")
            print(missing_info[missing_info > 0])
        else:
            print("✓ No missing values in features")
            
        # Check for missing values in target
        if y.isnull().sum() > 0:
            print(f"⚠️  Missing values in target: {y.isnull().sum()}")
            # Remove rows with missing target values
            mask = ~y.isnull()
            X = X[mask]
            y = y[mask]
            print(f"✓ Removed rows with missing target. New shape: {X.shape}")
        
        # Create price bins for stratified sampling
        y_bins = pd.qcut(y, q=5, labels=False, duplicates='drop')
        
        # Split the data (80-20)
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y_bins
        )
        
        print(f"Training set: {self.X_train.shape}")
        print(f"Validation set: {self.X_val.shape}")
        print("✓ Data preparation completed")
        
    def define_models(self):
        """Define all models with their parameter grids"""
        
        # Model definitions with hyperparameter grids
        self.models = {
            'Linear_Regression': {
                'model': Pipeline([
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', RobustScaler()),
                    ('regressor', LinearRegression())
                ]),
                'params': {}
            },
            
            'Ridge_Regression': {
                'model': Pipeline([
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', RobustScaler()),
                    ('regressor', Ridge(random_state=42))
                ]),
                'params': {
                    'regressor__alpha': [0.1, 1.0, 10.0, 100.0, 1000.0],
                    'regressor__solver': ['auto', 'svd', 'saga']
                }
            },
            
            'Lasso_Regression': {
                'model': Pipeline([
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', RobustScaler()),
                    ('regressor', Lasso(random_state=42, max_iter=2000))
                ]),
                'params': {
                    'regressor__alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
                    'regressor__selection': ['cyclic', 'random']
                }
            },
            
            'ElasticNet_Regression': {
                'model': Pipeline([
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', RobustScaler()),
                    ('regressor', ElasticNet(random_state=42, max_iter=2000))
                ]),
                'params': {
                    'regressor__alpha': [0.01, 0.1, 1.0, 10.0],
                    'regressor__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
                }
            },
            
            'Random_Forest': {
                'model': Pipeline([
                    ('imputer', SimpleImputer(strategy='median')),
                    ('regressor', RandomForestRegressor(random_state=42, n_jobs=-1))
                ]),
                'params': {
                    'regressor__n_estimators': [100, 200, 300],
                    'regressor__max_depth': [10, 20, None],
                    'regressor__min_samples_split': [2, 5, 10],
                    'regressor__min_samples_leaf': [1, 2, 4],
                    'regressor__max_features': ['sqrt', 'log2']
                }
            },
            
            'Gradient_Boosting': {
                'model': Pipeline([
                    ('imputer', SimpleImputer(strategy='median')),
                    ('regressor', GradientBoostingRegressor(random_state=42))
                ]),
                'params': {
                    'regressor__n_estimators': [100, 200, 300],
                    'regressor__max_depth': [3, 5, 7],
                    'regressor__learning_rate': [0.01, 0.1, 0.2],
                    'regressor__subsample': [0.8, 0.9, 1.0],
                    'regressor__min_samples_split': [2, 5, 10]
                }
            },
            
            'XGBoost': {
                'model': Pipeline([
                    ('imputer', SimpleImputer(strategy='median')),
                    ('regressor', xgb.XGBRegressor(random_state=42, n_jobs=-1))
                ]),
                'params': {
                    'regressor__n_estimators': [100, 200, 300],
                    'regressor__max_depth': [3, 5, 7],
                    'regressor__learning_rate': [0.01, 0.1, 0.2],
                    'regressor__subsample': [0.8, 0.9, 1.0],
                    'regressor__colsample_bytree': [0.8, 0.9, 1.0],
                    'regressor__reg_alpha': [0, 0.1, 1],
                    'regressor__reg_lambda': [1, 1.5, 2]
                }
            },
            
            'LightGBM': {
                'model': Pipeline([
                    ('imputer', SimpleImputer(strategy='median')),
                    ('regressor', lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1))
                ]),
                'params': {
                    'regressor__n_estimators': [100, 200, 300],
                    'regressor__max_depth': [3, 5, 7],
                    'regressor__learning_rate': [0.01, 0.1, 0.2],
                    'regressor__subsample': [0.8, 0.9, 1.0],
                    'regressor__colsample_bytree': [0.8, 0.9, 1.0],
                    'regressor__reg_alpha': [0, 0.1, 1],
                    'regressor__reg_lambda': [1, 1.5, 2]
                }
            }
        }
        
    def train_models(self):
        """Train all models with hyperparameter tuning using GridSearchCV"""
        print("\n=== Model Training with Hyperparameter Tuning ===")
        
        for name, model_config in self.models.items():
            print(f"\nTraining {name}...")
            
            try:
                if model_config['params']:
                    # Use GridSearchCV for hyperparameter tuning
                    grid_search = GridSearchCV(
                        model_config['model'], 
                        model_config['params'],
                        cv=5,
                        scoring='neg_mean_squared_error',
                        n_jobs=-1,
                        verbose=0
                    )
                    grid_search.fit(self.X_train, self.y_train)
                    self.best_models[name] = grid_search.best_estimator_
                    print(f"✓ Best params for {name}: {grid_search.best_params_}")
                else:
                    # For models without hyperparameters (Linear Regression)
                    model_config['model'].fit(self.X_train, self.y_train)
                    self.best_models[name] = model_config['model']
                    print(f"✓ {name} trained successfully")
                    
            except Exception as e:
                print(f"✗ Error training {name}: {str(e)}")
                
        print(f"\n✓ Successfully trained {len(self.best_models)} models")
        
    def evaluate_models(self):
        """Evaluate all trained models"""
        print("\n=== Model Evaluation ===")
        self.results = {}
        
        for name, model in self.best_models.items():
            try:
                # Make predictions
                y_train_pred = model.predict(self.X_train)
                y_val_pred = model.predict(self.X_val)
                
                # Calculate metrics
                train_metrics = self._calculate_metrics(self.y_train, y_train_pred)
                val_metrics = self._calculate_metrics(self.y_val, y_val_pred)
                
                # Cross-validation score
                cv_scores = cross_val_score(model, self.X_train, self.y_train, 
                                          cv=5, scoring='neg_mean_squared_error')
                cv_rmse = np.sqrt(-cv_scores.mean())
                
                self.results[name] = {
                    'train_metrics': train_metrics,
                    'val_metrics': val_metrics,
                    'cv_rmse': cv_rmse,
                    'cv_std': np.sqrt(-cv_scores).std()
                }
                
            except Exception as e:
                print(f"✗ Error evaluating {name}: {str(e)}")
        
        self._display_results()
        
    def _calculate_metrics(self, y_true, y_pred):
        """Calculate regression metrics"""
        return {
            'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
            'MAE': mean_absolute_error(y_true, y_pred),
            'R2': r2_score(y_true, y_pred),
            'MAPE': np.mean(np.abs((y_true - y_pred) / y_true)) * 100
        }
    
    def _display_results(self):
        """Display evaluation results in a formatted table"""
        print("\n" + "="*100)
        print(f"{'Model':<20} {'Train R²':<10} {'Val R²':<10} {'Train RMSE':<12} {'Val RMSE':<12} {'CV RMSE':<12} {'MAPE(%)':<10}")
        print("="*100)
        
        sorted_models = sorted(self.results.items(), 
                             key=lambda x: x[1]['val_metrics']['R2'], reverse=True)
        
        for name, metrics in sorted_models:
            train_r2 = metrics['train_metrics']['R2']
            val_r2 = metrics['val_metrics']['R2']
            train_rmse = metrics['train_metrics']['RMSE']
            val_rmse = metrics['val_metrics']['RMSE']
            cv_rmse = metrics['cv_rmse']
            mape = metrics['val_metrics']['MAPE']
            
            print(f"{name:<20} {train_r2:<10.4f} {val_r2:<10.4f} {train_rmse:<12.0f} "
                  f"{val_rmse:<12.0f} {cv_rmse:<12.0f} {mape:<10.2f}")
        
        print("="*100)
        
        # Best model
        best_model_name = sorted_models[0][0]
        print(f"\n🏆 Best Model: {best_model_name}")
        print(f"Validation R²: {sorted_models[0][1]['val_metrics']['R2']:.4f}")
        print(f"Validation RMSE: ${sorted_models[0][1]['val_metrics']['RMSE']:,.2f}")
        
    def predict_test_data(self, model_name=None):
        """Make predictions on test data"""
        if self.test_df is None:
            print("No test data provided")
            return None
            
        if model_name is None:
            # Use best model
            model_name = max(self.results.keys(), 
                           key=lambda x: self.results[x]['val_metrics']['R2'])
        
        if model_name not in self.best_models:
            print(f"Model {model_name} not found")
            return None
            
        # Make predictions (pipeline handles imputation and scaling automatically)
        predictions = self.best_models[model_name].predict(self.test_df)
        
        print(f"\n=== Test Predictions using {model_name} ===")
        print(f"Predictions range: ${predictions.min():,.2f} - ${predictions.max():,.2f}")
        print(f"Mean prediction: ${predictions.mean():,.2f}")
        
        return predictions
    
    def get_feature_importance(self, model_name=None):
        """Get feature importance for tree-based models"""
        if model_name is None:
            model_name = max(self.results.keys(), 
                           key=lambda x: self.results[x]['val_metrics']['R2'])
        
        model = self.best_models[model_name]
        
        # Extract the actual regressor from pipeline
        regressor = model.named_steps['regressor']
        
        if hasattr(regressor, 'feature_importances_'):
            importances = regressor.feature_importances_
            feature_names = self.X_train.columns
            
            importance_df = pd.DataFrame({
                'feature': feature_names,
                'importance': importances
            }).sort_values('importance', ascending=False)
            
            print(f"\n=== Feature Importance ({model_name}) ===")
            print(importance_df.head(10).to_string(index=False))
            return importance_df
        else:
            print(f"Feature importance not available for {model_name}")
            return None
    
    def run_pipeline(self):
        """Run the complete ML pipeline"""
        print("🚀 Starting Laptop Price Prediction ML Pipeline")
        print("-" * 60)
        
        self.prepare_data()
        self.define_models()
        self.train_models()
        self.evaluate_models()
        
        print("\n✅ Pipeline completed successfully!")
        return self.results

# Usage Example:
"""
# Load your data
df = pd.read_csv('your_laptop_data.csv')
test_df = pd.read_csv('your_test_data.csv')  # Optional

# Initialize and run pipeline
pipeline = LaptopPricePredictionPipeline(df, test_df)
results = pipeline.run_pipeline()

# Get feature importance
pipeline.get_feature_importance()

# Make predictions on test data
predictions = pipeline.predict_test_data()

# Use specific model for predictions
predictions = pipeline.predict_test_data('Random_Forest')
"""

"\n# Load your data\ndf = pd.read_csv('your_laptop_data.csv')\ntest_df = pd.read_csv('your_test_data.csv')  # Optional\n\n# Initialize and run pipeline\npipeline = LaptopPricePredictionPipeline(df, test_df)\nresults = pipeline.run_pipeline()\n\n# Get feature importance\npipeline.get_feature_importance()\n\n# Make predictions on test data\npredictions = pipeline.predict_test_data()\n\n# Use specific model for predictions\npredictions = pipeline.predict_test_data('Random_Forest')\n"

In [5]:
pipeline = LaptopPricePredictionPipeline(df_train, df_test)
results = pipeline.run_pipeline()

🚀 Starting Laptop Price Prediction ML Pipeline
------------------------------------------------------------
=== Data Preparation ===
Dataset shape: (977, 44)
Target range: $1,706,374.80 - $54,232,308.00

⚠️  Missing values found:
weight_kg    44
dtype: int64
Training set: (781, 44)
Validation set: (196, 44)
✓ Data preparation completed

=== Model Training with Hyperparameter Tuning ===

Training Linear_Regression...
✓ Linear_Regression trained successfully

Training Ridge_Regression...
✓ Best params for Ridge_Regression: {'regressor__alpha': 1.0, 'regressor__solver': 'svd'}

Training Lasso_Regression...
✓ Best params for Lasso_Regression: {'regressor__alpha': 0.01, 'regressor__selection': 'cyclic'}

Training ElasticNet_Regression...
✓ Best params for ElasticNet_Regression: {'regressor__alpha': 0.01, 'regressor__l1_ratio': 0.9}

Training Random_Forest...


KeyboardInterrupt: 

In [None]:
pipeline.get_feature_importance()

In [None]:
predictions = pipeline.predict_test_data()

In [None]:
predictions = pipeline.predict_test_data('Random_Forest')