In [10]:
import datacmp
import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [11]:
# Read the laptop dataset from the notebook's working directory and preview
# the first five rows as the cell's output.
df = pd.read_csv(filepath_or_buffer='cleaned_laptop_data.csv')
df.head(n=5)

Unnamed: 0,Company,TypeName,Ram,Weight,Price,TouchScreen,IPS,ppi,CpuBrand,SSD,HDD,GpuBrand,os
0,Apple,Ultrabook,8,1.37,856.544198,0,1,226.983005,Intel Core i5,128,0,Intel,macOS
1,Apple,Ultrabook,8,1.34,574.746278,0,0,127.67794,Intel Core i5,0,0,Intel,macOS
2,HP,Notebook,8,1.86,367.632,0,0,141.211998,Intel Core i5,256,0,Intel,No OS
3,Apple,Ultrabook,16,1.83,1622.344032,0,1,220.534624,Intel Core i7,512,0,AMD,macOS
4,Apple,Ultrabook,8,1.37,1153.149696,0,1,226.983005,Intel Core i5,256,0,Intel,macOS


In [12]:
# Print datacmp's detailed summary of `df`. Per the recorded output it shows the
# frame's dimensions followed by a per-column table (dtype, null / non-null
# counts, mean, number of unique values).
# NOTE(review): the second "Number of Rows" label in that output (value 13)
# looks like the column count mislabelled by datacmp — confirm against df.shape.
print(datacmp.get_detailed(df))

╭────────────────┬──────────╮
│ Info           │   Values │
├────────────────┼──────────┤
│ Number of Rows │     1302 │
│ Number of Rows │       13 │
╰────────────────┴──────────╯

╭───────────────┬─────────┬────────┬────────────┬────────┬──────────────╮
│ Column Name   │ Dtype   │   Null │   Not Null │ Mean   │   Unique Val │
├───────────────┼─────────┼────────┼────────────┼────────┼──────────────┤
│ Company       │ object  │      0 │       1302 │ -      │           19 │
│ TypeName      │ object  │      0 │       1302 │ -      │            6 │
│ Ram           │ int64   │      0 │       1302 │ 8.39   │            9 │
│ Weight        │ float64 │      0 │       1302 │ 2.04   │          171 │
│ Price         │ float64 │      0 │       1302 │ 718.67 │          791 │
│ TouchScreen   │ int64   │      0 │       1302 │ 0.15   │            2 │
│ IPS           │ int64   │      0 │       1302 │ 0.28   │            2 │
│ ppi           │ float64 │      0 │       1302 │ 146.57 │           39 │
│ Cpu

In [13]:
# Separate the target from the predictors. The target is the natural log of
# Price, so everything fitted or scored against `y` works on the log scale.
y = np.log(df.loc[:, 'Price'])
X = df.drop(labels='Price', axis='columns')

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [15]:
# Partition the feature names by dtype: object-typed columns form the
# categorical group, every remaining column forms the numerical group.
numerical_cols = list(X.select_dtypes(exclude=['object']).columns)
categorical_cols = list(X.select_dtypes(include=['object']).columns)

In [16]:
# Shared preprocessing: one-hot encode the categorical columns (categories not
# seen during fit are ignored instead of raising) and standardise the numeric ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('scale', StandardScaler(), numerical_cols),
    ],
)

In [17]:
# Candidate regressors, keyed by the display name used in the results table.
# Every estimator that draws random numbers is given the same fixed seed so a
# fresh Restart & Run All reproduces the same scores and the same ranking.
MODEL_SEED = 42

models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    # NOTE(review): with its default regularisation strength Lasso scored
    # R2 ~ 0 in the recorded results (no better than predicting the mean);
    # consider a much smaller alpha if Lasso should be a serious candidate.
    'Lasso': Lasso(),
    'KNN': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(random_state=MODEL_SEED),
    'Random Forest': RandomForestRegressor(random_state=MODEL_SEED),
    'Gradient Boosting': GradientBoostingRegressor(random_state=MODEL_SEED),
    'AdaBoost': AdaBoostRegressor(random_state=MODEL_SEED),
    'Extra Trees': ExtraTreesRegressor(random_state=MODEL_SEED),
    'SVR': SVR(),
    'XGBoost': XGBRegressor(random_state=MODEL_SEED),
}

In [18]:
# Fit every candidate on the training split and score it on the held-out split.
# NOTE: y is log(Price), so MAE and RMSE are in log-price units, not currency.
results = []

for name, model in models.items():
    # Pipeline does not copy its steps, so clone both pieces: each candidate
    # then owns its own encoder/scaler and estimator instead of refitting the
    # shared `preprocessor` and the instances stored in `models`.
    pipe = Pipeline(
        [
            ('preprocessing', clone(preprocessor)),
            ('regressor', clone(model))
        ]
    )
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # One record per model; the list is turned into a DataFrame in a single step later.
    results.append(
        {
            'Model': name,
            'R2 Score': r2,
            'MAE': mae,
            'RMSE': rmse
        }
    )

In [19]:
# Collect the per-model records into a table ranked from best to worst R2,
# renumber the rows to match the ranking, and print it.
results_df = (
    pd.DataFrame(results)
    .sort_values(by='R2 Score', ascending=False)
    .reset_index(drop=True)
)
print(results_df)

                Model  R2 Score       MAE      RMSE
0                 SVR  0.869619  0.161823  0.215333
1   Gradient Boosting  0.866289  0.169706  0.218066
2       Random Forest  0.858915  0.167180  0.223998
3             XGBoost  0.843280  0.163305  0.236084
4                 KNN  0.842075  0.176776  0.236990
5         Extra Trees  0.838304  0.172347  0.239802
6               Ridge  0.804242  0.201909  0.263854
7   Linear Regression  0.800634  0.203649  0.266274
8            AdaBoost  0.798471  0.216411  0.267715
9       Decision Tree  0.770134  0.207237  0.285918
10              Lasso -0.000876  0.481091  0.596614


In [20]:
# Refit the top-ranked candidate on the full dataset and persist it.
# The winner is looked up from the comparison table instead of being
# hard-coded, so the saved model always matches the results actually produced.
best_name = results_df.loc[results_df['R2 Score'].idxmax(), 'Model']

# Clone both steps so the saved pipeline owns fresh, independently fitted
# objects rather than the shared `preprocessor` / `models` instances.
best_model = Pipeline([
    ('preprocessing', clone(preprocessor)),
    ('regressor', clone(models[best_name]))
])

# The pipeline is trained on log(Price): apply np.exp to its predictions to
# get prices back on the original scale.
best_model.fit(X, y)

joblib.dump(best_model, 'best_laptop_price_model.pkl')

print("Model saved as 'best_laptop_price_model.pkl'")

Model saved as 'best_laptop_price_model.pkl'
