## Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

## Read CSV file

In [2]:
data = pd.read_csv('data/laptopData.csv')

## Show the dataframe

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2.0,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [4]:
data.describe(include='all')

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
count,1273.0,1273,1273,1273.0,1273,1273,1273,1273,1273,1273,1273,1273.0
unique,,19,6,25.0,40,118,10,40,110,9,189,
top,,Lenovo,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,Windows 10,2.2kg,
freq,,290,710,640.0,495,183,601,401,271,1047,111,
mean,652.674784,,,,,,,,,,,59955.814073
std,376.493027,,,,,,,,,,,37332.251005
min,0.0,,,,,,,,,,,9270.72
25%,327.0,,,,,,,,,,,31914.72
50%,652.0,,,,,,,,,,,52161.12
75%,980.0,,,,,,,,,,,79333.3872


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1273 non-null   float64
 1   Company           1273 non-null   object 
 2   TypeName          1273 non-null   object 
 3   Inches            1273 non-null   object 
 4   ScreenResolution  1273 non-null   object 
 5   Cpu               1273 non-null   object 
 6   Ram               1273 non-null   object 
 7   Memory            1273 non-null   object 
 8   Gpu               1273 non-null   object 
 9   OpSys             1273 non-null   object 
 10  Weight            1273 non-null   object 
 11  Price             1273 non-null   float64
dtypes: float64(2), object(10)
memory usage: 122.3+ KB


In [6]:
data = data.drop('Unnamed: 0',axis=1)

In [7]:
unique_values = data.nunique()
print("\nNumber of Unique Values:")
print(unique_values)


Number of Unique Values:
Company              19
TypeName              6
Inches               25
ScreenResolution     40
Cpu                 118
Ram                  10
Memory               40
Gpu                 110
OpSys                 9
Weight              189
Price               777
dtype: int64


In [8]:
missing_values = data.isnull().sum()
print("Missing Values:")
print(missing_values)

Missing Values:
Company             30
TypeName            30
Inches              30
ScreenResolution    30
Cpu                 30
Ram                 30
Memory              30
Gpu                 30
OpSys               30
Weight              30
Price               30
dtype: int64


## Replace '?' value with non value

In [10]:
data['Weight'] = data['Weight'].replace('?', np.nan)

## Build piplines

In [11]:
cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot',OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ("scaler",StandardScaler(with_mean=False))
])

In [12]:
categorical_columns= data.select_dtypes(include=['object']).columns

In [13]:
col_trans = ColumnTransformer(transformers=[
    ('cat_pipeline',cat_pipeline,categorical_columns)
    ])

In [14]:
data_1 = data.copy()

## Fit and transform x with the pipline

In [15]:
x = col_trans.fit_transform(data_1.drop('Price',axis=1))



In [16]:
x

array([[0.        , 7.94128451, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 7.94128451, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 3.08035243, ..., 0.        , 0.        ,
        0.        ]])

In [18]:
impute = SimpleImputer(strategy='mean')
y_reshaped = data_1['Price'].values.reshape(-1, 1)
y =impute.fit_transform(y_reshaped)
y.shape

(1303, 1)

In [20]:
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [21]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list = []

for model_name, model in models.items():
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_r2 = r2_score(y_train, y_train_pred)
    model_test_r2 = r2_score(y_test, y_test_pred)
    
    print(model_name)
    print('Model performance for Training set')
    print("- R2 Score: {:.4f}".format(model_train_r2))
    print('----------------------------------')
    print('Model performance for Test set')
    print("- R2 Score: {:.4f}".format(model_test_r2))
    print('='*35)
    print('\n')
    
    model_list.append(model_name)
    r2_list.append(model_test_r2)


Linear Regression
Model performance for Training set
- R2 Score: 0.9231
----------------------------------
Model performance for Test set
- R2 Score: -4397906183796520682359095296.0000




  model = cd_fast.enet_coordinate_descent(


Lasso
Model performance for Training set
- R2 Score: 0.9601
----------------------------------
Model performance for Test set
- R2 Score: 0.8219


Ridge
Model performance for Training set
- R2 Score: 0.9601
----------------------------------
Model performance for Test set
- R2 Score: 0.8253


K-Neighbors Regressor
Model performance for Training set
- R2 Score: 0.7644
----------------------------------
Model performance for Test set
- R2 Score: 0.5572


Decision Tree
Model performance for Training set
- R2 Score: 0.9996
----------------------------------
Model performance for Test set
- R2 Score: 0.7160




  return fit_method(estimator, *args, **kwargs)


Random Forest Regressor
Model performance for Training set
- R2 Score: 0.9713
----------------------------------
Model performance for Test set
- R2 Score: 0.7514


XGBRegressor
Model performance for Training set
- R2 Score: 0.9787
----------------------------------
Model performance for Test set
- R2 Score: 0.7954


CatBoosting Regressor
Model performance for Training set
- R2 Score: 0.9520
----------------------------------
Model performance for Test set
- R2 Score: 0.8255




  y = column_or_1d(y, warn=True)


AdaBoost Regressor
Model performance for Training set
- R2 Score: 0.4453
----------------------------------
Model performance for Test set
- R2 Score: 0.3884




In [22]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
7,CatBoosting Regressor,0.8255467
2,Ridge,0.8252955
1,Lasso,0.8218677
6,XGBRegressor,0.7953845
5,Random Forest Regressor,0.751373
4,Decision Tree,0.716028
3,K-Neighbors Regressor,0.5571814
8,AdaBoost Regressor,0.3883816
0,Linear Regression,-4.397906e+27


In [23]:
reg_model = Ridge(fit_intercept=True)
reg_model = reg_model.fit(X_train, y_train)
y_pred = reg_model.predict(X_test)
score = r2_score(y_test, y_pred)*100
print(f" Accuracy of the model is {score} %")

 Accuracy of the model is 82.52954730306661 %


In [24]:
pred_df = pd.DataFrame({'Actual Value': y_test.flatten(), 'Predicted Value': y_pred.flatten(), 'Difference': (y_test - y_pred).flatten()})
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
0,89084.160000,88502.770074,581.389926
1,61218.720000,72284.508827,-11065.788827
2,26586.720000,36666.541292,-10079.821292
3,47898.720000,52639.929557,-4741.209557
4,59955.814073,51899.532758,8056.281316
...,...,...,...
256,69210.720000,58553.659549,10657.060451
257,55677.600000,59459.110364,-3781.510364
258,153705.340800,111516.240241,42189.100559
259,103523.040000,129927.792956,-26404.752956
