## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## Importing the dataset over here

In [2]:
data=pd.read_csv("cleaned.csv")

In [3]:
data.head()

Unnamed: 0,Brand,Product_Description,Screen_Size,RAM,Processor,GPU,GPU_Type,Resolution,Condition,Price
0,Lenovo,Lenovo ThinkPad 14” HD Laptop PC Computer Core...,14.0,16,Intel Core i5 7th Gen.,Intel HD Graphics 520,Integrated/On-Board Graphics,,Very Good - Refurbished,189.99
1,Dell,"Dell Latitude 15.6"" Laptop Intel Core i5 64GB ...",15.6,64,Intel Core i5 8th Gen.,Intel UHD Graphics 620,Integrated/On-Board Graphics,1920 x 1080,Very Good - Refurbished,349.99
2,HP,"NEW HP 15 Laptop. 15.6"" 1080p, i5-1135G7, 8GB ...",15.6,8,Intel Core i5 11th Gen.,Intel Iris Xe Graphics,Integrated/On-Board Graphics,1920 x 1080,New,369.95
3,Lenovo,2024 Lenovo Ideapad Laptop 15 FHD Touch Intel ...,15.6,Up,Intel Core i5 11th Gen.,Intel Iris Xe Graphics,Integrated/On-Board Graphics,1920 x 1080,New,459.0
4,Lenovo,"2024 Lenovo Ideapad Laptop 15.6"" FHD Touch Int...",15.6,16,Intel Core i5 11th Gen.,Intel Iris Xe Graphics,Integrated/On-Board Graphics,1920 x 1080,New,499.0


In [4]:
data.shape

(2952, 10)

## Taking care of duplicate observations if present over here

In [5]:
data.duplicated().sum()

0

## Taking care of missing values if present over here

In [6]:
data.isnull().sum()

Brand                    0
Product_Description      0
Screen_Size              0
RAM                      0
Processor                0
GPU                    348
GPU_Type               234
Resolution             387
Condition                0
Price                    0
dtype: int64

In [7]:
missing_values=[feature for feature in data.columns if data[feature].isnull().sum()>1]
for feature in missing_values:
  print(feature)

GPU
GPU_Type
Resolution


In [8]:
data[missing_values]

Unnamed: 0,GPU,GPU_Type,Resolution
0,Intel HD Graphics 520,Integrated/On-Board Graphics,
1,Intel UHD Graphics 620,Integrated/On-Board Graphics,1920 x 1080
2,Intel Iris Xe Graphics,Integrated/On-Board Graphics,1920 x 1080
3,Intel Iris Xe Graphics,Integrated/On-Board Graphics,1920 x 1080
4,Intel Iris Xe Graphics,Integrated/On-Board Graphics,1920 x 1080
...,...,...,...
2947,Intel Iris Xe Graphics,Integrated/On-Board Graphics,1920 x 1080
2948,NVIDIA GeForce RTX 3050,Dedicated Graphics,1920 x 1080
2949,Intel HD Graphics,Integrated/On-Board Graphics,1920 x 1200
2950,Intel Iris Xe Graphics,Integrated/On-Board Graphics,1920 x 1080


## Dropping all the missing values over here

In [9]:
data.dropna(inplace=True)

In [10]:
data.isnull().sum()

Brand                  0
Product_Description    0
Screen_Size            0
RAM                    0
Processor              0
GPU                    0
GPU_Type               0
Resolution             0
Condition              0
Price                  0
dtype: int64

## Filtering all the numerical features over here

In [11]:
numerical_features=[feature for feature in data.columns if data[feature].dtype!="O"]
for feature in numerical_features:
  print(feature)

Screen_Size
Price


In [12]:
data[numerical_features]

Unnamed: 0,Screen_Size,Price
1,15.6,349.99
2,15.6,369.95
3,15.6,459.00
4,15.6,499.00
5,13.5,349.00
...,...,...
2947,13.3,789.99
2948,15.6,895.55
2949,15.6,635.65
2950,15.6,405.99


## Filtering all the categorical features over here

In [13]:
cat_features=[feature for feature in data.columns if data[feature].dtype=="O"]
for feature in cat_features:
  print(feature)

Brand
Product_Description
RAM
Processor
GPU
GPU_Type
Resolution
Condition


In [14]:
data[cat_features]

Unnamed: 0,Brand,Product_Description,RAM,Processor,GPU,GPU_Type,Resolution,Condition
1,Dell,"Dell Latitude 15.6"" Laptop Intel Core i5 64GB ...",64,Intel Core i5 8th Gen.,Intel UHD Graphics 620,Integrated/On-Board Graphics,1920 x 1080,Very Good - Refurbished
2,HP,"NEW HP 15 Laptop. 15.6"" 1080p, i5-1135G7, 8GB ...",8,Intel Core i5 11th Gen.,Intel Iris Xe Graphics,Integrated/On-Board Graphics,1920 x 1080,New
3,Lenovo,2024 Lenovo Ideapad Laptop 15 FHD Touch Intel ...,Up,Intel Core i5 11th Gen.,Intel Iris Xe Graphics,Integrated/On-Board Graphics,1920 x 1080,New
4,Lenovo,"2024 Lenovo Ideapad Laptop 15.6"" FHD Touch Int...",16,Intel Core i5 11th Gen.,Intel Iris Xe Graphics,Integrated/On-Board Graphics,1920 x 1080,New
5,Microsoft,"Microsoft Surface Laptop 5 13.5"" - 512GB SSD, ...",8,Intel Core i5 12th Gen.,Intel Iris Xe Graphics,Integrated/On-Board Graphics,2256 x 1504,Open box
...,...,...,...,...,...,...,...,...
2947,Dell,Dell Latitude 7320 i5-1145G7 8GB 256GB BT5 WiF...,8,Intel Core i5 11th Gen.,Intel Iris Xe Graphics,Integrated/On-Board Graphics,1920 x 1080,Open box
2948,MSI,NEW MSI COMPUTER Sword 15 A12UC-295 Sword15122...,8,Intel Core i5 12th Gen.,NVIDIA GeForce RTX 3050,Dedicated Graphics,1920 x 1080,New
2949,Dell,Dell Precision 5550 Workstation 15 15.6 Laptop...,8,Intel Core i5 10th Gen.,Intel HD Graphics,Integrated/On-Board Graphics,1920 x 1200,Excellent - Refurbished
2950,HP,"HP 15 Laptop 15.6"" FHD i5-1135G7 8GB RAM 256GB...",8,Intel Core i5 11th Gen.,Intel Iris Xe Graphics,Integrated/On-Board Graphics,1920 x 1080,New


## Encoding the categorical features into the numerical features over here

In [15]:
for feature in cat_features:
  feature_mapping={category:index for index,category in enumerate(data[feature].unique())}
  data[feature]=data[feature].map(feature_mapping)

In [16]:
data

Unnamed: 0,Brand,Product_Description,Screen_Size,RAM,Processor,GPU,GPU_Type,Resolution,Condition,Price
1,0,0,15.6,0,0,0,0,0,0,349.99
2,1,1,15.6,1,1,1,0,0,1,369.95
3,2,2,15.6,2,1,1,0,0,1,459.00
4,2,3,15.6,3,1,1,0,0,1,499.00
5,3,4,13.5,1,2,1,0,1,2,349.00
...,...,...,...,...,...,...,...,...,...,...
2947,0,2272,13.3,1,1,1,0,0,2,789.99
2948,8,2273,15.6,1,2,4,1,0,1,895.55
2949,0,2274,15.6,1,11,7,0,2,3,635.65
2950,1,2275,15.6,1,1,1,0,0,1,405.99


## Creating the features and labels over here to apply supervised machine learning over here

In [17]:
X=data.iloc[:,:-1].values
y=data.iloc[:,-1].values

## Splitting the dataset into training set and testing set to avoid the problem of overfitting

In [18]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [19]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)

## Training the model on the training set over here

In [20]:
!pip install catboost



In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import BayesianRidge
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

regressors = {
    "Random Forest": RandomForestRegressor(),
    "Support Vector Machine": SVR(kernel='linear'),
    "Linear Regression": LinearRegression(),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Bayesian Ridge": BayesianRidge(),
    "XGBoost": XGBRegressor(),
    "CatBoost": CatBoostRegressor(),
    "Ridge": Ridge(),
    "Lasso": Lasso()
}

results = {}
for name, regressor in regressors.items():
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    results[name] = mse

best_regressor = min(results, key=results.get)
best_mse = results[best_regressor]

print("Best Regressor:", best_regressor)
print("Mean Squared Error:", best_mse)

Learning rate set to 0.04501
0:	learn: 229.7126391	total: 52.5ms	remaining: 52.5s
1:	learn: 227.0249441	total: 55.2ms	remaining: 27.5s
2:	learn: 224.4561774	total: 59.6ms	remaining: 19.8s
3:	learn: 222.2329495	total: 64.3ms	remaining: 16s
4:	learn: 220.0160031	total: 66.8ms	remaining: 13.3s
5:	learn: 218.0736694	total: 70.5ms	remaining: 11.7s
6:	learn: 216.1672924	total: 73.5ms	remaining: 10.4s
7:	learn: 213.8385028	total: 77.7ms	remaining: 9.64s
8:	learn: 211.7555882	total: 79.4ms	remaining: 8.75s
9:	learn: 210.2206135	total: 81.4ms	remaining: 8.05s
10:	learn: 208.7175371	total: 85.7ms	remaining: 7.71s
11:	learn: 207.5473559	total: 87.6ms	remaining: 7.21s
12:	learn: 205.8817971	total: 90.3ms	remaining: 6.85s
13:	learn: 204.3759090	total: 94.3ms	remaining: 6.64s
14:	learn: 203.2459349	total: 96.1ms	remaining: 6.31s
15:	learn: 201.7533160	total: 99.1ms	remaining: 6.1s
16:	learn: 200.5755282	total: 101ms	remaining: 5.83s
17:	learn: 199.3666644	total: 104ms	remaining: 5.68s
18:	learn: 198

In [22]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

regressor=CatBoostRegressor()
regressor.fit(X_train,y_train)

Learning rate set to 0.04501
0:	learn: 229.7126391	total: 1.41ms	remaining: 1.41s
1:	learn: 227.0249441	total: 2.91ms	remaining: 1.45s
2:	learn: 224.4561774	total: 4.9ms	remaining: 1.63s
3:	learn: 222.2329495	total: 7.82ms	remaining: 1.95s
4:	learn: 220.0160031	total: 10.3ms	remaining: 2.06s
5:	learn: 218.0736694	total: 12.4ms	remaining: 2.06s
6:	learn: 216.1672924	total: 19.5ms	remaining: 2.77s
7:	learn: 213.8385028	total: 21.6ms	remaining: 2.67s
8:	learn: 211.7555882	total: 30.1ms	remaining: 3.32s
9:	learn: 210.2206135	total: 32.2ms	remaining: 3.19s
10:	learn: 208.7175371	total: 40.4ms	remaining: 3.63s
11:	learn: 207.5473559	total: 42.4ms	remaining: 3.49s
12:	learn: 205.8817971	total: 50.6ms	remaining: 3.84s
13:	learn: 204.3759090	total: 52.6ms	remaining: 3.7s
14:	learn: 203.2459349	total: 59.9ms	remaining: 3.93s
15:	learn: 201.7533160	total: 61.5ms	remaining: 3.78s
16:	learn: 200.5755282	total: 62.9ms	remaining: 3.63s
17:	learn: 199.3666644	total: 66.1ms	remaining: 3.61s
18:	learn: 

<catboost.core.CatBoostRegressor at 0x7996acbf7eb0>

## Evaluating the performance of the model on the testing dataset over here

In [23]:
y_pred=regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[781.4  666.  ]
 [344.88 379.95]
 [512.58 459.99]
 [634.31 999.99]
 [598.79 525.  ]
 [323.5  279.99]
 [648.95 700.  ]
 [585.56 849.95]
 [377.8  260.  ]
 [640.97 589.  ]
 [702.77 674.99]
 [682.75 949.  ]
 [619.45 929.99]
 [411.33 499.  ]
 [606.71 880.  ]
 [480.17 479.99]
 [481.57 349.  ]
 [723.47 919.99]
 [645.39 612.99]
 [332.   386.96]
 [354.96 349.99]
 [558.25 839.  ]
 [760.73 899.99]
 [781.25 843.61]
 [716.62 825.  ]
 [705.8  531.15]
 [416.53 589.99]
 [579.99 589.  ]
 [387.1  349.  ]
 [559.7  349.75]
 [456.48 240.  ]
 [448.17 599.99]
 [356.2  332.1 ]
 [341.06 319.99]
 [255.88 269.99]
 [596.68 599.99]
 [423.96 558.83]
 [877.88 999.99]
 [416.11 650.  ]
 [443.25 559.99]
 [470.17 441.99]
 [469.26 299.  ]
 [470.78 405.  ]
 [229.91 299.95]
 [615.46 805.98]
 [270.64 289.  ]
 [305.32 259.95]
 [421.55 369.99]
 [402.66 399.  ]
 [557.77 800.  ]
 [794.61 955.  ]
 [322.17 265.  ]
 [515.59 459.  ]
 [374.34 930.13]
 [625.84 569.99]
 [828.45 999.99]
 [652.12 695.  ]
 [633.7  816.33]
 [700.04 900. 

In [25]:
actual_vs_predicted = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(actual_vs_predicted)

     Actual   Predicted
0    666.00  781.398534
1    379.95  344.878041
2    459.99  512.579540
3    999.99  634.311016
4    525.00  598.785550
..      ...         ...
451  424.15  376.489768
452  290.00  784.339519
453  800.00  700.644309
454  242.10  214.643020
455  514.41  475.067841

[456 rows x 2 columns]


In [26]:
actual_vs_predicted['Absolute Difference'] = abs(actual_vs_predicted['Actual'] - actual_vs_predicted['Predicted'])

In [29]:
actual_vs_predicted[actual_vs_predicted['Absolute Difference']<20]

Unnamed: 0,Actual,Predicted,Absolute Difference
15,479.99,480.171178,0.181178
20,349.99,354.956898,4.966898
27,589.0,579.990205,9.009795
34,269.99,255.878793,14.111207
35,599.99,596.680767,3.309233
45,289.0,270.642772,18.357228
48,399.0,402.659159,3.659159
62,620.0,616.861102,3.138898
74,233.95,246.309927,12.359927
81,390.0,402.376646,12.376646
