In [66]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import warnings
from sklearn.exceptions import ConvergenceWarning
# Suppress the LinAlgWarning
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

In [2]:
df = pd.read_csv('laptop_price.csv', encoding='ISO-8859-1')
df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   Product           1303 non-null   object 
 3   TypeName          1303 non-null   object 
 4   Inches            1303 non-null   float64
 5   ScreenResolution  1303 non-null   object 
 6   Cpu               1303 non-null   object 
 7   Ram               1303 non-null   object 
 8   Memory            1303 non-null   object 
 9   Gpu               1303 non-null   object 
 10  OpSys             1303 non-null   object 
 11  Weight            1303 non-null   object 
 12  Price_euros       1303 non-null   float64
dtypes: float64(2), int64(1), object(10)
memory usage: 132.5+ KB


In [4]:
df.nunique()

laptop_ID           1303
Company               19
Product              618
TypeName               6
Inches                18
ScreenResolution      40
Cpu                  118
Ram                    9
Memory                39
Gpu                  110
OpSys                  9
Weight               179
Price_euros          791
dtype: int64

### Check the features that can be working on

In [5]:
df['ScreenResolution'].value_counts()

ScreenResolution
Full HD 1920x1080                                507
1366x768                                         281
IPS Panel Full HD 1920x1080                      230
IPS Panel Full HD / Touchscreen 1920x1080         53
Full HD / Touchscreen 1920x1080                   47
1600x900                                          23
Touchscreen 1366x768                              16
Quad HD+ / Touchscreen 3200x1800                  15
IPS Panel 4K Ultra HD 3840x2160                   12
IPS Panel 4K Ultra HD / Touchscreen 3840x2160     11
4K Ultra HD / Touchscreen 3840x2160               10
4K Ultra HD 3840x2160                              7
Touchscreen 2560x1440                              7
IPS Panel 1366x768                                 7
IPS Panel Quad HD+ / Touchscreen 3200x1800         6
IPS Panel Retina Display 2560x1600                 6
IPS Panel Retina Display 2304x1440                 6
Touchscreen 2256x1504                              6
IPS Panel Touchscreen 2560x14

In [6]:
df['Cpu'].value_counts()

Cpu
Intel Core i5 7200U 2.5GHz       190
Intel Core i7 7700HQ 2.8GHz      146
Intel Core i7 7500U 2.7GHz       134
Intel Core i7 8550U 1.8GHz        73
Intel Core i5 8250U 1.6GHz        72
                                ... 
Intel Core M M3-6Y30 0.9GHz        1
AMD A9-Series 9420 2.9GHz          1
Intel Core i3 6006U 2.2GHz         1
AMD A6-Series 7310 2GHz            1
Intel Xeon E3-1535M v6 3.1GHz      1
Name: count, Length: 118, dtype: int64

In [47]:
df['Weight'].value_counts()

Weight
2.2kg     121
2.1kg      58
2.4kg      44
2.3kg      41
2.5kg      38
         ... 
3.25kg      1
4.7kg       1
1.55kg      1
1.18kg      1
4.0kg       1
Name: count, Length: 179, dtype: int64

In [7]:
df['Ram'].value_counts()

Ram
8GB     619
4GB     375
16GB    200
6GB      41
12GB     25
2GB      22
32GB     17
24GB      3
64GB      1
Name: count, dtype: int64

In [8]:
df['Memory'].value_counts()

Memory
256GB SSD                        412
1TB HDD                          223
500GB HDD                        132
512GB SSD                        118
128GB SSD +  1TB HDD              94
128GB SSD                         76
256GB SSD +  1TB HDD              73
32GB Flash Storage                38
2TB HDD                           16
64GB Flash Storage                15
512GB SSD +  1TB HDD              14
1TB SSD                           14
256GB SSD +  2TB HDD              10
1.0TB Hybrid                       9
256GB Flash Storage                8
16GB Flash Storage                 7
32GB SSD                           6
180GB SSD                          5
128GB Flash Storage                4
512GB SSD +  2TB HDD               3
16GB SSD                           3
512GB Flash Storage                2
1TB SSD +  1TB HDD                 2
256GB SSD +  500GB HDD             2
128GB SSD +  2TB HDD               2
256GB SSD +  256GB SSD             2
512GB SSD +  256GB SSD         

In [9]:
df['Gpu'].value_counts()

Gpu
Intel HD Graphics 620      281
Intel HD Graphics 520      185
Intel UHD Graphics 620      68
Nvidia GeForce GTX 1050     66
Nvidia GeForce GTX 1060     48
                          ... 
AMD Radeon R5 520            1
AMD Radeon R7                1
Intel HD Graphics 540        1
AMD Radeon 540               1
ARM Mali T860 MP4            1
Name: count, Length: 110, dtype: int64

### 1-Ram

#### the extracted feature is Ram 

In [10]:
dic_Ram={'2GB':2,
    '4GB':4,
    '6GB':6,
    '8GB':8,
    '12GB':12,
    '16GB':16,
    '24GB':24,
    '32GB':32,
    '64GB':64}

df['Ram']=df['Ram'].map(dic_Ram)
df['Ram'].value_counts()

Ram
8     619
4     375
16    200
6      41
12     25
2      22
32     17
24      3
64      1
Name: count, dtype: int64

### 2-Gpu

#### The extracted feature is Gpu_manifactoreur

In [11]:
df['Gpu_manifactoreur']=df['Gpu'].str.split(' ').str[0]

In [12]:
df['Gpu_manifactoreur'].nunique()

4

### 3-Cpu 

#### the extracted features are Cpu_manifactoreur and Cpu_frequency_norm

In [13]:
df['Cpu_manifactoreur']=df['Cpu'].str.split(' ').str[0]

In [14]:
df['Cpu_manifactoreur'].nunique()

3

In [15]:
df['Cpu_frequency']=df['Cpu'].str.split(' ').str[-1]

In [16]:
df['Cpu_frequency']

0       2.3GHz
1       1.8GHz
2       2.5GHz
3       2.7GHz
4       3.1GHz
         ...  
1298    2.5GHz
1299    2.5GHz
1300    1.6GHz
1301    2.5GHz
1302    1.6GHz
Name: Cpu_frequency, Length: 1303, dtype: object

In [17]:
df['Cpu_frequency_norm']=df['Cpu_frequency'].str.split('G').str[0].astype(float)

In [18]:
df['Cpu_frequency_norm']

0       2.3
1       1.8
2       2.5
3       2.7
4       3.1
       ... 
1298    2.5
1299    2.5
1300    1.6
1301    2.5
1302    1.6
Name: Cpu_frequency_norm, Length: 1303, dtype: float64

### 4-Memory

In [19]:
df['Memory'].value_counts()

Memory
256GB SSD                        412
1TB HDD                          223
500GB HDD                        132
512GB SSD                        118
128GB SSD +  1TB HDD              94
128GB SSD                         76
256GB SSD +  1TB HDD              73
32GB Flash Storage                38
2TB HDD                           16
64GB Flash Storage                15
512GB SSD +  1TB HDD              14
1TB SSD                           14
256GB SSD +  2TB HDD              10
1.0TB Hybrid                       9
256GB Flash Storage                8
16GB Flash Storage                 7
32GB SSD                           6
180GB SSD                          5
128GB Flash Storage                4
512GB SSD +  2TB HDD               3
16GB SSD                           3
512GB Flash Storage                2
1TB SSD +  1TB HDD                 2
256GB SSD +  500GB HDD             2
128GB SSD +  2TB HDD               2
256GB SSD +  256GB SSD             2
512GB SSD +  256GB SSD         

In [20]:
df['Memory_size']=df['Memory'].str.split(' ').str[2]
df['Memory_size'].value_counts()

Memory_size
+          207
Storage     75
Name: count, dtype: int64

memory has ssd 


In [21]:
df['Memory_has_ssd']=df['Memory'].str.split(' ').str[1]
df['Memory_has_ssd'].value_counts()


Memory_has_ssd
SSD       843
HDD       375
Flash      75
Hybrid     10
Name: count, dtype: int64

In [22]:
dic_memmory_ssd={'SSD':1,
    'HDD':0,
    'Flash':0,
    'Hybrid':1,
    }

df['Memory_has_ssd']=df['Memory_has_ssd'].map(dic_memmory_ssd)
df['Memory_has_ssd'].value_counts()

Memory_has_ssd
1    853
0    450
Name: count, dtype: int64

In [23]:
df['Memory_has_ssd'].head()

0    1
1    0
2    1
3    1
4    1
Name: Memory_has_ssd, dtype: int64

primary storage

In [24]:
df['Memory_size']=df['Memory'].str.split(' ').str[0]
df['Memory_size_bygiga']=df['Memory_size'].str.split('G').str[0]



df['Memory_size_bygiga'].value_counts()

Memory_size_bygiga
256      508
1TB      240
128      177
512      140
500      132
32        45
64        17
2TB       16
1.0TB     10
16        10
180        5
240        1
8          1
508        1
Name: count, dtype: int64

In [25]:
dic_memmory_Size_giga={'1TB':1024,
    '2TB':2048,
    '1.0TB':1024,
    '256':256,
    '128':128,
    '512':512,
    '500':500,
    '32':32,
    '64':64,
    '16':16, 
    '180':180, 
    '240':240, 
    '8':8,
    '508':508   
    }

df['Memory_size_norm']=df['Memory_size_bygiga'].map(dic_memmory_Size_giga)
df['Memory_size_norm'].value_counts()

Memory_size_norm
256     508
1024    250
128     177
512     140
500     132
32       45
64       17
2048     16
16       10
180       5
240       1
8         1
508       1
Name: count, dtype: int64

has additional storage 


In [26]:
df['Memory_has_ssd']=df['Memory'].str.split(' ').str[2]
df['Memory_has_ssd'].value_counts()


Memory_has_ssd
+          207
Storage     75
Name: count, dtype: int64

In [27]:
dic_memmory_additional_memory={'+':1,
                              'Storage':0,
                              }

df['Memory_has_additional_Storage']=df['Memory_has_ssd'].map(dic_memmory_additional_memory)

df['Memory_has_additional_Storage'].value_counts()

Memory_has_additional_Storage
1.0    207
0.0     75
Name: count, dtype: int64

In [28]:
df['Memory_has_additional_Storage'].head()

0    NaN
1    0.0
2    NaN
3    NaN
4    NaN
Name: Memory_has_additional_Storage, dtype: float64

### 5-Weight

In [50]:
df['Weight1']=df['Weight'].str.split('k').str[0].astype(float)
df['Weight1'].value_counts()

Weight1
2.20    126
2.10     58
2.00     45
2.40     44
2.30     41
       ... 
4.50      1
1.14      1
3.80      1
3.25      1
2.34      1
Name: count, Length: 171, dtype: int64

In [51]:
df.head(10)

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,...,Gpu_manifactoreur,Cpu_manifactoreur,Cpu_frequency,Cpu_frequency_norm,Memory_size,Memory_has_ssd,Memory_size_bygiga,Memory_size_norm,Memory_has_additional_Storage,Weight1
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,...,Intel,Intel,2.3GHz,2.3,128GB,,128,128,,1.37
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,...,Intel,Intel,1.8GHz,1.8,128GB,Storage,128,128,0.0,1.34
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,...,Intel,Intel,2.5GHz,2.5,256GB,,256,256,,1.86
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,...,AMD,Intel,2.7GHz,2.7,512GB,,512,512,,1.83
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,...,Intel,Intel,3.1GHz,3.1,256GB,,256,256,,1.37
5,6,Acer,Aspire 3,Notebook,15.6,1366x768,AMD A9-Series 9420 3GHz,4,500GB HDD,AMD Radeon R5,...,AMD,AMD,3GHz,3.0,500GB,,500,500,,2.1
6,7,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.2GHz,16,256GB Flash Storage,Intel Iris Pro Graphics,...,Intel,Intel,2.2GHz,2.2,256GB,Storage,256,256,0.0,2.04
7,8,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,256GB Flash Storage,Intel HD Graphics 6000,...,Intel,Intel,1.8GHz,1.8,256GB,Storage,256,256,0.0,1.34
8,9,Asus,ZenBook UX430UN,Ultrabook,14.0,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,16,512GB SSD,Nvidia GeForce MX150,...,Nvidia,Intel,1.8GHz,1.8,512GB,,512,512,,1.3
9,10,Acer,Swift 3,Ultrabook,14.0,IPS Panel Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8,256GB SSD,Intel UHD Graphics 620,...,Intel,Intel,1.6GHz,1.6,256GB,,256,256,,1.6


In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 23 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   laptop_ID                      1303 non-null   int64  
 1   Company                        1303 non-null   object 
 2   Product                        1303 non-null   object 
 3   TypeName                       1303 non-null   object 
 4   Inches                         1303 non-null   float64
 5   ScreenResolution               1303 non-null   object 
 6   Cpu                            1303 non-null   object 
 7   Ram                            1303 non-null   int64  
 8   Memory                         1303 non-null   object 
 9   Gpu                            1303 non-null   object 
 10  OpSys                          1303 non-null   object 
 11  Weight                         1303 non-null   object 
 12  Price_euros                    1303 non-null   f

In [53]:
df['Memory_size_norm'].value_counts()

Memory_size_norm
256     508
1024    250
128     177
512     140
500     132
32       45
64       17
2048     16
16       10
180       5
240       1
8         1
508       1
Name: count, dtype: int64

# Get X and Y

In [55]:
x=df.drop(columns=['laptop_ID','Product','Cpu','Memory','Gpu','Price_euros','Cpu_frequency','Memory_size','Memory_has_ssd','Memory_size_bygiga','Memory_has_additional_Storage','ScreenResolution','Weight'])
x.head()

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Gpu_manifactoreur,Cpu_manifactoreur,Cpu_frequency_norm,Memory_size_norm,Weight1
0,Apple,Ultrabook,13.3,8,macOS,Intel,Intel,2.3,128,1.37
1,Apple,Ultrabook,13.3,8,macOS,Intel,Intel,1.8,128,1.34
2,HP,Notebook,15.6,8,No OS,Intel,Intel,2.5,256,1.86
3,Apple,Ultrabook,15.4,16,macOS,AMD,Intel,2.7,512,1.83
4,Apple,Ultrabook,13.3,8,macOS,Intel,Intel,3.1,256,1.37


In [61]:
encoded_x = pd.get_dummies(x, columns=['Company','TypeName', 'OpSys','Gpu_manifactoreur','Cpu_manifactoreur'])
encoded_x.head()

Unnamed: 0,Inches,Ram,Cpu_frequency_norm,Memory_size_norm,Weight1,Company_Acer,Company_Apple,Company_Asus,Company_Chuwi,Company_Dell,...,OpSys_Windows 10 S,OpSys_Windows 7,OpSys_macOS,Gpu_manifactoreur_AMD,Gpu_manifactoreur_ARM,Gpu_manifactoreur_Intel,Gpu_manifactoreur_Nvidia,Cpu_manifactoreur_AMD,Cpu_manifactoreur_Intel,Cpu_manifactoreur_Samsung
0,13.3,8,2.3,128,1.37,False,True,False,False,False,...,False,False,True,False,False,True,False,False,True,False
1,13.3,8,1.8,128,1.34,False,True,False,False,False,...,False,False,True,False,False,True,False,False,True,False
2,15.6,8,2.5,256,1.86,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
3,15.4,16,2.7,512,1.83,False,True,False,False,False,...,False,False,True,True,False,False,False,False,True,False
4,13.3,8,3.1,256,1.37,False,True,False,False,False,...,False,False,True,False,False,True,False,False,True,False


In [69]:
encoded_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 46 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Inches                       1303 non-null   float64
 1   Ram                          1303 non-null   int64  
 2   Cpu_frequency_norm           1303 non-null   float64
 3   Memory_size_norm             1303 non-null   int64  
 4   Weight1                      1303 non-null   float64
 5   Company_Acer                 1303 non-null   bool   
 6   Company_Apple                1303 non-null   bool   
 7   Company_Asus                 1303 non-null   bool   
 8   Company_Chuwi                1303 non-null   bool   
 9   Company_Dell                 1303 non-null   bool   
 10  Company_Fujitsu              1303 non-null   bool   
 11  Company_Google               1303 non-null   bool   
 12  Company_HP                   1303 non-null   bool   
 13  Company_Huawei    

In [62]:
y=df['Price_euros']
y.head()

0    1339.69
1     898.94
2     575.00
3    2537.45
4    1803.60
Name: Price_euros, dtype: float64

In [63]:
#Train Test Spliting 
X_train, X_test, y_train, y_test = train_test_split(encoded_x, y, test_size=0.2, random_state=42)

Applying Linear regression

In [64]:
# Create a Linear Regression model
model = LinearRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

In [67]:
y_pred = model.predict(X_test)

print("MAE:", mean_absolute_error(y_pred, y_test))
print("MSE:", mean_squared_error(y_pred, y_test))
print("r2 score:", r2_score(y_pred, y_test))

MAE: 260.57562265122453
MSE: 145826.25989947107
r2 score: 0.5785280443923413


Applying randomforest regressor

In [73]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [74]:
CV = []
train_scores = []
test_scores = []

def pred_model(model2):
    # Training model
    model2.fit(X_train,y_train)
            
    y_pred_train=model2.predict(X_train)
    
    # score of test set
    y_pred_test=model2.predict(X_test)
    
    # train set using Cross validation
    cross_val = cross_val_score(model2 ,X_train ,y_train ,cv=5)
    cv_mean = cross_val.mean()
    
    # Printing results
    print("Train CV scores :",cross_val)
    print("Train CV mean :",round(cv_mean,2))
    print("Mean r2 score for train :",r2_score(y_train,y_pred_train))
    print("Mean r2 score for test :",r2_score(y_test,y_pred_test))

In [75]:
rf = RandomForestRegressor()
pred_model(rf)

Train CV scores : [0.82732081 0.84530599 0.79917284 0.79650727 0.75297082]
Train CV mean : 0.8
Mean r2 score for train : 0.9727321508406614
Mean r2 score for test : 0.8089630265273214


In [76]:
dic={'max_depth': [20, 50, 100],
    'max_features': ['sqrt', 'log2'],
    'n_estimators': [20, 50, 100]}

# Creating GridSearchCV to find the best estimator of hyperparameter
GSRF = GridSearchCV(estimator = rf, param_grid=dic)
pred_model(GSRF)
print("best: parameters:", GSRF.best_params_)

Train CV scores : [0.83619224 0.86739361 0.81015475 0.79748812 0.80521067]
Train CV mean : 0.82
Mean r2 score for train : 0.9710867715426187
Mean r2 score for test : 0.8112064279140121
best: parameters: {'max_depth': 50, 'max_features': 'sqrt', 'n_estimators': 50}
