In [47]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [3]:
df = pd.read_csv('laptops.csv')
df.columns = df.columns.str.lower().str.replace(' ', '_')

laptop             0
status             0
brand              0
model              0
cpu                0
ram                0
storage            0
storage_type      42
gpu             1371
screen             4
touch              0
final_price        0
dtype: int64

In [52]:
df['status'].unique()

array(['New', 'Refurbished'], dtype=object)

In [70]:
categorical_cols = ['status', 'brand', 'model', 'cpu', 'storage_type', 'gpu', 'touch']
df = pd.get_dummies(df, columns=categorical_cols)
for col in df.columns:
    if col =='laptop':
        break
    else:
        df[col].fillna(df[col].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [71]:
df_train.columns

Index(['laptop', 'ram', 'storage', 'screen', 'final_price', 'status_0',
       'status_1', 'brand_Acer', 'brand_Alurin', 'brand_Apple',
       ...
       'gpu_RX 6800S', 'gpu_RX 7600S', 'gpu_Radeon Pro 5300M',
       'gpu_Radeon Pro 5500M', 'gpu_Radeon Pro RX 560X', 'gpu_T 1200',
       'gpu_T 500', 'gpu_T 550', 'touch_No', 'touch_Yes'],
      dtype='object', length=206)

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2160 entries, 0 to 2159
Columns: 230 entries, ram to touch_Yes
dtypes: bool(226), float64(2), int64(2)
memory usage: 544.3 KB


In [None]:
X_train = df_train.drop('final_price',axis=1)
y_train = df_train['final_price']
model.fit(X_train, y_train)

In [5]:
df.describe()

Unnamed: 0,ram,storage,screen,final_price
count,2160.0,2160.0,2156.0,2160.0
mean,15.413889,596.294444,15.168112,1312.638509
std,9.867815,361.220506,1.203329,911.475417
min,4.0,0.0,10.1,201.05
25%,8.0,256.0,14.0,661.0825
50%,16.0,512.0,15.6,1031.945
75%,16.0,1000.0,15.6,1708.97
max,128.0,4000.0,18.0,7150.47


#### Train, validation, test split implementation

In [7]:
n = len(df)

n_val = int( n* 0.2)
n_test = int( n* 0.2)
n_train = n - n_test - n_val

In [10]:
df_train = df.iloc[n_train:]
df_val = df.iloc[n_train:n_train+n_val]
df_test = df.iloc[n_train+n_val:]

In [11]:
idx = np.arange(n)

In [16]:
np.random.seed(42)
np.random.shuffle(idx)

In [17]:
df_train = df.iloc[idx[n_train:]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

In [18]:
df_train

Unnamed: 0,laptop,status,brand,model,cpu,ram,storage,storage_type,gpu,screen,touch,final_price
375,Acer Predator Helios 300 PH315-55-7174 Intel C...,New,Acer,Predator,Intel Core i7,16,512,SSD,RTX 3060,15.6,No,1779.00
407,Acer Predator Triton 300 SE PT316-51s-74LT Int...,New,Acer,Predator,Intel Core i7,16,512,SSD,RTX 3050,16.0,No,1589.01
872,ASUS VivoBook F1605PA-MB147 Intel Core i7-1137...,New,Asus,VivoBook,Intel Core i7,8,512,SSD,,16.0,No,599.99
1466,HP ZBook Firefly 14 G9 Intel Core i7-1260P/16 ...,New,HP,Zbook,Intel Core i7,16,512,SSD,,14.0,No,1778.00
1031,HP 15S-FQ3001NS Intel Celeron N4500/4GB/128GB ...,New,HP,15S,Intel Celeron,4,128,SSD,,15.6,No,395.40
...,...,...,...,...,...,...,...,...,...,...,...,...
1688,ASUS ROG Strix G17 G713RW-LL009 AMD Ryzen 9 69...,Refurbished,Asus,ROG,AMD Ryzen 9,32,1000,SSD,RTX 3070,17.3,No,2799.00
1235,ASUS P1511CEA-EJ1795X Intel Core i7-1165G7/8GB...,New,Asus,P1511,Intel Core i7,8,512,SSD,,15.6,No,840.24
189,Portátil Alurin Flex Advance Intel Core I5-115...,Refurbished,Alurin,Flex Advance,Intel Core i5,8,0,,,14.0,No,368.70
388,ASUS TUF Gaming F17 FX707VV4-HX025 Intel Core ...,New,Asus,TUF,Intel Core i9,32,1000,SSD,RTX 4060,17.3,No,2099.00


In [19]:
len(df_train)

864

In [21]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [24]:
df.isna().sum()

laptop             0
status             0
brand              0
model              0
cpu                0
ram                0
storage            0
storage_type      42
gpu             1371
screen             4
touch              0
final_price        0
dtype: int64

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2160 entries, 0 to 2159
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   laptop        2160 non-null   object 
 1   status        2160 non-null   object 
 2   brand         2160 non-null   object 
 3   model         2160 non-null   object 
 4   cpu           2160 non-null   object 
 5   ram           2160 non-null   int64  
 6   storage       2160 non-null   int64  
 7   storage_type  2118 non-null   object 
 8   gpu           789 non-null    object 
 9   screen        2156 non-null   float64
 10  touch         2160 non-null   object 
 11  final_price   2160 non-null   float64
dtypes: float64(2), int64(2), object(8)
memory usage: 202.6+ KB


In [37]:
df['model'].nunique(), df['cpu'].nunique(), df['touch'].nunique()

(121, 28, 2)

In [35]:
df['laptop'].nunique(), df['status'].nunique(), df['brand'].nunique()

(2160, 2, 27)

In [26]:
df.storage_type.head()

0    SSD
1    SSD
2    SSD
3    SSD
4    SSD
Name: storage_type, dtype: object

In [27]:
df.columns

Index(['laptop', 'status', 'brand', 'model', 'cpu', 'ram', 'storage',
       'storage_type', 'gpu', 'screen', 'touch', 'final_price'],
      dtype='object')

In [29]:
df_train.columns

Index(['laptop', 'status', 'brand', 'model', 'cpu', 'ram', 'storage',
       'storage_type', 'gpu', 'screen', 'touch', 'final_price'],
      dtype='object')

In [31]:
df_train['screen'].fillna(0, inplace=True)
df_train['gpu'].fillna(0, inplace=True)
df_train['storage_type'].fillna(0, inplace=True)

# Train linear regression model
model_0 = LinearRegression()
model_0.fit(df_train.drop('final_price',axis=1),df_train['final_price'])

# Predict and evaluate RMSE
y_pred_0 = model_0.predict(df_test.drop('final_price',axis=1))
rmse_0 = np.sqrt(mean_squared_error(y_valid_0, y_pred_0))


mean_value = train_fill_mean['column'].mean()

train_fill_mean['column'].fillna(mean_value, inplace=True)
valid_fill_mean['column'].fillna(mean_value, inplace=True)

# Features and target split
X_train_mean = train_fill_mean.drop('column', axis=1)
y_train_mean = train_fill_mean['column']
X_valid_mean = valid_fill_mean.drop('column', axis=1)
y_valid_mean = valid_fill_mean['column']

# Train linear regression model
model_mean = LinearRegression()
model_mean.fit(X_train_mean, y_train_mean)

# Predict and evaluate RMSE
y_pred_mean = model_mean.predict(X_valid_mean)
rmse_mean = np.sqrt(mean_squared_error(y_valid_mean, y_pred_mean))

# Round RMSE values to 2 decimal places
rmse_0 = round(rmse_0, 2)
rmse_mean = round(rmse_mean, 2)

# Compare the RMSE values
print(f"RMSE when filling with 0: {rmse_0}")
print(f"RMSE when filling with mean: {rmse_mean}")

# Determine which option gives better RMSE
if rmse_0 < rmse_mean:
    print("Filling with 0 gives a better RMSE.")
else:
    print("Filling with mean gives a better RMSE.")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['screen'].fillna(0, inplace=True)


ValueError: could not convert string to float: 'Acer Predator Helios 300 PH315-55-7174 Intel Core i7-12700H/16GB/512GB SSD/RTX 3060/15.6"'