In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
# Read the laptop price CSV into the working DataFrame.
# The path is relative to the notebook's working directory.
DATA_PATH = "../input/laptop_price_cleaned.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Name of the column the model predicts.
target = "Price_euros"

# Separate the target series from the remaining columns.
# X is only a first cut here: it is rebuilt further down from the scaled and
# encoded features before any model is fitted.
X = df.drop(columns=[target])
y = df[target]

In [4]:
# Partition the feature columns by dtype: category/object vs. 64-bit numeric.
categorical_columns = list(X.select_dtypes(include=["category", "object"]).columns)
continuous_columns = list(X.select_dtypes(include=["float64", "int64"]).columns)

In [5]:
# Report how many distinct values each categorical column holds.
print("Categorical features and their unique values")
for column_name, n_unique in X[categorical_columns].nunique().items():
    print(f"{column_name}:", n_unique)

Categorical features and their unique values
Company: 9
Product: 617
TypeName: 6
Ram: 9
Memory: 39
OpSys: 6
Model: 225
DisplayType: 23
CpuBrand: 2
CpuProduct: 92
PrimaryMemory: 30
PrimaryMemSize: 14
PrimaryMemType: 7
Gpu_Brand: 3
Gpu_Product: 24


In [6]:
# Display the names of the columns currently treated as continuous.
continuous_columns

['Inches',
 'Weight_kg',
 'LogPrice',
 'Inches_Binned',
 'Touchscreen',
 'IPS Panel',
 'RetinaDisplay',
 'ResolutionWidth',
 'ResolutionHeight',
 'ResolutionWidthCat',
 'CpuClockSpeed_GHz',
 'Ram_GB',
 'SecondaryMemory']

#### Dropping Columns

In [7]:
# Columns removed before building the modelling frame:
#   - "Price_euros" is the target; "LogPrice" is dropped alongside it
#     (presumably a log transform of the price -- TODO confirm);
#   - "Product", "Model" and "CpuProduct" are the highest-cardinality text
#     columns in the summary printed above;
#   - "ResolutionWidth" and "Inches" go, while "ResolutionWidthCat" and
#     "Inches_Binned" stay.
# An earlier variant of this list kept "Inches".
# NOTE(review): the original line carried a trailing "Weight Gpu" remark --
# possibly further drop candidates; confirm with the author.
cols = [
    "Product",
    "Model",
    "CpuProduct",
    "LogPrice",
    "Price_euros",
    "ResolutionWidth",
    "Inches",
]

df_ = df.drop(columns=cols)

In [8]:
# Display the columns that remain in the reduced frame.
df_.columns

Index(['Company', 'TypeName', 'Ram', 'Memory', 'OpSys', 'Weight_kg',
       'Inches_Binned', 'DisplayType', 'Touchscreen', 'IPS Panel',
       'RetinaDisplay', 'ResolutionHeight', 'ResolutionWidthCat', 'CpuBrand',
       'CpuClockSpeed_GHz', 'Ram_GB', 'PrimaryMemory', 'SecondaryMemory',
       'PrimaryMemSize', 'PrimaryMemType', 'Gpu_Brand', 'Gpu_Product'],
      dtype='object')

In [9]:
# Recompute the dtype-based column groups on the reduced frame.
categorical_columns = list(df_.select_dtypes(include=["category", "object"]).columns)
continuous_columns = list(df_.select_dtypes(include=["float64", "int64"]).columns)

In [10]:
#categorical_columns = ['Company','TypeName', 'Ram', 'Memory', 'OpSys',
# 'DisplayType','CpuBrand','PrimaryMemory','SecondaryMemory','Gpu_Brand',
# 'Gpu_Product', "Inches_Binned", "ResolutionWidthCat"]

In [11]:
# Hand-picked columns to one-hot encode. This replaces the dtype-based list and
# also contains columns that were listed as numeric earlier in the notebook
# (ResolutionWidthCat, Inches_Binned, SecondaryMemory).
categorical_columns = [
    "TypeName",
    "Ram",
    "OpSys",
    "ResolutionWidthCat",
    "Inches_Binned",
    "CpuBrand",
    "Gpu_Brand",
    "SecondaryMemory",
    "PrimaryMemType",
]
# Commented out of this selection in the original cell:
# PrimaryMemSize, DisplayType, PrimaryMemory, Gpu_Product.

In [12]:
# Display every column of the full, unreduced frame for reference.
df.columns

Index(['Company', 'Product', 'TypeName', 'Inches', 'Ram', 'Memory', 'OpSys',
       'Weight_kg', 'Price_euros', 'LogPrice', 'Model', 'Inches_Binned',
       'DisplayType', 'Touchscreen', 'IPS Panel', 'RetinaDisplay',
       'ResolutionWidth', 'ResolutionHeight', 'ResolutionWidthCat', 'CpuBrand',
       'CpuProduct', 'CpuClockSpeed_GHz', 'Ram_GB', 'PrimaryMemory',
       'SecondaryMemory', 'PrimaryMemSize', 'PrimaryMemType', 'Gpu_Brand',
       'Gpu_Product'],
      dtype='object')

In [13]:
# Row count per GPU brand in the full frame.
df["Gpu_Brand"].value_counts()

Gpu_Brand
Intel     722
Nvidia    400
AMD       180
Name: count, dtype: int64

In [14]:
# Row count per GPU brand in the full frame.
# NOTE(review): this repeats the tally shown by the preceding cell.
df["Gpu_Brand"].value_counts()

Gpu_Brand
Intel     722
Nvidia    400
AMD       180
Name: count, dtype: int64

In [15]:
# Numeric features that get standardised.
# ResolutionHeight is commented out of this selection in the original cell.
continuous_columns = [
    "Weight_kg",
    "CpuClockSpeed_GHz",
    "Ram_GB",
]

In [16]:
# Display-feature flag columns, encoded separately from the multi-level
# categorical columns.
binary_cols = [
    "Touchscreen",
    "IPS Panel",
    "RetinaDisplay",
]

In [17]:
# Preview the raw (unscaled) continuous features.
df_.loc[:, continuous_columns]

Unnamed: 0,Weight_kg,CpuClockSpeed_GHz,Ram_GB
0,1.37,2.3,8
1,1.34,1.8,8
2,1.86,2.5,8
3,1.83,2.7,16
4,1.37,3.1,8
...,...,...,...
1297,1.80,2.5,4
1298,1.30,2.5,16
1299,1.50,1.6,2
1300,2.19,2.5,6


#### Dummy Encoding

In [18]:
# One-hot encoder that returns a dense array and drops the first level of each
# column, so every column contributes (levels - 1) dummy columns.
encoder = OneHotEncoder(sparse_output=False, drop="first")

In [19]:
# Fit the encoder on the selected categorical columns and wrap the dense result
# in a DataFrame labelled with the encoder's generated feature names.
encoded_values = encoder.fit_transform(df_[categorical_columns])
encoded_names = encoder.get_feature_names_out(categorical_columns)
df_encoded = pd.DataFrame(encoded_values, columns=encoded_names)

In [20]:
# Encode the binary flag columns with their own encoder instance.
# Re-fitting the shared `encoder` here would overwrite the categories it learned
# from the categorical columns, leaving it unable to transform (or
# inverse-transform) those columns afterwards.
binary_encoder = OneHotEncoder(drop='first', sparse_output=False)

df_encoded2 = pd.DataFrame(
    binary_encoder.fit_transform(df_[binary_cols]),
    columns=binary_encoder.get_feature_names_out(binary_cols),
)

In [21]:
# Append the binary indicator columns to the categorical dummies.
# Because this cell reassigns `df_encoded` from itself, running it twice used to
# append the binary columns a second time. Dropping any copies already present
# makes the cell safe to re-run; on a fresh run the result is unchanged.
df_encoded = pd.concat(
    [df_encoded.drop(columns=df_encoded2.columns, errors="ignore"), df_encoded2],
    axis=1,
)

#### Scaling

In [22]:
# Standardiser with its default behaviour spelled out: centre each column on
# its mean and scale it to unit variance.
scaler = StandardScaler(with_mean=True, with_std=True)

In [23]:
# Make sure no binary flag column is also standardised: remove each one from
# the continuous list (in place) if it is present.
for flag_col in binary_cols:
    if continuous_columns.count(flag_col):
        continuous_columns.remove(flag_col)

In [24]:
# Standardise the continuous features; the result is a plain NumPy array.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# further down, so test-set statistics leak into the scaled features.
continuous_values = df_[continuous_columns]
df_scaled = scaler.fit_transform(continuous_values)

In [25]:
# Wrap the scaled values in a DataFrame so the feature names are kept.
df_scaled = pd.DataFrame(data=df_scaled, columns=continuous_columns)

#### Merge scaled and encoded features

In [26]:
# Combine the scaled numeric features with the encoded ones, matching rows on
# the index.
# (To model on the scaled numeric features alone, assign X = df_scaled instead.)
X = df_scaled.merge(df_encoded, left_index=True, right_index=True)
X

Unnamed: 0,Weight_kg,CpuClockSpeed_GHz,Ram_GB,TypeName_Gaming,TypeName_Netbook,TypeName_Notebook,TypeName_Ultrabook,TypeName_Workstation,Ram_16GB,Ram_24GB,...,SecondaryMemory_1,PrimaryMemType_Flash Storage +,PrimaryMemType_HDD,PrimaryMemType_HDD +,PrimaryMemType_Hybrid,PrimaryMemType_SSD,PrimaryMemType_SSD +,Touchscreen_1,IPS Panel_1,RetinaDisplay_1
0,-1.006613,0.001972,-0.075850,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
1,-1.051725,-0.985638,-0.075850,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.269791,0.397016,-0.075850,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.314903,0.792060,1.497958,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
4,-1.006613,1.582148,-0.075850,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1297,-0.360015,0.397016,-0.862754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1298,-1.111874,0.397016,1.497958,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1299,-0.811130,-1.380682,-1.256206,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1300,0.226435,0.397016,-0.469302,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Train/test split

In [27]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

#### Modelling

In [28]:
# Fit an ordinary least squares model on the training split and predict the
# held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation metrics on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
absolute_error = np.abs(y_pred - y_test)
mae = np.mean(absolute_error)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)

# Fitted parameters: one coefficient per column of X, in column order.
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

print("R^2 Score:", r2)

Mean Squared Error: 120896.06176082518
Mean Absolute Error: 241.28632575965366
Coefficients: [ 4.34779399e+01  1.14261688e+02  2.62515987e+02 -9.84427822e+01
 -1.19263079e+01 -1.80584286e+02  9.50321084e+01  6.76920344e+02
  3.10378396e+02  4.83185475e+02  1.80103149e+02  5.49357926e+02
  2.11695967e+02 -2.26216807e+02  7.40438188e+01  2.47584249e+02
 -6.76955211e+01 -1.36062092e+02  6.87884903e+01  3.60580365e+02
  4.95062286e+02 -3.52090870e+02 -2.91138615e+02  8.08130286e+01
  1.13380039e+02  2.94723993e+02  6.52049510e+02  4.41751908e+02
  6.34297699e+02  8.68327118e+02  1.25094005e+03  1.30539057e+02
  6.05381804e+02  2.47861550e+02  1.49393224e+02  1.70606344e+02
 -1.32750530e+01  2.84346657e+02  2.06845241e+02  8.49433500e+01
  6.83772382e+01  7.63570991e+01 -9.09494702e-13  7.63715899e+01
 -4.93637953e+01  2.36962681e+01  2.01398775e+02  1.25720894e+02
  3.05179524e+00  1.33388124e+01 -8.11347719e+02]
Intercept: 312.3919739839324
R^2 Score: 0.7597975093740013
