In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv("input/laptop_price_cleaned.csv")

In [3]:
target = "Price_euros"

y = df[target]
X = df.drop(target, axis=1)

In [4]:
categorical_columns = X.select_dtypes(include=['category', 'object']).columns.tolist()
continuous_columns = X.select_dtypes(include=['float64', 'int64']).columns.tolist()

In [5]:
print("Categorical features and their unique values")
for c in categorical_columns:
    print(f"{c}:", X[c].nunique())    

Categorical features and their unique values
Company: 9
Product: 618
TypeName: 6
Ram: 9
Memory: 39
Gpu: 110
OpSys: 9
Weight: 179
Model: 225
DisplayType: 23
CpuBrand: 3
CpuProduct: 93
PrimaryMemory: 30
SecondaryMemory: 7


In [6]:
continuous_columns

['Inches',
 'LogPrice',
 'Inches_Binned',
 'Touchscreen',
 'IPS Panel',
 'RetinaDisplay',
 'ResolutionWidth',
 'ResolutionHeight',
 'ResolutionWidthCat',
 'CpuClockSpeed_GHz',
 'Ram_GB']

#### Dropping Columns

In [7]:
cols = ["Product", "Weight", "Model", "Gpu", "CpuProduct", "LogPrice", "Price_euros"]

df_ = df.drop(cols, axis=1)

In [8]:
categorical_columns = df_.select_dtypes(include=['category', 'object']).columns.tolist()
continuous_columns = df_.select_dtypes(include=['float64', 'int64']).columns.tolist()

#### Dummy Encoding

In [9]:
df_encoded = pd.get_dummies(df_, columns=categorical_columns, drop_first=True)

#### Scaling

In [10]:
scaler = StandardScaler()

In [11]:
df_scaled = scaler.fit_transform(df_[continuous_columns])

In [12]:
df_scaled = pd.DataFrame(df_scaled, columns=continuous_columns)

#### Merge dfs

In [13]:
X = pd.merge(df_scaled, df_encoded, left_index=True, right_index=True)
X

Unnamed: 0,Inches_x,Inches_Binned_x,Touchscreen_x,IPS Panel_x,RetinaDisplay_x,ResolutionWidth_x,ResolutionHeight_x,ResolutionWidthCat_x,CpuClockSpeed_GHz_x,Ram_GB_x,...,PrimaryMemory_64GB Flash Storage,PrimaryMemory_64GB Flash Storage +,PrimaryMemory_64GB SSD,PrimaryMemory_8GB SSD,SecondaryMemory_1.0TB Hybrid,SecondaryMemory_1TB HDD,SecondaryMemory_256GB SSD,SecondaryMemory_2TB HDD,SecondaryMemory_500GB HDD,SecondaryMemory_512GB SSD
0,-1.204407,-1.27029,-0.415713,1.603079,8.697532,1.345362,1.860586,1.345362,0.002426,-0.075195,...,False,False,False,False,False,False,False,False,False,False
1,-1.204407,-1.27029,-0.415713,-0.623799,-0.114975,-0.919776,-0.600648,-0.919776,-0.985431,-0.075195,...,False,False,False,False,False,False,False,False,False,False
2,0.408772,0.45207,-0.415713,-0.623799,-0.114975,0.050997,0.032241,0.050997,0.397569,-0.075195,...,False,False,False,False,False,False,False,False,False,False
3,0.268495,0.45207,-0.415713,1.603079,8.697532,1.992544,2.563795,1.992544,0.792712,1.498767,...,False,False,False,False,False,False,False,False,False,False
4,-1.204407,-1.27029,-0.415713,1.603079,8.697532,1.345362,1.860586,1.345362,1.582997,-0.075195,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,-0.713439,-0.40911,2.405506,1.603079,-0.114975,0.050997,0.032241,0.050997,0.397569,-0.862176,...,False,False,False,False,False,False,False,False,False,False
1299,-1.204407,-1.27029,2.405506,1.603079,-0.114975,2.639726,2.563795,2.639726,0.397569,1.498767,...,False,False,False,False,False,False,False,False,False,False
1300,-0.713439,-0.40911,-0.415713,-0.623799,-0.114975,-1.069437,-1.064766,-1.069437,-1.380574,-1.255667,...,True,False,False,False,False,False,False,False,False,False
1301,0.408772,0.45207,-0.415713,-0.623799,-0.114975,-1.069437,-1.064766,-1.069437,0.397569,-0.468686,...,False,False,False,False,False,False,False,False,False,False


#### Test train split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Modelling

In [15]:
model = LinearRegression()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

r2 = r2_score(y_test, y_pred)
print("R^2 Score:", r2)

Mean Squared Error: 115901.1140969756
Coefficients: [ 5.72010743e+01 -6.99942964e+01 -3.31906570e+01  1.96628189e+01
  2.59179663e+01  5.95240433e-04 -2.33709080e-03  5.95240396e-04
  8.24395696e+01  8.05863794e+00  8.15548112e+01 -8.12772184e+01
 -1.17646518e+01  8.82976644e+00  2.94104352e+00  2.94317318e-01
 -6.64692486e-01  2.94317318e-01  4.17264696e+01  4.09597451e+01
  4.23287916e+00  7.45553447e+01  1.34952429e+02  1.68532915e+02
  1.12749215e+02  1.14893781e+02  3.79901029e+02  2.34081541e+02
  5.58844387e+00 -1.29965231e+02 -2.20800370e+02  1.28133466e+01
  5.34022984e+02  5.72221630e+01  1.90210095e+02  5.00462192e+01
  4.46892780e+02  6.24863418e+01 -2.04449894e+02 -8.08084881e+01
  6.53522914e+01  1.53454776e+01  1.69192404e+02 -5.12856880e+01
  8.27065085e+01 -4.60788847e+01  1.63904680e+02 -1.37619611e+02
 -1.02146110e+02  1.07582399e+02  5.67295166e+01 -7.36807272e+01
  3.92239592e+02  3.63579143e+02  9.07209721e+02  3.44644522e+02
  1.75209371e+02  4.52945291e+02 -1.00