In [1]:
# import relevant libraries
from pathlib import Path

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split as split

In [2]:
# Load the final feature-engineered laptop dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/final/laptops_feature_engineered.csv")

In [3]:
# Preview the first five rows to confirm the file loaded as expected.
df.head(n=5)

Unnamed: 0,Inches,Price_euros,Ram (GB),Weight (kg),HDD (GB),SSD (GB),Hybrid (GB),Flash Storage (GB),Total_Storage (GB),Screen_Width,...,Gpu_Brand_Intel,Gpu_Brand_Nvidia,Cpu_Name_TE,Gpu_Name_TE,PPI,Is_High_RAM,Is_SSD,Cpu_Performance_Class_Num,Gpu_Performance_Class_Num,Gpu_Dedicated
0,13.3,1339.69,8,1.37,0.0,128.0,0.0,0.0,128.0,2560,...,True,False,1391.948333,1764.01125,226.983005,0,1,1,1,0
1,13.3,898.94,8,1.34,0.0,0.0,0.0,128.0,128.0,1440,...,True,False,1391.948333,1022.728,127.67794,0,0,1,0,0
2,15.6,575.0,8,1.86,0.0,256.0,0.0,0.0,256.0,1920,...,True,False,919.318083,1141.089823,141.211998,0,1,1,0,0
3,15.4,2537.45,16,1.83,0.0,512.0,0.0,0.0,512.0,2880,...,False,False,2493.8475,2537.45,220.534624,1,1,2,2,1
4,13.3,1803.6,8,1.37,0.0,256.0,0.0,0.0,256.0,2560,...,True,False,1391.948333,1921.8,226.983005,0,1,2,1,0


In [4]:
# Target (y): the laptop price in euros.
# Features (X): every remaining column of the frame.
# NOTE(review): Cpu_Name_TE / Gpu_Name_TE look like target encodings; if they
# were computed on the full dataset before the train/test split they leak the
# target into X -- confirm against the feature-engineering step.
y = df["Price_euros"]
X = df.drop(columns=["Price_euros"])

In [5]:
# Sanity check: X and y must have the same number of rows.
print(f"{X.shape} {y.shape}")

(1303, 53) (1303,)


In [6]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = split(X, y, random_state=9, test_size=0.25)

In [7]:
# Dimensions of the training features and training target.
print(f"{X_train.shape} {y_train.shape}")

(977, 53) (977,)


In [8]:
# Dimensions of the held-out test features and test target.
print(f"{X_test.shape} {y_test.shape}")

(326, 53) (326,)


In [9]:
# Create the (still unfitted) linear regression estimator.
# fit_intercept=True is scikit-learn's default, spelled out here for clarity.
lin_reg_mod = LinearRegression(fit_intercept=True)

In [10]:
# Fit the estimator on the training split only; the test split stays unseen.
lin_reg_mod.fit(X=X_train, y=y_train)
print("Model training completed.")

Model training completed.


In [11]:
# Show the learned parameters: one coefficient per feature column (in the
# column order of X_train), followed by the intercept term.
print('Model Coefficient: {}'.format(lin_reg_mod.coef_))
print('Model Intercept: {}'.format(lin_reg_mod.intercept_))

Model Coefficient: [-2.55298852e+01  1.93977541e+01  7.14249504e+01 -2.85588336e-01
  2.53100455e-01 -2.75234078e-01  6.05514251e-01  2.97792293e-01
  9.66063426e-02  1.29407653e-01  3.77947793e+00  6.42532718e+01
 -7.92939408e+00  4.69694335e+01 -1.41066463e+00  1.04368078e+02
 -7.40479710e+01 -4.29077011e+00  1.22739388e+02 -3.97159706e+01
  6.96628524e+02  1.00781440e+02  1.36423376e+02  7.86574630e+01
 -6.69262798e+01  5.04854504e+02  2.47198080e+02  2.48979456e+02
  6.23403336e+01  1.28660993e+02 -1.43888254e+02 -1.06409891e+00
 -1.26596997e+02  1.24475422e+02  2.23468193e+02  1.37461930e+02
  2.68559608e+01 -7.92939408e+00 -4.55011249e+01  1.25521628e+02
 -5.10905949e+01  2.84217094e-13  7.10542736e-14  1.18672429e+01
 -1.21046553e+00  3.16426467e-01  4.32949476e-01 -2.07574711e-01
  1.11724240e+02  1.34539223e+01 -7.17741441e+01 -3.70233537e+01
 -1.18672429e+01]
Model Intercept: -239.47383017866264


In [12]:
# Predict on the rows the model was fitted on, then eyeball the first ten
# predictions against the corresponding true prices.
y_train_pred = lin_reg_mod.predict(X_train)
print('Predicted training data: {}'.format(y_train_pred[:10]))
print('\nActual training data: {}'.format(y_train[:10].values))

Predicted training data: [ 197.97355346  625.69955959  652.68080325 1864.25712955 1611.80733617
  790.42436572  959.12161762  285.82890907 1737.27085086  494.77990165]

Actual training data: [ 369.    459.    668.48 1499.   1323.   1060.    609.    349.   2040.
  581.9 ]


In [13]:
# Predict on the held-out rows, then eyeball the first ten predictions
# against the corresponding true prices.
y_test_pred = lin_reg_mod.predict(X_test)
print('Predicted test data: {}'.format(y_test_pred[:10]))
print('\nActual test data: {}'.format(y_test[:10].values))

Predicted test data: [ 720.78225305  944.84154565  910.86585279 1198.73854547  885.91503414
 1562.88357265  973.02582634  608.76640431 1219.21719047  716.16931924]

Actual test data: [ 655.01  794.    955.   1480.    575.   1099.   1126.71  415.   1169.
  849.  ]


In [14]:
import joblib

In [15]:
# Save trained model
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (a fresh checkout may not contain ../models yet).
Path("../models").mkdir(parents=True, exist_ok=True)
joblib.dump(lin_reg_mod, "../models/laptop_price_model.pkl")

['../models/laptop_price_model.pkl']

In [16]:
# Persist the ordered list of feature names the model was trained on, so
# inference code can rebuild its inputs with the same column order.
joblib.dump(value=X_train.columns.to_list(), filename="../models/model_features.pkl")

['../models/model_features.pkl']

In [17]:
# Persist the exact train/test partition as a 4-tuple
# (X_train, X_test, y_train, y_test) for reuse outside this notebook.
joblib.dump(
    value=(X_train, X_test, y_train, y_test),
    filename="../models/train_test_split.pkl",
)

['../models/train_test_split.pkl']