This notebook builds a neural network to predict the price of a laptop. A simple linear regression model is also trained as a baseline, and the accuracy of the two is compared later.

In [10]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures


In [11]:
# Read the cleaned laptop dataset, then show its column names and dtypes.
df = pd.read_csv('laptop_clean_dataset.csv')
print(df.columns, df.dtypes, sep='\n')

Index(['Company', 'TypeName', 'Inches', 'Weight', 'screen_resolution',
       'display_type', 'processor_speed', 'processor_brand', 'processor_type',
       'ssd_storage', 'hdd_storage', 'flash_storage', 'ram', 'os', 'Price'],
      dtype='object')
Company               object
TypeName              object
Inches               float64
Weight                object
screen_resolution     object
display_type          object
processor_speed      float64
processor_brand       object
processor_type        object
ssd_storage            int64
hdd_storage           object
flash_storage          int64
ram                    int64
os                    object
Price                float64
dtype: object


`Weight` needs to be converted to numeric data, and the categorical features need to be one-hot encoded.

In [12]:
# Parse Weight as a float column; entries that cannot be parsed become NaN
# because of errors='coerce'.
df['Weight'] = pd.to_numeric(df['Weight'], errors='coerce').astype(float)

In [13]:
# Fill the remaining missing weights with 1.18, a value the author looked up on Google.
df['Weight'] = df['Weight'].fillna(value=1.18)

The `hdd_storage` column also needs to be converted to a numeric value.

In [14]:
def correct_values(val):
    """Convert an HDD-capacity entry into a float number of gigabytes.

    Accepts plain numbers (``500``, ``"500"``) as well as strings carrying a
    ``GB`` or ``TB`` unit in any letter case, with optional spaces
    (``"500GB"``, ``" 1 tb "``). Terabyte values are multiplied by 1024;
    gigabyte and unit-less values are returned unscaled.

    Parameters
    ----------
    val : Any
        Raw cell value; it is converted with ``str()`` before parsing.

    Returns
    -------
    float
        The capacity in gigabytes.

    Raises
    ------
    ValueError
        If what remains after removing the unit is not a valid float literal.
    """
    text = str(val).upper().strip().replace(' ', '')

    if "TB" in text:
        return float(text.replace('TB', '')) * 1024
    if "GB" in text:
        # The TB branch scales by 1024, so the column's base unit is GB and a
        # GB-suffixed value needs no scaling.
        return float(text.replace('GB', ''))
    return float(text)

# Replace the raw HDD entries with their numeric equivalents, then show the
# converted column followed by the frame's dtypes.
df['hdd_storage'] = df['hdd_storage'].apply(correct_values)
print(df['hdd_storage'], df.dtypes, sep='\n')

0          0.0
1          0.0
2          0.0
3          0.0
4          0.0
         ...  
1238     500.0
1239       0.0
1240       0.0
1241       0.0
1242    1024.0
Name: hdd_storage, Length: 1243, dtype: float64
Company               object
TypeName              object
Inches               float64
Weight               float64
screen_resolution     object
display_type          object
processor_speed      float64
processor_brand       object
processor_type        object
ssd_storage            int64
hdd_storage          float64
flash_storage          int64
ram                    int64
os                    object
Price                float64
dtype: object


Several of the columns are categorical, each with multiple categories, so one-hot encoding them seems helpful here.

In [15]:

# Columns holding category labels; these are one-hot encoded below.
categorical_cols = [
    'Company', 'TypeName', 'screen_resolution', 'display_type',
    'processor_brand', 'processor_type', 'os',
]
# Numeric columns of the frame.
# NOTE(review): this list includes the target 'Price' and is not used in this
# cell — confirm before treating it as a feature list.
numeric_cols = [
    'Inches', 'Weight', 'processor_speed', 'ssd_storage',
    'hdd_storage', 'flash_storage', 'ram', 'Price',
]

# Target vector, and the feature matrix: categoricals expanded into integer
# 0/1 indicator columns, with the target column removed.
Y = df['Price']
X = pd.get_dummies(df, columns=categorical_cols, dtype=int).drop(columns=['Price'])
print(X.shape)
print(X)
print(X.shape, ' ', Y.shape)


(1243, 151)
      Inches  Weight  processor_speed  ssd_storage  hdd_storage  \
0       13.3    1.37              2.3          128          0.0   
1       13.3    1.34              1.8            0          0.0   
2       15.6    1.86              2.5          256          0.0   
3       15.4    1.83              2.7          512          0.0   
4       13.3    1.37              3.1          256          0.0   
...      ...     ...              ...          ...          ...   
1238    15.6    2.20              2.5            0        500.0   
1239    14.0    1.80              2.5          128          0.0   
1240    13.3    1.30              2.5          512          0.0   
1241    14.0    1.50              1.6            0          0.0   
1242    15.6    2.19              2.5            0       1024.0   

      flash_storage  ram  Company_Acer  Company_Apple  Company_Asus  ...  \
0                 0    8             0              1             0  ...   
1               128    8       

The data is now ready for use.

Split the data into training and test sets.

In [16]:
# Hold out 15% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=42,
)

In [17]:
import torch

In [19]:
# Width of the input layer: one unit per column of the training feature matrix.
n_features = X_train.shape[1]

# Baseline: scikit-learn linear regression.
model1 = LinearRegression()

# Two hidden layers: n_features -> 30 -> 10 -> 1.
model2 = torch.nn.Sequential(
    torch.nn.Linear(n_features, 30),
    torch.nn.ReLU(),
    torch.nn.Linear(30, 10),
    torch.nn.ReLU(),
    torch.nn.Linear(10, 1)
)

# One hidden layer: n_features -> 20 -> 1.
# Each layer's in_features must equal the previous layer's out_features,
# otherwise the first forward pass raises a shape-mismatch error. The output
# layer therefore takes 20 inputs to match the hidden layer.
# NOTE(review): the hidden width 20 is kept from the first layer; confirm that
# 20 (rather than 30) is the intended size.
model3 = torch.nn.Sequential(
    torch.nn.Linear(n_features, 20),
    torch.nn.ReLU(),
    torch.nn.Linear(20, 1)
)

# Three hidden layers: n_features -> 70 -> 10 -> 5 -> 1.
# The declared output widths (70, 10, 5) are kept and each following layer's
# in_features is aligned to them so the layers chain correctly.
# NOTE(review): confirm these are the intended hidden sizes.
model4 = torch.nn.Sequential(
    torch.nn.Linear(n_features, 70),
    torch.nn.ReLU(),
    torch.nn.Linear(70, 10),
    torch.nn.ReLU(),
    torch.nn.Linear(10, 5),
    torch.nn.ReLU(),
    torch.nn.Linear(5, 1)
)

Function to train a model, with early stopping based on the validation loss.

In [None]:
def train_one(model, train_dl, val_dl, max_epochs=500, lr=1e-3, patience=20):
    """Train ``model`` with Adam on MSE loss, early-stopping on validation MSE.

    Every epoch makes one pass over ``train_dl`` and then evaluates on
    ``val_dl``. The weights from the epoch with the lowest validation MSE are
    remembered and loaded back into ``model`` before returning.

    Parameters
    ----------
    model : torch.nn.Module
        Network whose output has a size-1 second dimension (it is removed
        with ``squeeze(1)``).
    train_dl, val_dl : iterable of (xb, yb)
        Batches of inputs and targets. Targets are reshaped to the shape of
        the prediction, so both ``(batch,)`` and ``(batch, 1)`` targets work
        and never broadcast against the prediction.
    max_epochs : int
        Upper bound on the number of epochs.
    lr : float
        Learning rate passed to Adam.
    patience : int
        Training stops after this many consecutive epochs in which the
        validation MSE did not improve by more than 1e-6.

    Returns
    -------
    numpy.float64
        Square root of the best validation MSE. This is ``inf`` when no epoch
        produced a comparable validation MSE (e.g. ``val_dl`` is empty).
    """
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = torch.nn.MSELoss()
    best_val = float("inf")
    best_state = None
    no_improve = 0

    for _ in range(max_epochs):
        model.train()  # training mode
        for xb, yb in train_dl:
            opt.zero_grad()  # clear gradients from the previous step
            pred = model(xb).squeeze(1)
            loss = loss_fn(pred, yb.reshape(pred.shape))
            loss.backward()  # compute the gradients
            opt.step()  # update the weights

        model.eval()
        sq_err_total = 0.0
        n_total = 0
        with torch.no_grad():  # no gradient tracking during evaluation
            for xb, yb in val_dl:
                pred = model(xb).squeeze(1)
                batch_n = pred.numel()
                # MSELoss returns the batch mean; multiply by the batch size so
                # that batches of different sizes are weighted correctly.
                sq_err_total += loss_fn(pred, yb.reshape(pred.shape)).item() * batch_n
                n_total += batch_n
        # NaN for an empty validation set: it never compares as an improvement.
        val_mse = sq_err_total / n_total if n_total else float("nan")

        if val_mse < best_val - 1e-6:
            best_val = val_mse
            # Snapshot the weights so later epochs cannot overwrite them.
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= patience:
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return np.sqrt(best_val)  # best RMSE


