In [1]:
# import numpy and pandas libraries
import numpy as np
import pandas as pd

In [2]:
# Read the encoded laptop dataset into a DataFrame
encoded_csv = "../data/encoded/laptops_encoded.csv"
df = pd.read_csv(encoded_csv)

In [3]:
# Work on an independent copy so `df` keeps the data exactly as loaded,
# then show the first five rows
df_fe = df.copy(deep=True)
df_fe.head(5)

Unnamed: 0,Product,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros,...,OpSys_Clean_MacOS,OpSys_Clean_No OS,OpSys_Clean_Windows,Cpu_Brand_Intel,Cpu_Brand_Samsung,Gpu_Brand_ARM,Gpu_Brand_Intel,Gpu_Brand_Nvidia,Cpu_Name_TE,Gpu_Name_TE
0,MacBook Pro,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69,...,True,False,False,True,False,False,True,False,1391.948333,1764.01125
1,Macbook Air,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94,...,True,False,False,True,False,False,True,False,1391.948333,1022.728
2,250 G6,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0,...,False,True,False,True,False,False,True,False,919.318083,1141.089823
3,MacBook Pro,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45,...,True,False,False,True,False,False,False,False,2493.8475,2537.45
4,MacBook Pro,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6,...,True,False,False,True,False,False,True,False,1391.948333,1921.8


In [4]:
# PPI (pixels per inch): length of the screen diagonal in pixels divided by
# the screen size in inches
diagonal_px = (df_fe["Screen_Width"] ** 2 + df_fe["Screen_Height"] ** 2) ** 0.5
df_fe["PPI"] = diagonal_px / df_fe["Inches"]

In [5]:
# High RAM flag: 1 when the laptop has at least 16 GB of RAM, otherwise 0
has_high_ram = df_fe["Ram (GB)"].ge(16)
df_fe["Is_High_RAM"] = has_high_ram.astype(int)

In [6]:
# SSD flag: 1 when any SSD capacity is recorded for the laptop, otherwise 0
has_ssd = df_fe["SSD (GB)"].gt(0)
df_fe["Is_SSD"] = has_ssd.astype(int)

In [7]:
# Storage type
def storage_type(row):
    """Label a laptop row by the kind(s) of storage it carries.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide "SSD (GB)" and "HDD (GB)"; "Flash Storage (GB)" is read
        only when neither SSD-only, HDD-only nor SSD+HDD applies.
        "Hybrid (GB)" is read when present and treated as 0 otherwise.

    Returns
    -------
    str
        "SSD only", "HDD only", "Hybrid" (SSD together with HDD, or a
        hybrid drive as the only recorded storage), "Flash only" or
        "Unknown".
    """
    ssd_gb = row["SSD (GB)"]
    hdd_gb = row["HDD (GB)"]

    if ssd_gb > 0 and hdd_gb == 0:
        return "SSD only"
    if hdd_gb > 0 and ssd_gb == 0:
        return "HDD only"
    if ssd_gb > 0 and hdd_gb > 0:
        return "Hybrid"
    if row["Flash Storage (GB)"] > 0:
        return "Flash only"
    # Checked last so every row that already received a label keeps it:
    # only rows that would otherwise be "Unknown" can become "Hybrid" here.
    # .get() keeps rows without a "Hybrid (GB)" entry working.
    if row.get("Hybrid (GB)", 0) > 0:
        return "Hybrid"
    return "Unknown"

# Classify every row (axis="columns" hands each row to storage_type)
storage_labels = df_fe.apply(storage_type, axis="columns")
df_fe["Storage_Type"] = storage_labels

In [8]:
# CPU performance class
def cpu_class(row):
    """Bucket a row's CPU into "High-end", "Mid-range" or "Low-end".

    A tier is assigned as soon as either of its thresholds is met: the clock
    speed in "Cpu_Speed_GHz" (inclusive) or the value in "Cpu_Name_TE"
    (exclusive; presumably a target-encoded CPU name - confirm upstream).
    Tiers are tried from highest to lowest; anything matching none of them
    is "Low-end".
    """
    # (label, minimum GHz, TE value that must be exceeded)
    tiers = (
        ("High-end", 3.0, 2000),
        ("Mid-range", 2.0, 1200),
    )
    for label, min_ghz, te_floor in tiers:
        if row["Cpu_Speed_GHz"] >= min_ghz or row["Cpu_Name_TE"] > te_floor:
            return label
    return "Low-end"

# Categorical label for each row
cpu_labels = df_fe.apply(cpu_class, axis="columns")
df_fe["Cpu_Performance_Class"] = cpu_labels

# Ordinal encoding of the label (for ML): Low-end=0, Mid-range=1, High-end=2
cpu_map = {
    label: rank
    for rank, label in enumerate(("Low-end", "Mid-range", "High-end"))
}
df_fe["Cpu_Performance_Class_Num"] = df_fe["Cpu_Performance_Class"].map(cpu_map)

In [9]:
def gpu_class(te_value):
    """Bucket a GPU's encoded value into a performance label.

    Returns "High-end" above 2000, "Mid-range" above 1200 (up to and
    including 2000) and "Low-end" otherwise. Both thresholds are exclusive,
    and a value that compares false against them (such as NaN) is "Low-end".
    """
    if te_value > 2000:
        return "High-end"
    return "Mid-range" if te_value > 1200 else "Low-end"

# Categorical label derived from the Gpu_Name_TE column
gpu_labels = df_fe["Gpu_Name_TE"].apply(gpu_class)
df_fe["Gpu_Performance_Class"] = gpu_labels

# Ordinal encoding of the label (for ML): Low-end=0, Mid-range=1, High-end=2
gpu_map = dict(zip(("Low-end", "Mid-range", "High-end"), range(3)))
df_fe["Gpu_Performance_Class_Num"] = df_fe["Gpu_Performance_Class"].map(gpu_map)

In [10]:
# Dedicated GPU
# 1 when the GPU description contains "Nvidia" or "AMD" (case-sensitive
# substring match), otherwise 0. The vectorised .str.contains call with
# na=False maps a missing GPU description to 0; a per-row `"Nvidia" in x`
# test would raise TypeError on NaN.
# NOTE(review): every AMD GPU is flagged as dedicated, which may include
# integrated AMD graphics - confirm this is the intended definition.
df_fe["Gpu_Dedicated"] = (
    df_fe["Gpu"].str.contains("Nvidia|AMD", regex=True, na=False).astype("int64")
)

In [11]:
# Show the first rows of a selection of the newly engineered columns
preview_cols = [
    "PPI",
    "Is_High_RAM",
    "Is_SSD",
    "Storage_Type",
    "Cpu_Performance_Class",
    "Gpu_Dedicated",
]
df_fe[preview_cols].head()

Unnamed: 0,PPI,Is_High_RAM,Is_SSD,Storage_Type,Cpu_Performance_Class,Gpu_Dedicated
0,226.983005,0,1,SSD only,Mid-range,0
1,127.67794,0,0,Flash only,Mid-range,0
2,141.211998,0,1,SSD only,Mid-range,0
3,220.534624,1,1,SSD only,High-end,1
4,226.983005,0,1,SSD only,High-end,0


In [12]:
# List every column currently in df_fe (loaded columns plus engineered ones)
df_fe.columns

Index(['Product', 'Inches', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu',
       'OpSys', 'Weight', 'Price_euros', 'Ram (GB)', 'Weight (kg)', 'HDD (GB)',
       'SSD (GB)', 'Hybrid (GB)', 'Flash Storage (GB)', 'Total_Storage (GB)',
       'Screen_Width', 'Screen_Height', 'Touchscreen', 'Cpu_Speed_GHz',
       'Company_Apple', 'Company_Asus', 'Company_Chuwi', 'Company_Dell',
       'Company_Fujitsu', 'Company_Google', 'Company_HP', 'Company_Huawei',
       'Company_LG', 'Company_Lenovo', 'Company_MSI', 'Company_Mediacom',
       'Company_Microsoft', 'Company_Razer', 'Company_Samsung',
       'Company_Toshiba', 'Company_Vero', 'Company_Xiaomi', 'TypeName_Gaming',
       'TypeName_Netbook', 'TypeName_Notebook', 'TypeName_Ultrabook',
       'TypeName_Workstation', 'OpSys_Clean_Chrome OS', 'OpSys_Clean_Linux',
       'OpSys_Clean_MacOS', 'OpSys_Clean_No OS', 'OpSys_Clean_Windows',
       'Cpu_Brand_Intel', 'Cpu_Brand_Samsung', 'Gpu_Brand_ARM',
       'Gpu_Brand_Intel', 'Gpu_Brand_Nvidi

In [13]:
# Separate frame for modeling; df_fe itself stays untouched by what follows
df_model = df_fe.copy(deep=True)

In [14]:
# Dropping redundant columns not needed for modeling
drop_cols = [
    "Product", "Cpu", "Gpu", "OpSys", "ScreenResolution",  # raw text features
    "Ram", "Memory", "Weight",  # original text/duplicate features
    # String labels: the two performance classes have *_Num counterparts;
    # Storage_Type has no numeric twin (Is_SSD only partly captures it).
    "Storage_Type", "Cpu_Performance_Class", "Gpu_Performance_Class",
]

# Drop from df_fe rather than from df_model itself so this cell can be
# re-executed: dropping from its own previous result raises KeyError on a
# second run because the columns are already gone. DataFrame.drop returns a
# new frame, so df_fe is left unchanged.
df_model = df_fe.drop(columns=drop_cols)

In [15]:
# First five rows of the modeling frame after the drop
df_model.head(5)

Unnamed: 0,Inches,Price_euros,Ram (GB),Weight (kg),HDD (GB),SSD (GB),Hybrid (GB),Flash Storage (GB),Total_Storage (GB),Screen_Width,...,Gpu_Brand_Intel,Gpu_Brand_Nvidia,Cpu_Name_TE,Gpu_Name_TE,PPI,Is_High_RAM,Is_SSD,Cpu_Performance_Class_Num,Gpu_Performance_Class_Num,Gpu_Dedicated
0,13.3,1339.69,8,1.37,0.0,128.0,0.0,0.0,128.0,2560,...,True,False,1391.948333,1764.01125,226.983005,0,1,1,1,0
1,13.3,898.94,8,1.34,0.0,0.0,0.0,128.0,128.0,1440,...,True,False,1391.948333,1022.728,127.67794,0,0,1,0,0
2,15.6,575.0,8,1.86,0.0,256.0,0.0,0.0,256.0,1920,...,True,False,919.318083,1141.089823,141.211998,0,1,1,0,0
3,15.4,2537.45,16,1.83,0.0,512.0,0.0,0.0,512.0,2880,...,False,False,2493.8475,2537.45,220.534624,1,1,2,2,1
4,13.3,1803.6,8,1.37,0.0,256.0,0.0,0.0,256.0,2560,...,True,False,1391.948333,1921.8,226.983005,0,1,2,1,0


In [16]:
# Write the modeling frame to CSV, leaving out the DataFrame index
output_csv = "../data/final/laptops_feature_engineered.csv"
df_model.to_csv(output_csv, index=False)