# Import Libraries & dataset

In [34]:
import pandas as pd
import numpy as np
import re

In [35]:
data  = pd.read_csv('../Data/laptop_price - dataset.csv')

# Basic Data Exploration

In [36]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1275 entries, 0 to 1274
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              1275 non-null   object 
 1   Product              1275 non-null   object 
 2   TypeName             1275 non-null   object 
 3   Inches               1275 non-null   float64
 4   ScreenResolution     1275 non-null   object 
 5   CPU_Company          1275 non-null   object 
 6   CPU_Type             1275 non-null   object 
 7   CPU_Frequency (GHz)  1275 non-null   float64
 8   RAM (GB)             1275 non-null   int64  
 9   Memory               1275 non-null   object 
 10  GPU_Company          1275 non-null   object 
 11  GPU_Type             1275 non-null   object 
 12  OpSys                1275 non-null   object 
 13  Weight (kg)          1275 non-null   float64
 14  Price (Euro)         1275 non-null   float64
dtypes: float64(4), int64(1), object(10)
me

In [37]:
data.describe()

Unnamed: 0,Inches,CPU_Frequency (GHz),RAM (GB),Weight (kg),Price (Euro)
count,1275.0,1275.0,1275.0,1275.0,1275.0
mean,15.022902,2.30298,8.440784,2.040525,1134.969059
std,1.42947,0.503846,5.097809,0.669196,700.752504
min,10.1,0.9,2.0,0.69,174.0
25%,14.0,2.0,4.0,1.5,609.0
50%,15.6,2.5,8.0,2.04,989.0
75%,15.6,2.7,8.0,2.31,1496.5
max,18.4,3.6,64.0,4.7,6099.0


In [None]:
for col in data.select_dtypes(include=['object']).columns:
  print(f"Unique values for column '{col}':")
  print(data[col].unique())
  print("-" * 20)

In [39]:
data.duplicated().any()

np.False_

In [40]:
data.isnull().sum() / len(data) *100

Company                0.0
Product                0.0
TypeName               0.0
Inches                 0.0
ScreenResolution       0.0
CPU_Company            0.0
CPU_Type               0.0
CPU_Frequency (GHz)    0.0
RAM (GB)               0.0
Memory                 0.0
GPU_Company            0.0
GPU_Type               0.0
OpSys                  0.0
Weight (kg)            0.0
Price (Euro)           0.0
dtype: float64

# Feature Engneering

## Converting Price

In [41]:
exchange_rate = 3.97  # 1 EUR to SAR

data['Price (SAR)'] = data['Price (Euro)'] * exchange_rate

data = data.drop('Price (Euro)', axis=1)

In [42]:
data.sample()

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,CPU_Company,CPU_Type,CPU_Frequency (GHz),RAM (GB),Memory,GPU_Company,GPU_Type,OpSys,Weight (kg),Price (SAR)
272,Dell,Inspiron 3576,Notebook,15.6,Full HD 1920x1080,Intel,Core i7 8550U,1.8,8,256GB SSD,AMD,Radeon 520,Linux,2.14,2918.2279


## Splitting Memory column

In [43]:
def memory_split(memory):
    try:
        if '+' in memory:
            split_mem = memory.split('+')
            mem1 = split_mem[0].strip()
            mem2 = split_mem[1].strip()
            mem_type = mem1.split(' ')[1] + '+' + mem2.split(' ')[1]
            mem1_capacity = int(re.findall(r'\d+', mem1)[0])
            mem2_capacity = int(re.findall(r'\d+', mem2)[0])
            if 'GB' in mem1 and 'GB' in mem2:
              total_capacity = mem1_capacity + mem2_capacity
            elif 'TB' in mem1 and 'GB' in mem2:
              total_capacity = mem1_capacity * 1024 + mem2_capacity
            elif 'GB' in mem1 and 'TB' in mem2:
              total_capacity = mem1_capacity + mem2_capacity * 1024
            else:
              total_capacity = mem1_capacity * 1024 + mem2_capacity * 1024
            return total_capacity, mem_type
        else:
            capacity = re.findall(r'\d+', memory)[0]
            mem_type = memory.split(' ')[1]
            if 'TB' in memory:
              capacity = int(capacity) * 1024
            return int(capacity), mem_type

    except:
        return np.nan, np.nan


data['Memory Capacity'], data['Memory Type'] = zip(*data['Memory'].apply(memory_split))

In [44]:
memory_cols = ['Memory Capacity', 'Memory Type']
memory_index = data.columns.get_loc('Memory')
for col in memory_cols:
    data.insert(memory_index + 1, col, data.pop(col))
data.drop(['Memory'],axis= 1,inplace=True)

In [45]:
data.sample(10)

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,CPU_Company,CPU_Type,CPU_Frequency (GHz),RAM (GB),Memory Type,Memory Capacity,GPU_Company,GPU_Type,OpSys,Weight (kg),Price (SAR)
1263,Acer,Aspire ES1-531,Notebook,15.6,1366x768,Intel,Celeron Dual Core N3060,1.6,4,HDD,500,Intel,HD Graphics 400,Linux,2.4,1147.33
649,Lenovo,Thinkpad X270,Ultrabook,12.5,IPS Panel Full HD 1920x1080,Intel,Core i7 7500U,2.7,8,SSD,256,Intel,HD Graphics 620,Windows 10,1.36,6288.48
1086,HP,ZBook 15u,Workstation,15.6,Full HD 1920x1080,Intel,Core i7 6500U,2.5,8,SSD,256,AMD,FirePro W4190M,Windows 7,1.9,5935.15
123,HP,Probook 470,Notebook,17.3,Full HD 1920x1080,Intel,Core i7 8550U,1.8,16,SSD,512,Nvidia,GeForce 930MX,Windows 10,2.5,5045.87
548,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel,Core i3 6006U,2.0,4,HDD,500,Intel,HD Graphics 520,Windows 10,1.86,1576.09
211,Asus,ROG GL703VD-GC028T,Gaming,17.3,Full HD 1920x1080,Intel,Core i7 7700HQ,2.8,16,SSD+HDD,1280,Nvidia,GeForce GTX 1050,Windows 10,2.9,5585.79
866,Toshiba,Portégé Z30-C-188,Ultrabook,13.3,1366x768,Intel,Core i5 6200U,2.3,8,SSD,256,Intel,HD Graphics 520,Windows 10,1.2,4347.15
990,HP,EliteBook Folio,Netbook,12.5,Full HD 1920x1080,Intel,Core M 6Y75,1.2,8,SSD,512,Intel,HD Graphics 515,Windows 10,0.97,7574.76
348,Asus,VivoBook Flip,2 in 1 Convertible,11.6,Touchscreen 1366x768,Intel,Celeron Dual Core N3350,1.1,4,Flash,32,Intel,HD Graphics 500,Windows 10,1.5,1488.75
158,Dell,Inspiron 5579,2 in 1 Convertible,15.6,Full HD / Touchscreen 1920x1080,Intel,Core i7 8550U,1.8,8,SSD,256,Intel,UHD Graphics 620,Windows 10,1.56,4164.53


In [46]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1275 entries, 0 to 1274
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              1275 non-null   object 
 1   Product              1275 non-null   object 
 2   TypeName             1275 non-null   object 
 3   Inches               1275 non-null   float64
 4   ScreenResolution     1275 non-null   object 
 5   CPU_Company          1275 non-null   object 
 6   CPU_Type             1275 non-null   object 
 7   CPU_Frequency (GHz)  1275 non-null   float64
 8   RAM (GB)             1275 non-null   int64  
 9   Memory Type          1275 non-null   object 
 10  Memory Capacity      1275 non-null   int64  
 11  GPU_Company          1275 non-null   object 
 12  GPU_Type             1275 non-null   object 
 13  OpSys                1275 non-null   object 
 14  Weight (kg)          1275 non-null   float64
 15  Price (SAR)          1275 non-null   f

In [47]:
data['Memory Type'].unique()

array(['SSD', 'Flash', 'HDD', 'SSD+HDD', 'SSD+SSD', 'Hybrid', 'Flash+HDD',
       'HDD+HDD', 'SSD+Hybrid'], dtype=object)

## ScreenResolution splitting

In [48]:
data[['Resolution_Width', 'Resolution_Height']] = data['ScreenResolution'].str.extract(r'(\d{3,4})x(\d{3,4})')


data['Resolution_Width'] = data['Resolution_Width'].astype(int)
data['Resolution_Height'] = data['Resolution_Height'].astype(int)


data['Contains_HD'] = data['ScreenResolution'].str.contains('HD', case=False).astype(int)
data['Contains_IPS'] = data['ScreenResolution'].str.contains('IPS', case=False).astype(int)
data['Contains_Touchscreen'] = data['ScreenResolution'].str.contains('Touchscreen', case=False).astype(int)


data['Contains_4K'] = data['ScreenResolution'].str.contains('4K', case=False).astype(int)
data['Contains_Quad_HD_plus'] = data['ScreenResolution'].str.contains('Quad HD+', case=False).astype(int)

In [49]:
data.sample(10)

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,CPU_Company,CPU_Type,CPU_Frequency (GHz),RAM (GB),Memory Type,Memory Capacity,GPU_Company,GPU_Type,OpSys,Weight (kg),Price (SAR),Resolution_Width,Resolution_Height,Contains_HD,Contains_IPS,Contains_Touchscreen,Contains_4K,Contains_Quad_HD_plus
1055,HP,ProBook 650,Notebook,15.6,1366x768,Intel,Core i3 6100U,2.3,4,HDD,500,Intel,HD Graphics 520,Windows 10,2.31,2799.4455,1366,768,0,0,0,0,0
1065,Dell,XPS 13,Ultrabook,13.3,Full HD 1920x1080,Intel,Core i7 7500U,2.7,8,SSD,256,Intel,HD Graphics 620,Windows 10,1.29,5788.26,1920,1080,1,0,0,0,0
913,HP,ProBook 450,Notebook,15.6,Full HD 1920x1080,Intel,Core i7 7500U,2.7,8,SSD,256,Nvidia,GeForce 930MX,Windows 10,2.04,4045.3903,1920,1080,1,0,0,0,0
339,Dell,Inspiron 7570,Notebook,15.6,Full HD 1920x1080,Intel,Core i5 8250U,1.6,8,SSD,256,Nvidia,GeForce 940MX,Windows 10,2.0,4536.7175,1920,1080,1,0,0,0,0
178,Lenovo,V310-15IKB (i5-7200U/8GB/1TB,Notebook,15.6,Full HD 1920x1080,Intel,Core i5 7200U,2.5,8,SSD+HDD,1152,AMD,R17M-M1-70,Windows 10,1.9,3227.61,1920,1080,1,0,0,0,0
835,Dell,Alienware 15,Gaming,15.6,Full HD 1920x1080,Intel,Core i7 7700HQ,2.8,16,SSD+HDD,1280,Nvidia,GeForce GTX 1070,Windows 10,3.21,11015.2811,1920,1080,1,0,0,0,0
1234,Apple,MacBook Air,Ultrabook,11.6,1366x768,Intel,Core i5,1.6,4,Flash,256,Intel,HD Graphics 6000,Mac OS X,1.08,3807.23,1366,768,0,0,0,0,0
389,Lenovo,ThinkPad X1,2 in 1 Convertible,14.0,Touchscreen 2560x1440,Intel,Core i7 7500U,2.7,16,SSD,512,Intel,HD Graphics 620,Windows 10,1.42,9960.73,2560,1440,0,0,1,0,0
193,Lenovo,IdeaPad 320-17IKBR,Notebook,17.3,1600x900,Intel,Core i5 8250U,1.6,8,SSD,256,Nvidia,GeForce MX150,No OS,2.8,2771.06,1600,900,0,0,0,0,0
103,HP,Envy 13-ad009n,Ultrabook,13.3,IPS Panel Full HD 1920x1080,Intel,Core i7 7500U,2.7,8,SSD,256,Nvidia,GeForce MX150,Windows 10,1.38,4442.43,1920,1080,1,1,0,0,0


## CPU

In [50]:
import re

# Define functions for more precise extraction
def extract_family(cpu_type, company):
    if company == 'Intel':
        return re.search(r'^(Core|Xeon|Pentium|Celeron|Atom|Core M)', cpu_type, re.IGNORECASE).group(1) if re.search(r'^(Core|Xeon|Pentium|Celeron|Atom|Core M)', cpu_type, re.IGNORECASE) else 'Unknown'
    elif company == 'AMD':
        return re.search(r'^(Ryzen|A[0-9]|FX|Athlon|E[0-9]|Pro|Sempron)', cpu_type, re.IGNORECASE).group(1) if re.search(r'^(Ryzen|A[0-9]|FX|Athlon|E[0-9]|Pro|Sempron)', cpu_type, re.IGNORECASE) else 'Unknown'
    elif company == 'Samsung':
        return re.search(r'^(Exynos)', cpu_type, re.IGNORECASE).group(1) if re.search(r'^(Exynos)', cpu_type, re.IGNORECASE) else 'Unknown'
    return 'Unknown'

def extract_generation(cpu_type, company):
    if company == 'Intel':
        match = re.search(r'(\d{4,5}[A-Za-z]*)$', cpu_type)
        return match.group(1)[:1] if match else 'Unknown'  # First digit for generation
    elif company == 'AMD':
        match = re.search(r'(\d{4,5})$', cpu_type)
        return match.group(1)[:1] if match else 'Unknown'
    elif company == 'Samsung':
        match = re.search(r'Exynos (\d+)', cpu_type, re.IGNORECASE)
        return match.group(1)[:1] if match else 'Unknown'
    return 'Unknown'

def extract_series(cpu_type, company):
    if company == 'Intel':
        # Extract Intel series like i3, i5, i7, or similar
        match = re.search(r'(i3|i5|i7|i9|m3|m5)', cpu_type, re.IGNORECASE)
        return match.group(1) if match else 'Other'
    elif company == 'AMD':
        # Extract the numeric series number after the family name
        match = re.search(r'(?:A[0-9]-Series|Ryzen \d|FX|Athlon|E[0-9])[^\d]*(\d+)', cpu_type, re.IGNORECASE)
        return match.group(1) if match else 'Other'
    elif company == 'Samsung':
        # Extract the Exynos series number
        match = re.search(r'Exynos (\d+)', cpu_type, re.IGNORECASE)
        return match.group(1) if match else 'Other'
    return 'Other'


# Apply the refined functions
data['CPU_Family'] = data.apply(lambda row: extract_family(row['CPU_Type'], row['CPU_Company']), axis=1)
data['CPU_Generation'] = data.apply(lambda row: extract_generation(row['CPU_Type'], row['CPU_Company']), axis=1)
data['CPU_Series'] = data.apply(lambda row: extract_series(row['CPU_Type'], row['CPU_Company']), axis=1)


In [51]:
data['CPU_Series'] = data.groupby(['CPU_Family', 'CPU_Generation'])['CPU_Series'].transform(
    lambda x: x.replace('Other', x.mode()[0] if not x.mode().empty else 'Unknown')
)

In [52]:
data['CPU_Generation'] = data.groupby(['CPU_Family', 'CPU_Series'])['CPU_Generation'].transform(
    lambda x: x.replace('Unknown', x.mode()[0] if not x.mode().empty else 'Unknown')
)

In [53]:
data['CPU_Family'] = data.groupby(['CPU_Generation', 'CPU_Series'])['CPU_Family'].transform(
    lambda x: x.replace('Unknown', x.mode()[0] if not x.mode().empty else 'Unknown')
)

In [54]:
data=data.drop(['CPU_Type', 'CPU_Company'],axis=1)
data.head()

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,CPU_Frequency (GHz),RAM (GB),Memory Type,Memory Capacity,GPU_Company,GPU_Type,OpSys,Weight (kg),Price (SAR),Resolution_Width,Resolution_Height,Contains_HD,Contains_IPS,Contains_Touchscreen,Contains_4K,Contains_Quad_HD_plus,CPU_Family,CPU_Generation,CPU_Series
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,2.3,8,SSD,128,Intel,Iris Plus Graphics 640,macOS,1.37,5318.5693,2560,1600,0,1,0,0,0,Core,7,i5
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,1.8,8,Flash,128,Intel,HD Graphics 6000,macOS,1.34,3568.7918,1440,900,0,0,0,0,0,Core,7,i5
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,2.5,8,SSD,256,Intel,HD Graphics 620,No OS,1.86,2282.75,1920,1080,1,0,0,0,0,Core,7,i5
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,2.7,16,SSD,512,AMD,Radeon Pro 455,macOS,1.83,10073.6765,2880,1800,0,1,0,0,0,Core,7,i7
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,3.1,8,SSD,256,Intel,Iris Plus Graphics 650,macOS,1.37,7160.292,2560,1600,0,1,0,0,0,Core,7,i5


## GPU 

In [55]:
data['GPU_Family'] = data['GPU_Type'].apply(lambda x: x.split(' ')[0])
data['GPU_Series'] = data['GPU_Type'].apply(lambda x: x.split(' ')[-1])

In [56]:
data['GPU_Series_Clean'] = data['GPU_Series'].str.extract('(\d+)').fillna(data['GPU_Series'])

  data['GPU_Series_Clean'] = data['GPU_Series'].str.extract('(\d+)').fillna(data['GPU_Series'])


In [57]:
data['Series_Type'] = data['GPU_Series'].apply(lambda x: 'Numeric' if x.isdigit() else 'Alphanumeric')
data['GPU_Series_Clean'] = pd.to_numeric(data['GPU_Series_Clean'], errors='coerce')

In [58]:
# Assign Performance Tier
def assign_tier(series):
    if pd.isna(series):
        return 'Unknown'
    elif series <= 500:
        return 'Low-End'
    elif 500 < series <= 800:
        return 'Mid-Range'
    else:
        return 'High-End'

In [59]:
data['Performance_Tier'] = data['GPU_Series_Clean'].apply(assign_tier)

In [60]:
data=data.drop(['ScreenResolution'], axis=1) 

In [61]:
data.columns

Index(['Company', 'Product', 'TypeName', 'Inches', 'CPU_Frequency (GHz)',
       'RAM (GB)', 'Memory Type', 'Memory Capacity', 'GPU_Company', 'GPU_Type',
       'OpSys', 'Weight (kg)', 'Price (SAR)', 'Resolution_Width',
       'Resolution_Height', 'Contains_HD', 'Contains_IPS',
       'Contains_Touchscreen', 'Contains_4K', 'Contains_Quad_HD_plus',
       'CPU_Family', 'CPU_Generation', 'CPU_Series', 'GPU_Family',
       'GPU_Series', 'GPU_Series_Clean', 'Series_Type', 'Performance_Tier'],
      dtype='object')

In [62]:
len(data.columns)

28

In [63]:
data.sample(10)

Unnamed: 0,Company,Product,TypeName,Inches,CPU_Frequency (GHz),RAM (GB),Memory Type,Memory Capacity,GPU_Company,GPU_Type,OpSys,Weight (kg),Price (SAR),Resolution_Width,Resolution_Height,Contains_HD,Contains_IPS,Contains_Touchscreen,Contains_4K,Contains_Quad_HD_plus,CPU_Family,CPU_Generation,CPU_Series,GPU_Family,GPU_Series,GPU_Series_Clean,Series_Type,Performance_Tier
405,Asus,Zenbook 3,Ultrabook,14.0,2.7,8,SSD,512,Intel,HD Graphics 620,Windows 10,1.1,7435.81,1920,1080,1,0,0,0,0,Core,7,i7,HD,620,620,Numeric,Mid-Range
1117,Razer,Blade Stealth,Ultrabook,12.5,2.5,8,SSD,256,Intel,HD Graphics 520,Windows 10,1.25,4085.13,3840,2160,1,0,1,1,0,Core,6,i7,HD,520,520,Numeric,Mid-Range
970,Dell,XPS 13,2 in 1 Convertible,13.3,1.3,16,SSD,512,Intel,HD Graphics 615,Windows 10,1.22,7539.03,3200,1800,1,0,1,0,1,Core,7,i7,HD,615,615,Numeric,Mid-Range
620,HP,ProBook 650,Notebook,14.0,2.6,8,SSD,256,Intel,HD Graphics 620,Windows 10,2.31,4462.28,1366,768,0,0,0,0,0,Core,7,i5,HD,620,620,Numeric,Mid-Range
528,Dell,Inspiron 3567,Notebook,15.6,2.5,4,HDD,500,AMD,Radeon R5 M430,Windows 10,2.3,2322.45,1920,1080,1,0,0,0,0,Core,7,i5,Radeon,M430,430,Alphanumeric,Low-End
1120,Vero,V131 (X5-Z8350/4GB/32GB/FHD/W10),Notebook,13.3,1.44,4,Flash,32,Intel,HD Graphics 400,Windows 10,1.35,778.12,1920,1080,1,0,0,0,0,Atom,8,Other,HD,400,400,Numeric,Low-End
626,Mediacom,SmartBook 140,Notebook,14.0,1.44,2,Flash,32,Intel,HD Graphics,Windows 10,1.4,948.83,1920,1080,1,0,0,0,0,Atom,8,Other,HD,Graphics,640,Alphanumeric,Mid-Range
1186,Dell,Inspiron 5578,2 in 1 Convertible,15.6,2.7,16,SSD,512,Intel,HD Graphics 620,Windows 10,2.09,4680.63,1920,1080,1,0,1,0,0,Core,7,i7,HD,620,620,Numeric,Mid-Range
82,HP,ProBook 440,Notebook,14.0,1.6,8,SSD,256,Intel,HD Graphics 620,Windows 10,1.63,3414.2,1920,1080,1,0,0,0,0,Core,8,i5,HD,620,620,Numeric,Mid-Range
219,Dell,Inspiron 5379,2 in 1 Convertible,13.3,1.8,16,SSD,512,Intel,UHD Graphics 620,Windows 10,1.62,5077.63,1920,1080,1,0,1,0,0,Core,8,i7,UHD,620,620,Numeric,Mid-Range


In [64]:
pd.set_option('display.max_columns', 500)


# Save data

In [65]:

data.to_csv('../Data/featured_dataset.csv', index=False)