# Import Libraries & dataset

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
data  = pd.read_csv('../Data/laptop_price - dataset.csv')

# Basic Data Exploration

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1275 entries, 0 to 1274
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              1275 non-null   object 
 1   Product              1275 non-null   object 
 2   TypeName             1275 non-null   object 
 3   Inches               1275 non-null   float64
 4   ScreenResolution     1275 non-null   object 
 5   CPU_Company          1275 non-null   object 
 6   CPU_Type             1275 non-null   object 
 7   CPU_Frequency (GHz)  1275 non-null   float64
 8   RAM (GB)             1275 non-null   int64  
 9   Memory               1275 non-null   object 
 10  GPU_Company          1275 non-null   object 
 11  GPU_Type             1275 non-null   object 
 12  OpSys                1275 non-null   object 
 13  Weight (kg)          1275 non-null   float64
 14  Price (Euro)         1275 non-null   float64
dtypes: float64(4), int64(1), object(10)
me

In [4]:
data.describe()

Unnamed: 0,Inches,CPU_Frequency (GHz),RAM (GB),Weight (kg),Price (Euro)
count,1275.0,1275.0,1275.0,1275.0,1275.0
mean,15.022902,2.30298,8.440784,2.040525,1134.969059
std,1.42947,0.503846,5.097809,0.669196,700.752504
min,10.1,0.9,2.0,0.69,174.0
25%,14.0,2.0,4.0,1.5,609.0
50%,15.6,2.5,8.0,2.04,989.0
75%,15.6,2.7,8.0,2.31,1496.5
max,18.4,3.6,64.0,4.7,6099.0


In [5]:
for col in data.select_dtypes(include=['object']).columns:
  print(f"Unique values for column '{col}':")
  print(data[col].unique())
  print("-" * 20)

Unique values for column 'Company':
['Apple' 'HP' 'Acer' 'Asus' 'Dell' 'Lenovo' 'Chuwi' 'MSI' 'Microsoft'
 'Toshiba' 'Huawei' 'Xiaomi' 'Vero' 'Razer' 'Mediacom' 'Samsung' 'Google'
 'Fujitsu' 'LG']
--------------------
Unique values for column 'Product':
['MacBook Pro' 'Macbook Air' '250 G6' 'Aspire 3' 'ZenBook UX430UN'
 'Swift 3' 'Inspiron 3567' 'MacBook 12"' 'IdeaPad 320-15IKB' 'XPS 13'
 'Vivobook E200HA' 'Legion Y520-15IKBN' '255 G6' 'Inspiron 5379'
 '15-BS101nv (i7-8550U/8GB/256GB/FHD/W10)' 'MacBook Air' 'Inspiron 5570'
 'Latitude 5590' 'ProBook 470' 'LapBook 15.6"'
 'E402WA-GA010T (E2-6110/2GB/32GB/W10)'
 '17-ak001nv (A6-9220/4GB/500GB/Radeon' 'IdeaPad 120S-14IAP'
 'Inspiron 5770' 'ProBook 450' 'X540UA-DM186 (i3-6006U/4GB/1TB/FHD/Linux)'
 'Inspiron 7577' 'X542UQ-GO005 (i5-7200U/8GB/1TB/GeForce'
 'Aspire A515-51G' 'Inspiron 7773' 'IdeaPad 320-15ISK' 'Rog Strix'
 'X751NV-TY001T (N4200/4GB/1TB/GeForce' 'Yoga Book' 'ProBook 430'
 'Inspiron 3576' '15-bs002nv (i3-6006U/4GB/128GB/FHD/W10)

In [6]:
data.duplicated().any()

np.False_

In [7]:
data.isnull().sum() / len(data) *100

Company                0.0
Product                0.0
TypeName               0.0
Inches                 0.0
ScreenResolution       0.0
CPU_Company            0.0
CPU_Type               0.0
CPU_Frequency (GHz)    0.0
RAM (GB)               0.0
Memory                 0.0
GPU_Company            0.0
GPU_Type               0.0
OpSys                  0.0
Weight (kg)            0.0
Price (Euro)           0.0
dtype: float64

# Feature Engneering

## Converting Price

In [8]:
exchange_rate = 3.97  # 1 EUR to SAR

data['Price (SAR)'] = data['Price (Euro)'] * exchange_rate

data = data.drop('Price (Euro)', axis=1)

In [9]:
data.sample()

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,CPU_Company,CPU_Type,CPU_Frequency (GHz),RAM (GB),Memory,GPU_Company,GPU_Type,OpSys,Weight (kg),Price (SAR)
863,Lenovo,Legion Y520-15IKBN,Gaming,15.6,IPS Panel Full HD 1920x1080,Intel,Core i5 7300HQ,2.5,8,1TB HDD,Nvidia,GeForce GTX 1050,No OS,2.5,4037.49


## Splitting Memory column

In [10]:
def memory_split(memory):
    try:
        if '+' in memory:
            split_mem = memory.split('+')
            mem1 = split_mem[0].strip()
            mem2 = split_mem[1].strip()
            mem_type = mem1.split(' ')[1] + '+' + mem2.split(' ')[1]
            mem1_capacity = int(re.findall(r'\d+', mem1)[0])
            mem2_capacity = int(re.findall(r'\d+', mem2)[0])
            if 'GB' in mem1 and 'GB' in mem2:
              total_capacity = mem1_capacity + mem2_capacity
            elif 'TB' in mem1 and 'GB' in mem2:
              total_capacity = mem1_capacity * 1024 + mem2_capacity
            elif 'GB' in mem1 and 'TB' in mem2:
              total_capacity = mem1_capacity + mem2_capacity * 1024
            else:
              total_capacity = mem1_capacity * 1024 + mem2_capacity * 1024
            return total_capacity, mem_type
        else:
            capacity = re.findall(r'\d+', memory)[0]
            mem_type = memory.split(' ')[1]
            if 'TB' in memory:
              capacity = int(capacity) * 1024
            return int(capacity), mem_type

    except:
        return np.nan, np.nan


data['Memory Capacity'], data['Memory Type'] = zip(*data['Memory'].apply(memory_split))

In [11]:
memory_cols = ['Memory Capacity', 'Memory Type']
memory_index = data.columns.get_loc('Memory')
for col in memory_cols:
    data.insert(memory_index + 1, col, data.pop(col))
data.drop(['Memory'],axis= 1,inplace=True)

In [12]:
data.sample(10)

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,CPU_Company,CPU_Type,CPU_Frequency (GHz),RAM (GB),Memory Type,Memory Capacity,GPU_Company,GPU_Type,OpSys,Weight (kg),Price (SAR)
894,MSI,GS73VR Stealth,Gaming,17.3,IPS Panel 4K Ultra HD 3840x2160,Intel,Core i7 6700HQ,2.6,16,SSD+HDD,2560,Nvidia,GeForce GTX 1060,Windows 10,2.43,10516.53
44,Dell,Inspiron 7773,2 in 1 Convertible,17.3,Full HD / Touchscreen 1920x1080,Intel,Core i5 8250U,1.6,12,HDD,1024,Nvidia,GeForce 150MX,Windows 10,2.77,3966.03
1133,HP,250 G6,Ultrabook,15.6,Full HD 1920x1080,Intel,Core i7 7500U,2.7,8,SSD,256,Intel,HD Graphics 620,Windows 10,1.84,2985.44
348,Asus,VivoBook Flip,2 in 1 Convertible,11.6,Touchscreen 1366x768,Intel,Celeron Dual Core N3350,1.1,4,Flash,32,Intel,HD Graphics 500,Windows 10,1.5,1488.75
1099,Asus,G752VY-GC162T (i7-6700HQ/16GB/1TB,Gaming,17.3,IPS Panel Full HD 1920x1080,Intel,Core i7 6700HQ,2.6,16,SSD+HDD,1152,Nvidia,GeForce GTX 980M,Windows 10,4.3,9127.03
105,HP,Pavilion 14-BK001nv,Notebook,14.0,IPS Panel Full HD 1920x1080,Intel,Core i5 7200U,2.5,6,SSD,256,Nvidia,GeForce 940MX,Windows 10,1.58,2616.23
321,Lenovo,IdeaPad 320-17IKB,Notebook,17.3,1600x900,Intel,Core i5 7200U,2.5,4,HDD,1024,Nvidia,GeForce 920MX,Windows 10,2.79,2338.33
360,HP,250 G6,Notebook,15.6,1366x768,Intel,Core i3 6006U,2.0,4,HDD,500,Intel,HD Graphics 520,Windows 10,1.86,1742.83
611,Dell,Inspiron 7559,Gaming,15.6,Full HD 1920x1080,Intel,Core i7 6700HQ,2.6,16,HDD,1024,Nvidia,GeForce GTX 960<U+039C>,Windows 10,2.59,3489.6697
143,Toshiba,Satellite Pro,Notebook,15.6,1366x768,Intel,Core i5 6200U,2.3,4,HDD,500,Intel,HD Graphics 520,Windows 10,2.1,2389.94


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1275 entries, 0 to 1274
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              1275 non-null   object 
 1   Product              1275 non-null   object 
 2   TypeName             1275 non-null   object 
 3   Inches               1275 non-null   float64
 4   ScreenResolution     1275 non-null   object 
 5   CPU_Company          1275 non-null   object 
 6   CPU_Type             1275 non-null   object 
 7   CPU_Frequency (GHz)  1275 non-null   float64
 8   RAM (GB)             1275 non-null   int64  
 9   Memory Type          1275 non-null   object 
 10  Memory Capacity      1275 non-null   int64  
 11  GPU_Company          1275 non-null   object 
 12  GPU_Type             1275 non-null   object 
 13  OpSys                1275 non-null   object 
 14  Weight (kg)          1275 non-null   float64
 15  Price (SAR)          1275 non-null   f

In [14]:
data['Memory Type'].unique()

array(['SSD', 'Flash', 'HDD', 'SSD+HDD', 'SSD+SSD', 'Hybrid', 'Flash+HDD',
       'HDD+HDD', 'SSD+Hybrid'], dtype=object)

## ScreenResolution splitting

In [15]:
data[['Resolution_Width', 'Resolution_Height']] = data['ScreenResolution'].str.extract(r'(\d{3,4})x(\d{3,4})')


data['Resolution_Width'] = data['Resolution_Width'].astype(int)
data['Resolution_Height'] = data['Resolution_Height'].astype(int)


data['Contains_HD'] = data['ScreenResolution'].str.contains('HD', case=False).astype(int)
data['Contains_IPS'] = data['ScreenResolution'].str.contains('IPS', case=False).astype(int)
data['Contains_Touchscreen'] = data['ScreenResolution'].str.contains('Touchscreen', case=False).astype(int)


data['Contains_4K'] = data['ScreenResolution'].str.contains('4K', case=False).astype(int)
data['Contains_Quad_HD_plus'] = data['ScreenResolution'].str.contains('Quad HD+', case=False).astype(int)

In [16]:
data.sample(10)

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,CPU_Company,CPU_Type,CPU_Frequency (GHz),RAM (GB),Memory Type,...,OpSys,Weight (kg),Price (SAR),Resolution_Width,Resolution_Height,Contains_HD,Contains_IPS,Contains_Touchscreen,Contains_4K,Contains_Quad_HD_plus
137,Asus,FX753VD-GC086T (i5-7300HQ/8GB/1TB,Gaming,17.3,Full HD 1920x1080,Intel,Core i5 7300HQ,2.5,8,SSD+HDD,...,Windows 10,3.0,3723.86,1920,1080,1,0,0,0,0
1186,Dell,Inspiron 5578,2 in 1 Convertible,15.6,Full HD / Touchscreen 1920x1080,Intel,Core i7 7500U,2.7,16,SSD,...,Windows 10,2.09,4680.63,1920,1080,1,0,1,0,0
1037,Toshiba,Tecra Z40-C-136,Ultrabook,14.0,IPS Panel Full HD 1920x1080,Intel,Core i7 6600U,2.6,8,SSD,...,Windows 10,1.47,6848.25,1920,1080,1,1,0,0,0
70,Microsoft,Surface Laptop,Ultrabook,13.5,Touchscreen 2256x1504,Intel,Core i5 7200U,2.5,4,SSD,...,Windows 10 S,1.252,4323.33,2256,1504,0,0,1,0,0
400,Lenovo,IdeaPad 320-15IAP,Notebook,15.6,1366x768,Intel,Celeron Dual Core N3350,1.1,4,HDD,...,Windows 10,2.2,1214.82,1366,768,0,0,0,0,0
135,HP,Probook 440,Notebook,14.0,Full HD 1920x1080,Intel,Core i7 8550U,1.8,8,SSD,...,Windows 10,1.63,3922.36,1920,1080,1,0,0,0,0
225,Acer,Aspire E5-576G,Notebook,15.6,Full HD 1920x1080,Intel,Core i7 7500U,2.7,8,SSD,...,Windows 10,2.2,3303.04,1920,1080,1,0,0,0,0
670,Microsoft,Surface Laptop,Ultrabook,13.5,Touchscreen 2256x1504,Intel,Core i7 7600U,2.8,8,SSD,...,Windows 10 S,1.252,7415.3645,2256,1504,0,0,1,0,0
133,Acer,Aspire A515-51G-37JS,Notebook,15.6,Full HD 1920x1080,Intel,Core i3 7130U,2.7,4,SSD,...,Windows 10,2.2,2270.84,1920,1080,1,0,0,0,0
251,Asus,ROG G752VSK-GC493T,Gaming,17.3,Full HD 1920x1080,Intel,Core i7 7700HQ,2.8,16,SSD+HDD,...,Windows 10,4.3,7142.03,1920,1080,1,0,0,0,0


## CPU

In [17]:
import re

# Define functions for more precise extraction
def extract_family(cpu_type, company):
    if company == 'Intel':
        return re.search(r'^(Core|Xeon|Pentium|Celeron|Atom|Core M)', cpu_type, re.IGNORECASE).group(1) if re.search(r'^(Core|Xeon|Pentium|Celeron|Atom|Core M)', cpu_type, re.IGNORECASE) else 'Unknown'
    elif company == 'AMD':
        return re.search(r'^(Ryzen|A[0-9]|FX|Athlon|E[0-9]|Pro|Sempron)', cpu_type, re.IGNORECASE).group(1) if re.search(r'^(Ryzen|A[0-9]|FX|Athlon|E[0-9]|Pro|Sempron)', cpu_type, re.IGNORECASE) else 'Unknown'
    elif company == 'Samsung':
        return re.search(r'^(Exynos)', cpu_type, re.IGNORECASE).group(1) if re.search(r'^(Exynos)', cpu_type, re.IGNORECASE) else 'Unknown'
    return 'Unknown'

def extract_generation(cpu_type, company):
    if company == 'Intel':
        match = re.search(r'(\d{4,5}[A-Za-z]*)$', cpu_type)
        return match.group(1)[:1] if match else 'Unknown'  # First digit for generation
    elif company == 'AMD':
        match = re.search(r'(\d{4,5})$', cpu_type)
        return match.group(1)[:1] if match else 'Unknown'
    elif company == 'Samsung':
        match = re.search(r'Exynos (\d+)', cpu_type, re.IGNORECASE)
        return match.group(1)[:1] if match else 'Unknown'
    return 'Unknown'

def extract_series(cpu_type, company):
    if company == 'Intel':
        # Extract Intel series like i3, i5, i7, or similar
        match = re.search(r'(i3|i5|i7|i9|m3|m5)', cpu_type, re.IGNORECASE)
        return match.group(1) if match else 'Other'
    elif company == 'AMD':
        # Extract the numeric series number after the family name
        match = re.search(r'(?:A[0-9]-Series|Ryzen \d|FX|Athlon|E[0-9])[^\d]*(\d+)', cpu_type, re.IGNORECASE)
        return match.group(1) if match else 'Other'
    elif company == 'Samsung':
        # Extract the Exynos series number
        match = re.search(r'Exynos (\d+)', cpu_type, re.IGNORECASE)
        return match.group(1) if match else 'Other'
    return 'Other'


# Apply the refined functions
data['CPU_Family'] = data.apply(lambda row: extract_family(row['CPU_Type'], row['CPU_Company']), axis=1)
data['CPU_Generation'] = data.apply(lambda row: extract_generation(row['CPU_Type'], row['CPU_Company']), axis=1)
data['CPU_Series'] = data.apply(lambda row: extract_series(row['CPU_Type'], row['CPU_Company']), axis=1)


In [18]:
data['CPU_Series'] = data.groupby(['CPU_Family', 'CPU_Generation'])['CPU_Series'].transform(
    lambda x: x.replace('Other', x.mode()[0] if not x.mode().empty else 'Unknown')
)

In [19]:
data['CPU_Generation'] = data.groupby(['CPU_Family', 'CPU_Series'])['CPU_Generation'].transform(
    lambda x: x.replace('Unknown', x.mode()[0] if not x.mode().empty else 'Unknown')
)

In [20]:
data['CPU_Family'] = data.groupby(['CPU_Generation', 'CPU_Series'])['CPU_Family'].transform(
    lambda x: x.replace('Unknown', x.mode()[0] if not x.mode().empty else 'Unknown')
)

In [21]:
data=data.drop(['CPU_Type', 'CPU_Company'],axis=1)
data.head()

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,CPU_Frequency (GHz),RAM (GB),Memory Type,Memory Capacity,GPU_Company,...,Resolution_Width,Resolution_Height,Contains_HD,Contains_IPS,Contains_Touchscreen,Contains_4K,Contains_Quad_HD_plus,CPU_Family,CPU_Generation,CPU_Series
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,2.3,8,SSD,128,Intel,...,2560,1600,0,1,0,0,0,Core,7,i5
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,1.8,8,Flash,128,Intel,...,1440,900,0,0,0,0,0,Core,7,i5
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,2.5,8,SSD,256,Intel,...,1920,1080,1,0,0,0,0,Core,7,i5
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,2.7,16,SSD,512,AMD,...,2880,1800,0,1,0,0,0,Core,7,i7
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,3.1,8,SSD,256,Intel,...,2560,1600,0,1,0,0,0,Core,7,i5


## GPU 

In [22]:
data['GPU_Family'] = data['GPU_Type'].apply(lambda x: x.split(' ')[0])
data['GPU_Series'] = data['GPU_Type'].apply(lambda x: x.split(' ')[-1])

In [23]:
data['GPU_Series_Clean'] = data['GPU_Series'].str.extract('(\d+)').fillna(data['GPU_Series'])

  data['GPU_Series_Clean'] = data['GPU_Series'].str.extract('(\d+)').fillna(data['GPU_Series'])


In [24]:
data['Series_Type'] = data['GPU_Series'].apply(lambda x: 'Numeric' if x.isdigit() else 'Alphanumeric')
data['GPU_Series_Clean'] = pd.to_numeric(data['GPU_Series_Clean'], errors='coerce')

In [25]:
# Assign Performance Tier
def assign_tier(series):
    if pd.isna(series):
        return 'Unknown'
    elif series <= 500:
        return 'Low-End'
    elif 500 < series <= 800:
        return 'Mid-Range'
    else:
        return 'High-End'

In [26]:
data['Performance_Tier'] = data['GPU_Series_Clean'].apply(assign_tier)

In [27]:
data=data.drop(['ScreenResolution'], axis=1) 

In [28]:
data.columns

Index(['Company', 'Product', 'TypeName', 'Inches', 'CPU_Frequency (GHz)',
       'RAM (GB)', 'Memory Type', 'Memory Capacity', 'GPU_Company', 'GPU_Type',
       'OpSys', 'Weight (kg)', 'Price (SAR)', 'Resolution_Width',
       'Resolution_Height', 'Contains_HD', 'Contains_IPS',
       'Contains_Touchscreen', 'Contains_4K', 'Contains_Quad_HD_plus',
       'CPU_Family', 'CPU_Generation', 'CPU_Series', 'GPU_Family',
       'GPU_Series', 'GPU_Series_Clean', 'Series_Type', 'Performance_Tier'],
      dtype='object')

In [29]:
len(data.columns)

28

In [30]:
data.sample(10)

Unnamed: 0,Company,Product,TypeName,Inches,CPU_Frequency (GHz),RAM (GB),Memory Type,Memory Capacity,GPU_Company,GPU_Type,...,Contains_4K,Contains_Quad_HD_plus,CPU_Family,CPU_Generation,CPU_Series,GPU_Family,GPU_Series,GPU_Series_Clean,Series_Type,Performance_Tier
432,Lenovo,IdeaPad 320s-14IKB,Notebook,14.0,2.7,4,SSD,128,Intel,HD Graphics 620,...,0,0,Core,7,i3,HD,620,620,Numeric,Mid-Range
1181,Lenovo,IdeaPad 310-15IKB,Ultrabook,15.6,2.7,6,SSD,256,Nvidia,GeForce 920M,...,0,0,Core,7,i7,GeForce,920M,920,Alphanumeric,High-End
21,Lenovo,Legion Y520-15IKBN,Gaming,15.6,2.5,8,SSD+HDD,1152,Nvidia,GeForce GTX 1050,...,0,0,Core,7,i5,GeForce,1050,1050,Numeric,High-End
602,Acer,Aspire 1,Notebook,14.0,1.1,4,Flash,32,Intel,HD Graphics 500,...,0,0,Celeron,3,Other,HD,500,500,Numeric,Low-End
69,Asus,FX753VE-GC093 (i7-7700HQ/12GB/1TB/GeForce,Gaming,17.3,2.8,12,HDD,1024,Nvidia,GeForce GTX 1050 Ti,...,0,0,Core,7,i7,GeForce,Ti,640,Alphanumeric,Mid-Range
1130,HP,15-bs078cl (i7-7500U/8GB/2TB/W10),Notebook,15.6,2.7,8,HDD,2048,Intel,HD Graphics 620,...,0,0,Core,7,i7,HD,620,620,Numeric,Mid-Range
468,HP,Omen 17-an006nv,Gaming,17.3,2.8,12,HDD,1024,Nvidia,GeForce GTX 1060,...,0,0,Core,7,i7,GeForce,1060,1060,Numeric,High-End
848,HP,EliteBook x360,2 in 1 Convertible,13.3,2.5,8,SSD,256,Intel,HD Graphics 620,...,0,0,Core,7,i5,HD,620,620,Numeric,Mid-Range
561,Lenovo,V330-15IKB (i3-7130U/4GB/128GB/FHD/W10),Notebook,15.6,2.7,4,SSD,128,Intel,HD Graphics 620,...,0,0,Core,7,i3,HD,620,620,Numeric,Mid-Range
642,Lenovo,ThinkPad X1,Ultrabook,14.0,2.7,16,SSD,512,Intel,HD Graphics 620,...,0,0,Core,7,i7,HD,620,620,Numeric,Mid-Range


In [38]:
pd.set_option('display.max_columns', 500)

In [42]:
data

Unnamed: 0,Company,Product,TypeName,Inches,CPU_Frequency (GHz),RAM (GB),Memory Type,Memory Capacity,GPU_Company,GPU_Type,OpSys,Weight (kg),Price (SAR),Resolution_Width,Resolution_Height,Contains_HD,Contains_IPS,Contains_Touchscreen,Contains_4K,Contains_Quad_HD_plus,CPU_Family,CPU_Generation,CPU_Series,GPU_Family,GPU_Series,GPU_Series_Clean,Series_Type,Performance_Tier,Old_Generations,Unknown_Generation
0,Apple,MacBook Pro,Ultrabook,13.3,2.3,8,SSD,128,Intel,Iris Plus Graphics 640,macOS,1.37,5318.5693,2560,1600,0,1,0,0,0,Core,7,i5,Iris,640,640,Numeric,Mid-Range,,
1,Apple,Macbook Air,Ultrabook,13.3,1.8,8,Flash,128,Intel,HD Graphics 6000,macOS,1.34,3568.7918,1440,900,0,0,0,0,0,Core,7,i5,HD,6000,6000,Numeric,High-End,,
2,HP,250 G6,Notebook,15.6,2.5,8,SSD,256,Intel,HD Graphics 620,No OS,1.86,2282.7500,1920,1080,1,0,0,0,0,Core,7,i5,HD,620,620,Numeric,Mid-Range,,
3,Apple,MacBook Pro,Ultrabook,15.4,2.7,16,SSD,512,AMD,Radeon Pro 455,macOS,1.83,10073.6765,2880,1800,0,1,0,0,0,Core,7,i7,Radeon,455,455,Numeric,Low-End,,
4,Apple,MacBook Pro,Ultrabook,13.3,3.1,8,SSD,256,Intel,Iris Plus Graphics 650,macOS,1.37,7160.2920,2560,1600,0,1,0,0,0,Core,7,i5,Iris,650,650,Numeric,Mid-Range,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1270,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,2.5,4,SSD,128,Intel,HD Graphics 520,Windows 10,1.80,2532.8600,1920,1080,1,1,1,0,0,Core,6,i7,HD,520,520,Numeric,Mid-Range,,
1271,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,2.5,16,SSD,512,Intel,HD Graphics 520,Windows 10,1.30,5951.0300,3200,1800,1,1,1,0,1,Core,6,i7,HD,520,520,Numeric,Mid-Range,,
1272,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,1.6,2,Flash,64,Intel,HD Graphics,Windows 10,1.50,909.1300,1366,768,0,0,0,0,0,Celeron,,Other,HD,Graphics,640,Alphanumeric,Mid-Range,3,
1273,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,2.5,6,HDD,1024,AMD,Radeon R5 M330,Windows 10,2.19,3033.0800,1366,768,0,0,0,0,0,Core,6,i7,Radeon,M330,330,Alphanumeric,Low-End,,


In [34]:
# القيم الفريدة لكل عمود
print("CPU_Generation unique values:", data['CPU_Generation'].unique())
print("CPU_Series unique values:", data['CPU_Series'].unique())


CPU_Generation unique values: ['7' '9' '8' '6' 'Unknown' '3' '1' '4']
CPU_Series unique values: ['i5' 'i7' '9420' 'i3' 'm3' 'Other' '9000' '6110' '9220' '700' '9830' '6'
 '7410' '600' '9' 'M3' '7310' '7210' '8800' '9410']


In [35]:
import pandas as pd

# افترض أن لديك DataFrame باسم 'data'

# فصل الأجيال القديمة إلى عمود منفصل
old_generations = ['1', '2', '3', '4', '5']  # تحديد الأجيال القديمة
data['Old_Generations'] = data['CPU_Generation'].apply(lambda x: x if x in old_generations else None)

# فصل "Unknown" إلى عمود منفصل
data['Unknown_Generation'] = data['CPU_Generation'].apply(lambda x: x if x == 'Unknown' else None)

# إزالة الأجيال القديمة و"Unknown" من العمود الأصلي
data['CPU_Generation'] = data['CPU_Generation'].apply(lambda x: x if x not in old_generations and x != 'Unknown' else None)

# عرض البيانات بعد فصل الأعمدة
print(data[['CPU_Generation', 'Old_Generations', 'Unknown_Generation']].head())


  CPU_Generation Old_Generations Unknown_Generation
0              7            None               None
1              7            None               None
2              7            None               None
3              7            None               None
4              7            None               None


In [43]:
# تصنيف الأعمدة بناءً على النوع
categorical_columns = data.select_dtypes(include=['object']).columns
numeric_columns = data.select_dtypes(include=['number']).columns

print("Categorical Columns:", categorical_columns)
print("Numeric Columns:", numeric_columns)


Categorical Columns: Index(['Company', 'Product', 'TypeName', 'Memory Type', 'GPU_Company',
       'GPU_Type', 'OpSys', 'CPU_Family', 'CPU_Generation', 'CPU_Series',
       'GPU_Family', 'GPU_Series', 'Series_Type', 'Performance_Tier',
       'Old_Generations', 'Unknown_Generation'],
      dtype='object')
Numeric Columns: Index(['Inches', 'CPU_Frequency (GHz)', 'RAM (GB)', 'Memory Capacity',
       'Weight (kg)', 'Price (SAR)', 'Resolution_Width', 'Resolution_Height',
       'Contains_HD', 'Contains_IPS', 'Contains_Touchscreen', 'Contains_4K',
       'Contains_Quad_HD_plus', 'GPU_Series_Clean'],
      dtype='object')


In [None]:
import pandas as pd



        #One-Hot Encoding
one_hot_columns = ['Company', 'Product', 'TypeName', 'Memory Type', 'GPU_Company',
                   'GPU_Type', 'OpSys', 'CPU_Family', 'GPU_Family', 'GPU_Series',
                   'Series_Type', 'Performance_Tier', 'Old_Generations', 'Unknown_Generation']

# pd.get_dummies
data_encoded = pd.get_dummies(data, columns=one_hot_columns, drop_first=True)

#Order Encoding 'CPU_Generation' 'CPU_Series'
cpu_generation_order = {'Unknown': 1, '1': 2, '2': 3, '3': 4, '4': 5, '5': 6, '7': 8, '9': 10}
cpu_series_order = {'Other': 1, '600': 2, '700': 3, '9420': 4, 'M3': 5, 'M5': 6, 'i3': 7, 'i5': 8, 'i7': 9, 'i9': 10, }


# تطبيق Order Encoding
data_encoded['CPU_Generation'] = data_encoded['CPU_Generation'].map(cpu_generation_order)
data_encoded['CPU_Series'] = data_encoded['CPU_Series'].map(cpu_series_order)

# عرض البيانات بعد التغيير
print(data_encoded.head())


   Inches  CPU_Frequency (GHz)  RAM (GB)  Memory Capacity  Weight (kg)  \
0    13.3                  2.3         8              128         1.37   
1    13.3                  1.8         8              128         1.34   
2    15.6                  2.5         8              256         1.86   
3    15.4                  2.7        16              512         1.83   
4    13.3                  3.1         8              256         1.37   

   Price (SAR)  Resolution_Width  Resolution_Height  Contains_HD  \
0    5318.5693              2560               1600            0   
1    3568.7918              1440                900            0   
2    2282.7500              1920               1080            1   
3   10073.6765              2880               1800            0   
4    7160.2920              2560               1600            0   

   Contains_IPS  Contains_Touchscreen  Contains_4K  Contains_Quad_HD_plus  \
0             1                     0            0                   

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np



X = data_encoded.drop(columns=['Price (SAR)'])  
y = data_encoded['Price (SAR)']  


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# تدريب نموذج Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# التنبؤ بالنتائج
y_pred = model.predict(X_test)




y_pred = model.predict(X_test)

# حساب MSE
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error (MSE): {mse}')

# حساب MAE
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae}')

rmse = np.sqrt(mse)
print(f'Root Mean Squared Error (RMSE): {rmse}')

# حساب R² score
r2 = r2_score(y_test, y_pred)
print(f'R² Score: {r2}')



Mean Squared Error (MSE): 1000447.9888758012
Mean Absolute Error (MAE): 672.4564458505882
Root Mean Squared Error (RMSE): 1000.2239693567642
R² Score: 0.8721105086077962



# Save data

In [None]:

data.to_csv('../Data/featured_dataset.csv', index=False)