In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

feature engineering

In [2]:
data  = pd.read_csv('../Data/laptop_price - dataset.csv')

In [3]:
exchange_rate = 3.97  # 1 EUR to SAR

data['Price (SAR)'] = data['Price (Euro)'] * exchange_rate

data = data.drop(columns=['Price (Euro)' , 'Product'], axis=1)

splitting memory

In [4]:
def memory_split(memory):
    try:
        if '+' in memory:
            split_mem = memory.split('+')
            mem1 = split_mem[0].strip()
            mem2 = split_mem[1].strip()
            mem_type = mem1.split(' ')[1] + '+' + mem2.split(' ')[1]
            mem1_capacity = int(re.findall(r'\d+', mem1)[0])
            mem2_capacity = int(re.findall(r'\d+', mem2)[0])
            if 'GB' in mem1 and 'GB' in mem2:
              total_capacity = mem1_capacity + mem2_capacity
            elif 'TB' in mem1 and 'GB' in mem2:
              total_capacity = mem1_capacity * 1024 + mem2_capacity
            elif 'GB' in mem1 and 'TB' in mem2:
              total_capacity = mem1_capacity + mem2_capacity * 1024
            else:
              total_capacity = mem1_capacity * 1024 + mem2_capacity * 1024
            return total_capacity, mem_type
        else:
            capacity = re.findall(r'\d+', memory)[0]
            mem_type = memory.split(' ')[1]
            if 'TB' in memory:
              capacity = int(capacity) * 1024
            return int(capacity), mem_type

    except:
        return np.nan, np.nan


data['Memory Capacity'], data['Memory Type'] = zip(*data['Memory'].apply(memory_split))

In [5]:
memory_cols = ['Memory Capacity', 'Memory Type']
memory_index = data.columns.get_loc('Memory')
for col in memory_cols:
    data.insert(memory_index + 1, col, data.pop(col))
data.drop(['Memory'],axis= 1,inplace=True)

splitting screen resolution

In [6]:
data[['Resolution_Width', 'Resolution_Height']] = data['ScreenResolution'].str.extract(r'(\d{3,4})x(\d{3,4})')


data['Resolution_Width'] = data['Resolution_Width'].astype(int)
data['Resolution_Height'] = data['Resolution_Height'].astype(int)


data['Contains_HD'] = data['ScreenResolution'].str.contains('HD', case=False).astype(int)
data['Contains_IPS'] = data['ScreenResolution'].str.contains('IPS', case=False).astype(int)
data['Contains_Touchscreen'] = data['ScreenResolution'].str.contains('Touchscreen', case=False).astype(int)


data['Contains_4K'] = data['ScreenResolution'].str.contains('4K', case=False).astype(int)
data['Contains_Quad_HD_plus'] = data['ScreenResolution'].str.contains('Quad HD+', case=False).astype(int)

CPU

In [7]:
import re

# Define functions for more precise extraction
def extract_family(cpu_type, company):
    if company == 'Intel':
        return re.search(r'^(Core|Xeon|Pentium|Celeron|Atom|Core M)', cpu_type, re.IGNORECASE).group(1) if re.search(r'^(Core|Xeon|Pentium|Celeron|Atom|Core M)', cpu_type, re.IGNORECASE) else 'Unknown'
    elif company == 'AMD':
        return re.search(r'^(Ryzen|A[0-9]|FX|Athlon|E[0-9]|Pro|Sempron)', cpu_type, re.IGNORECASE).group(1) if re.search(r'^(Ryzen|A[0-9]|FX|Athlon|E[0-9]|Pro|Sempron)', cpu_type, re.IGNORECASE) else 'Unknown'
    elif company == 'Samsung':
        return re.search(r'^(Exynos)', cpu_type, re.IGNORECASE).group(1) if re.search(r'^(Exynos)', cpu_type, re.IGNORECASE) else 'Unknown'
    return 'Unknown'

def extract_generation(cpu_type, company):
    if company == 'Intel':
        match = re.search(r'(\d{4,5}[A-Za-z]*)$', cpu_type)
        return match.group(1)[:1] if match else 'Unknown'  # First digit for generation
    elif company == 'AMD':
        match = re.search(r'(\d{4,5})$', cpu_type)
        return match.group(1)[:1] if match else 'Unknown'
    elif company == 'Samsung':
        match = re.search(r'Exynos (\d+)', cpu_type, re.IGNORECASE)
        return match.group(1)[:1] if match else 'Unknown'
    return 'Unknown'

def extract_series(cpu_type, company):
    if company == 'Intel':
        # Extract Intel series like i3, i5, i7, or similar
        match = re.search(r'(i3|i5|i7|i9|m3|m5)', cpu_type, re.IGNORECASE)
        return match.group(1) if match else 'Other'
    elif company == 'AMD':
        # Extract the numeric series number after the family name
        match = re.search(r'(?:A[0-9]-Series|Ryzen \d|FX|Athlon|E[0-9])[^\d]*(\d+)', cpu_type, re.IGNORECASE)
        return match.group(1) if match else 'Other'
    elif company == 'Samsung':
        # Extract the Exynos series number
        match = re.search(r'Exynos (\d+)', cpu_type, re.IGNORECASE)
        return match.group(1) if match else 'Other'
    return 'Other'


# Apply the refined functions
data['CPU_Family'] = data.apply(lambda row: extract_family(row['CPU_Type'], row['CPU_Company']), axis=1)
data['CPU_Generation'] = data.apply(lambda row: extract_generation(row['CPU_Type'], row['CPU_Company']), axis=1)
data['CPU_Series'] = data.apply(lambda row: extract_series(row['CPU_Type'], row['CPU_Company']), axis=1)


In [8]:
data['CPU_Series'] = data.groupby(['CPU_Family', 'CPU_Generation'])['CPU_Series'].transform(
    lambda x: x.replace('Other', x.mode()[0] if not x.mode().empty else 'Unknown')
)

In [9]:
data['CPU_Generation'] = data.groupby(['CPU_Family', 'CPU_Series'])['CPU_Generation'].transform(
    lambda x: x.replace('Unknown', x.mode()[0] if not x.mode().empty else 'Unknown')
)

In [10]:
data['CPU_Family'] = data.groupby(['CPU_Generation', 'CPU_Series'])['CPU_Family'].transform(
    lambda x: x.replace('Unknown', x.mode()[0] if not x.mode().empty else 'Unknown')
)

In [11]:
data=data.drop('CPU_Type',axis=1)


In [12]:
data.columns

Index(['Company', 'TypeName', 'Inches', 'ScreenResolution', 'CPU_Company',
       'CPU_Frequency (GHz)', 'RAM (GB)', 'Memory Type', 'Memory Capacity',
       'GPU_Company', 'GPU_Type', 'OpSys', 'Weight (kg)', 'Price (SAR)',
       'Resolution_Width', 'Resolution_Height', 'Contains_HD', 'Contains_IPS',
       'Contains_Touchscreen', 'Contains_4K', 'Contains_Quad_HD_plus',
       'CPU_Family', 'CPU_Generation', 'CPU_Series'],
      dtype='object')

splitting series/family/generation for better encoding

In [13]:
# Initialize new columns for extracted features
data['Intel_Series'] = None
data['AMD_Series'] = None

# Extract Intel and AMD specific series based on CPU_Company
data['Intel_Series'] = data.apply(
    lambda row: row['CPU_Series'] if row['CPU_Company'] == 'Intel' else None, axis=1
)
data['AMD_Series'] = data.apply(
    lambda row: row['CPU_Series'] if row['CPU_Company'] == 'AMD' else None, axis=1
)

# Display the updated dataset with the new columns
# import ace_tools as tools; tools.display_dataframe_to_user(name="CPU Series Feature Extraction", dataframe=data)


In [14]:
data.columns

Index(['Company', 'TypeName', 'Inches', 'ScreenResolution', 'CPU_Company',
       'CPU_Frequency (GHz)', 'RAM (GB)', 'Memory Type', 'Memory Capacity',
       'GPU_Company', 'GPU_Type', 'OpSys', 'Weight (kg)', 'Price (SAR)',
       'Resolution_Width', 'Resolution_Height', 'Contains_HD', 'Contains_IPS',
       'Contains_Touchscreen', 'Contains_4K', 'Contains_Quad_HD_plus',
       'CPU_Family', 'CPU_Generation', 'CPU_Series', 'Intel_Series',
       'AMD_Series'],
      dtype='object')

In [15]:
# Add a new column for Samsung Series
data['Samsung_Series'] = None

# Extract Samsung specific series based on CPU_Company
data['Samsung_Series'] = data.apply(
    lambda row: row['CPU_Series'] if row['CPU_Company'] == 'Samsung' else None, axis=1
)


In [16]:
data[data['Samsung_Series'].isna()]

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,CPU_Company,CPU_Frequency (GHz),RAM (GB),Memory Type,Memory Capacity,GPU_Company,...,Contains_IPS,Contains_Touchscreen,Contains_4K,Contains_Quad_HD_plus,CPU_Family,CPU_Generation,CPU_Series,Intel_Series,AMD_Series,Samsung_Series
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel,2.3,8,,,Intel,...,1,0,0,0,Core,7,i5,i5,,
1,Apple,Ultrabook,13.3,1440x900,Intel,1.8,8,,,Intel,...,0,0,0,0,Core,7,i5,i5,,
2,HP,Notebook,15.6,Full HD 1920x1080,Intel,2.5,8,,,Intel,...,0,0,0,0,Core,7,i5,i5,,
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel,2.7,16,,,AMD,...,1,0,0,0,Core,7,i7,i7,,
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel,3.1,8,,,Intel,...,1,0,0,0,Core,7,i5,i5,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1270,Lenovo,2 in 1 Convertible,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel,2.5,4,,,Intel,...,1,1,0,0,Core,6,i7,i7,,
1271,Lenovo,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel,2.5,16,,,Intel,...,1,1,0,1,Core,6,i7,i7,,
1272,Lenovo,Notebook,14.0,1366x768,Intel,1.6,2,,,Intel,...,0,0,0,0,Celeron,3,Other,Other,,
1273,HP,Notebook,15.6,1366x768,Intel,2.5,6,,,AMD,...,0,0,0,0,Core,6,i7,i7,,


In [17]:
len(data.columns)

27

In [18]:
data.CPU_Family.unique()

array(['Core', 'A9', 'Atom', 'Unknown', 'A6', 'Celeron', 'Ryzen',
       'Pentium', 'FX', 'Xeon', 'A1', 'A8', 'A4'], dtype=object)

In [19]:
data[['CPU_Company', 'CPU_Family']].head(10)

Unnamed: 0,CPU_Company,CPU_Family
0,Intel,Core
1,Intel,Core
2,Intel,Core
3,Intel,Core
4,Intel,Core
5,AMD,A9
6,Intel,Core
7,Intel,Core
8,Intel,Core
9,Intel,Core


In [20]:
# Add new columns for CPU families and generations by company
data['Intel_Family'] = None
data['AMD_Family'] = None
data['Samsung_Family'] = None

data['Intel_Generation'] = None
data['AMD_Generation'] = None
data['Samsung_Generation'] = None

# Extract CPU families and generations based on CPU_Company
data['Intel_Family'] = data.apply(
    lambda row: row['CPU_Family'] if row['CPU_Company'] == 'Intel' else None, axis=1
)
data['AMD_Family'] = data.apply(
    lambda row: row['CPU_Family'] if row['CPU_Company'] == 'AMD' else None, axis=1
)
data['Samsung_Family'] = data.apply(
    lambda row: row['CPU_Family'] if row['CPU_Company'] == 'Samsung' else None, axis=1
)

data['Intel_Generation'] = data.apply(
    lambda row: row['CPU_Generation'] if row['CPU_Company'] == 'Intel' else None, axis=1
)
data['AMD_Generation'] = data.apply(
    lambda row: row['CPU_Generation'] if row['CPU_Company'] == 'AMD' else None, axis=1
)
data['Samsung_Generation'] = data.apply(
    lambda row: row['CPU_Generation'] if row['CPU_Company'] == 'Samsung' else None, axis=1
)

# Display the updated dataset with separated family and generation columns by company
# tools.display_dataframe_to_user(name="Updated CPU Family and Generation Feature Extraction", dataframe=data)


In [21]:
data[['Intel_Series', 'AMD_Series', 'Samsung_Series', 'Intel_Family',
       'AMD_Family', 'Samsung_Family', 'Intel_Generation', 'AMD_Generation',
       'Samsung_Generation']].head(30)

Unnamed: 0,Intel_Series,AMD_Series,Samsung_Series,Intel_Family,AMD_Family,Samsung_Family,Intel_Generation,AMD_Generation,Samsung_Generation
0,i5,,,Core,,,7,,
1,i5,,,Core,,,7,,
2,i5,,,Core,,,7,,
3,i7,,,Core,,,7,,
4,i5,,,Core,,,7,,
5,,9420.0,,,A9,,,9,
6,i7,,,Core,,,7,,
7,i5,,,Core,,,7,,
8,i7,,,Core,,,8,,
9,i5,,,Core,,,8,,


In [22]:
len(data.columns)

33

In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1275 entries, 0 to 1274
Data columns (total 33 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Company                1275 non-null   object 
 1   TypeName               1275 non-null   object 
 2   Inches                 1275 non-null   float64
 3   ScreenResolution       1275 non-null   object 
 4   CPU_Company            1275 non-null   object 
 5   CPU_Frequency (GHz)    1275 non-null   float64
 6   RAM (GB)               1275 non-null   int64  
 7   Memory Type            0 non-null      float64
 8   Memory Capacity        0 non-null      float64
 9   GPU_Company            1275 non-null   object 
 10  GPU_Type               1275 non-null   object 
 11  OpSys                  1275 non-null   object 
 12  Weight (kg)            1275 non-null   float64
 13  Price (SAR)            1275 non-null   float64
 14  Resolution_Width       1275 non-null   int64  
 15  Reso

In [24]:
data['GPU_Type'].nunique()

106

In [25]:
data[['GPU_Company', 'GPU_Type']].head()

Unnamed: 0,GPU_Company,GPU_Type
0,Intel,Iris Plus Graphics 640
1,Intel,HD Graphics 6000
2,Intel,HD Graphics 620
3,AMD,Radeon Pro 455
4,Intel,Iris Plus Graphics 650


**GPU**

In [26]:
data['GPU_Family'] = data['GPU_Type'].apply(lambda x: x.split(' ')[0])
data['GPU_Series'] = data['GPU_Type'].apply(lambda x: x.split(' ')[-1])

In [27]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1275 entries, 0 to 1274
Data columns (total 35 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Company                1275 non-null   object 
 1   TypeName               1275 non-null   object 
 2   Inches                 1275 non-null   float64
 3   ScreenResolution       1275 non-null   object 
 4   CPU_Company            1275 non-null   object 
 5   CPU_Frequency (GHz)    1275 non-null   float64
 6   RAM (GB)               1275 non-null   int64  
 7   Memory Type            0 non-null      float64
 8   Memory Capacity        0 non-null      float64
 9   GPU_Company            1275 non-null   object 
 10  GPU_Type               1275 non-null   object 
 11  OpSys                  1275 non-null   object 
 12  Weight (kg)            1275 non-null   float64
 13  Price (SAR)            1275 non-null   float64
 14  Resolution_Width       1275 non-null   int64  
 15  Reso

In [28]:
# Assign Performance Tier
def assign_tier(series):
    if pd.isna(series):
        return 'Unknown'
    elif series <= 500:
        return 'Low-End'
    elif 500 < series <= 800:
        return 'Mid-Range'
    else:
        return 'High-End'

In [29]:
# data['Performance_Tier'] = data['GPU_Series'].apply(assign_tier)
import pandas as pd
import re  # Importing re module to use regular expressions

# Assign Performance Tier with type checking
def assign_tier(series):
    try:  # Attempt to convert series to a numeric type
        series = float(series)
    except ValueError:  # Handle cases where conversion is not possible
        return 'Unknown'

    if pd.isna(series):
        return 'Unknown'
    elif series <= 500:
        return 'Low-End'
    elif 500 < series <= 800:
        return 'Mid-Range'
    else:
        return 'High-End'

# Apply the modified assign_tier function



In [30]:
data['Performance_Tier'] = data['GPU_Series'].apply(assign_tier)

In [31]:
data.columns

Index(['Company', 'TypeName', 'Inches', 'ScreenResolution', 'CPU_Company',
       'CPU_Frequency (GHz)', 'RAM (GB)', 'Memory Type', 'Memory Capacity',
       'GPU_Company', 'GPU_Type', 'OpSys', 'Weight (kg)', 'Price (SAR)',
       'Resolution_Width', 'Resolution_Height', 'Contains_HD', 'Contains_IPS',
       'Contains_Touchscreen', 'Contains_4K', 'Contains_Quad_HD_plus',
       'CPU_Family', 'CPU_Generation', 'CPU_Series', 'Intel_Series',
       'AMD_Series', 'Samsung_Series', 'Intel_Family', 'AMD_Family',
       'Samsung_Family', 'Intel_Generation', 'AMD_Generation',
       'Samsung_Generation', 'GPU_Family', 'GPU_Series', 'Performance_Tier'],
      dtype='object')

In [32]:
data['GPU_Series']=data['GPU_Series'].str.extract('(\d+)').fillna(data['GPU_Series'])

  data['GPU_Series']=data['GPU_Series'].str.extract('(\d+)').fillna(data['GPU_Series'])


In [33]:
data.columns

Index(['Company', 'TypeName', 'Inches', 'ScreenResolution', 'CPU_Company',
       'CPU_Frequency (GHz)', 'RAM (GB)', 'Memory Type', 'Memory Capacity',
       'GPU_Company', 'GPU_Type', 'OpSys', 'Weight (kg)', 'Price (SAR)',
       'Resolution_Width', 'Resolution_Height', 'Contains_HD', 'Contains_IPS',
       'Contains_Touchscreen', 'Contains_4K', 'Contains_Quad_HD_plus',
       'CPU_Family', 'CPU_Generation', 'CPU_Series', 'Intel_Series',
       'AMD_Series', 'Samsung_Series', 'Intel_Family', 'AMD_Family',
       'Samsung_Family', 'Intel_Generation', 'AMD_Generation',
       'Samsung_Generation', 'GPU_Family', 'GPU_Series', 'Performance_Tier'],
      dtype='object')

In [34]:
data=data.drop(['CPU_Family' , 'CPU_Generation' , 'CPU_Series'] ,axis=1)

In [35]:
data.isnull().sum()

Company                     0
TypeName                    0
Inches                      0
ScreenResolution            0
CPU_Company                 0
CPU_Frequency (GHz)         0
RAM (GB)                    0
Memory Type              1275
Memory Capacity          1275
GPU_Company                 0
GPU_Type                    0
OpSys                       0
Weight (kg)                 0
Price (SAR)                 0
Resolution_Width            0
Resolution_Height           0
Contains_HD                 0
Contains_IPS                0
Contains_Touchscreen        0
Contains_4K                 0
Contains_Quad_HD_plus       0
Intel_Series               61
AMD_Series               1215
Samsung_Series           1274
Intel_Family               61
AMD_Family               1215
Samsung_Family           1274
Intel_Generation           61
AMD_Generation           1215
Samsung_Generation       1274
GPU_Family                  0
GPU_Series                  0
Performance_Tier            0
dtype: int

In [36]:
data=data.drop('ScreenResolution', axis=1)

In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1275 entries, 0 to 1274
Data columns (total 32 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Company                1275 non-null   object 
 1   TypeName               1275 non-null   object 
 2   Inches                 1275 non-null   float64
 3   CPU_Company            1275 non-null   object 
 4   CPU_Frequency (GHz)    1275 non-null   float64
 5   RAM (GB)               1275 non-null   int64  
 6   Memory Type            0 non-null      float64
 7   Memory Capacity        0 non-null      float64
 8   GPU_Company            1275 non-null   object 
 9   GPU_Type               1275 non-null   object 
 10  OpSys                  1275 non-null   object 
 11  Weight (kg)            1275 non-null   float64
 12  Price (SAR)            1275 non-null   float64
 13  Resolution_Width       1275 non-null   int64  
 14  Resolution_Height      1275 non-null   int64  
 15  Cont

**ordinal encding**


In [38]:
# ordinal=[Intel_Series, AMD_Series, Samsung_Series, Intel_Family, AMD_Family, Samsung_Family, Intel_Generation, AMD_Generation, Samsung_Generation,GPU_Family,GPU_Series ]

# other encoding=[Company,Product , TypeName , CPU_Company, Memory Type ,  GPU_Company, OpSys, ]

In [39]:
from sklearn.preprocessing import OrdinalEncoder

# List of ordinal columns
ordinal_cols = [
    'Intel_Series', 'AMD_Series', 'Samsung_Series',
    'Intel_Family', 'AMD_Family', 'Samsung_Family',
    'Intel_Generation', 'AMD_Generation', 'Samsung_Generation',
    'GPU_Family', 'GPU_Series', 'Performance_Tier'
]


# Replace NaN values with 0
# data[ordinal_cols] = data[ordinal_cols].fillna(0).astype(str)
data[ordinal_cols] = data[ordinal_cols].astype(str)
# Initialize the OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

# Apply the encoding
data[ordinal_cols] = ordinal_encoder.fit_transform(data[ordinal_cols])

In [40]:
data[ordinal_cols].head(30)

Unnamed: 0,Intel_Series,AMD_Series,Samsung_Series,Intel_Family,AMD_Family,Samsung_Family,Intel_Generation,AMD_Generation,Samsung_Generation,GPU_Family,GPU_Series,Performance_Tier
0,4.0,14.0,1.0,2.0,6.0,0.0,3.0,4.0,0.0,5.0,49.0,2.0
1,4.0,14.0,1.0,2.0,6.0,0.0,3.0,4.0,0.0,4.0,44.0,0.0
2,4.0,14.0,1.0,2.0,6.0,0.0,3.0,4.0,0.0,4.0,47.0,2.0
3,5.0,14.0,1.0,2.0,6.0,0.0,3.0,4.0,0.0,10.0,27.0,1.0
4,4.0,14.0,1.0,2.0,6.0,0.0,3.0,4.0,0.0,5.0,50.0,2.0
5,1.0,12.0,1.0,3.0,4.0,0.0,5.0,3.0,0.0,10.0,30.0,3.0
6,5.0,14.0,1.0,2.0,6.0,0.0,3.0,4.0,0.0,5.0,49.0,3.0
7,4.0,14.0,1.0,2.0,6.0,0.0,3.0,4.0,0.0,4.0,44.0,0.0
8,5.0,14.0,1.0,2.0,6.0,0.0,4.0,4.0,0.0,2.0,7.0,3.0
9,4.0,14.0,1.0,2.0,6.0,0.0,4.0,4.0,0.0,11.0,47.0,2.0


In [41]:
data[ordinal_cols].isna().sum()

Intel_Series          0
AMD_Series            0
Samsung_Series        0
Intel_Family          0
AMD_Family            0
Samsung_Family        0
Intel_Generation      0
AMD_Generation        0
Samsung_Generation    0
GPU_Family            0
GPU_Series            0
Performance_Tier      0
dtype: int64

In [42]:
# List of columns for one-hot encoding
other_encoding_cols = ['Company', 'TypeName', 'CPU_Company', 'Memory Type', 'GPU_Company', 'OpSys', 'GPU_Type']

# Apply one-hot encoding
data = pd.get_dummies(data, columns=other_encoding_cols, drop_first=True)


In [43]:
len(data.columns)

166

In [44]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1275 entries, 0 to 1274
Columns: 166 entries, Inches to GPU_Type_UHD Graphics 620
dtypes: bool(141), float64(17), int64(8)
memory usage: 424.7 KB


## Save the dataset

In [45]:
data.to_csv('dataset_post_engineering.csv', index=False)

# Model: CatBoostRegressor

In [46]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [47]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [48]:
X = data.drop(columns=["Price (SAR)"])  # Drop the target column
y = data["Price (SAR)"]  # Target column

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [49]:
pip install catboost

Note: you may need to restart the kernel to use updated packages.


In [50]:
from catboost import CatBoostRegressor

# Initialize CatBoost regressor
model = CatBoostRegressor(iterations=1300,  # Number of iterations
                          depth=8,          # Depth of the tree
                          learning_rate=0.1, # Learning rate
                          loss_function='MAE', # Loss function for regression
                          verbose=100)       # Verbose output for progress

# Fit the model
cat=model.fit(X_train, y_train, eval_set=(X_test, y_test), plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1952.3022966	test: 1927.1090130	best: 1927.1090130 (0)	total: 49.2ms	remaining: 1m 3s
100:	learn: 473.8187804	test: 727.9757943	best: 727.9757943 (100)	total: 124ms	remaining: 1.47s
200:	learn: 344.4423078	test: 704.4076391	best: 702.8969957 (169)	total: 201ms	remaining: 1.1s
300:	learn: 274.6924257	test: 688.4052134	best: 687.9421851 (298)	total: 278ms	remaining: 923ms
400:	learn: 241.1322541	test: 679.4955966	best: 678.7779940 (389)	total: 355ms	remaining: 796ms
500:	learn: 218.0165495	test: 672.8089007	best: 672.8015013 (498)	total: 431ms	remaining: 687ms
600:	learn: 198.0453793	test: 670.6177250	best: 669.9517839 (592)	total: 508ms	remaining: 591ms
700:	learn: 181.7830711	test: 672.6182485	best: 668.4302320 (634)	total: 586ms	remaining: 501ms
800:	learn: 172.0156072	test: 671.3515253	best: 668.4302320 (634)	total: 663ms	remaining: 413ms
900:	learn: 163.6964546	test: 672.5540189	best: 668.4302320 (634)	total: 744ms	remaining: 329ms
1000:	learn: 157.0536435	test: 672.188948

In [51]:
model = CatBoostRegressor(
    iterations=1200,
    depth=6,
    learning_rate=0.1,
    loss_function='MAE',
    verbose=100,
    random_seed=42  # Set a fixed random seed
)


In [52]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict on test set
y_pred_train = cat.predict(X_train)
y_pred_test = cat.predict(X_test)

# Calculate metrics
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)

train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

print(f"Training MAE: {train_mae}, R²: {train_r2}")
print(f"Test MAE: {test_mae}, R²: {test_r2}")

Training MAE: 193.0344595331811, R²: 0.9785730361500415
Test MAE: 668.4302325326655, R²: 0.8811480279015217


Save the model 

In [58]:
import joblib

# Save the trained model
joblib.dump(cat, '../Models/cat_model-v4.pkl')

['../Models/cat_model-v4.pkl']

Save the ordinal encoder 

In [59]:
joblib.dump(ordinal_encoder, '../Models/ordinal_encoder.pkl')

['../Models/ordinal_encoder.pkl']

Save the data and the test data 

In [60]:
X_test.to_csv('../Data/X_test_v4.csv', index=False)

In [61]:
training_columns = X_train.columns.tolist()
joblib.dump(training_columns, '../Models/training_columns.pkl')

['../Models/training_columns.pkl']

In [62]:
data.to_csv('../Data/Dataset_encoding_v4.csv', index=False)