# MODEL DEVELOPMENT

## Data Dictionary

source: https://www.kaggle.com/datasets/sushmita36/laptops-dataset

|    Field Name     | Data Type |                                          Description                                           |               Example               |
| :---------------: | :-------: | :--------------------------------------------------------------------------------------------: | :---------------------------------: |
|    model_name     |   Text    |                             The name or model number of the laptop                             | Lenovo V15 ITL G2 82KBA033IH Laptop |
|       brand       |   Text    |                                    The brand of the laptop                                     |               Lenovo                |
|     cpu_name      |   Text    |                              The name or model number of the CPU                               |          11th Gen Core i3           |
|        ram        |  Integer  |                                The amount of RAM (in gigabytes)                                |                  8                  |
|        ssd        |  Integer  |                        The size of the solid-state drive (in gigabytes)                        |                 512                 |
|        hdd        |  Integer  |                         The size of the hard disk drive (in gigabytes)                         |                1000                 |
|        os         |   Text    |                          The operating system installed on the laptop                          |               Windows               |
|     gpu_name      |  Integer  |                              The name or model number of the GPU                               |        Intel Integrated UHD         |
|    screen_size    |   Float   |                           The size of the laptop screen (in inches)                            |                15.6                 |
| screen_resolution |   Text    |                              The resolution of the laptop screen                               |             1920 x 1080             |
|    no_of_cores    |  Integer  |                                    The number of CPU cores                                     |                  2                  |
|   no_of_threads   |  Integer  |                                   The number of CPU threads                                    |                  4                  |
|    spec_score     |  Integer  | A numerical score representing the overall specifications from referenced website (e-commerce) |                 62                  |
|       price       |  Integer  |                                The price of the laptop (in INP)                                |                39900                |


## Initialization

In [25]:
import pandas as pd

pathname = 'datasets/laptop.csv'
df = pd.read_csv(pathname).drop(columns=["index"])
df = df.dropna()

# Covert price form INR to USD (optional)
# df['price'] = round(df['price'] * 0.012)

df

Unnamed: 0,model_name,brand,cpu_name,ram,ssd,hdd,os,gpu_name,screen_size,screen_resolution,no_of_cores,no_of_threads,spec_score,price
0,Lenovo V15 ITL G2 82KBA033IH Laptop,Lenovo,11th Gen Core i3,8,512,0,Windows,Intel Integrated UHD,15.6,1920 x 1080,2,4,62,33921
1,HP Pavilion 15-ec2004AX Gaming Laptop,HP,AMD Ryzen 5 5600H,8,512,0,Windows,4 GB NVIDIA GeForce GTX 1650,15.6,1920 x 1080,6,12,66,56150
2,Lenovo V15 82KBA03HIH Laptop,Lenovo,11th Gen Core i3,8,256,1000,Windows,Intel Integrated UHD,15.6,1920 x 1080,2,4,66,35499
3,Asus Vivobook 16X 2022 M1603QA-MB502WS Laptop,Asus,Ryzen 5-5600H,8,512,0,Windows,AMD Radeon Vega 7,16.0,1200 x 1920,6,12,66,48990
4,HP 15s-fr4000TU Laptop,HP,11th Gen Core i5,8,512,0,Windows,Intel Integrated Iris Xe,15.6,1920 x 1080,4,8,63,52990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1009,Fujitsu UH-X 4ZR1F38024 Laptop,Fujitsu,11th Gen Core i7,16,512,0,Windows,Intel Iris Xe Graphics,13.3,1920 x 1080,4,8,63,84990
1010,Asus BR1100CKA-GJ0722W Laptop,Asus,Celeron N4500,4,128,0,Windows,Intel UHD Graphics,11.6,1366 x 768,2,0,45,22990
1011,Lenovo Yoga 6 13ALC6 82ND00DPIN Laptop,Lenovo,Ryzen 7 5700U,16,1000,0,Windows,AMD Radeon AMD,13.3,1920 x 1080,8,16,68,92999
1012,Asus BR1100FKA-BP1104W Laptop,Asus,Celeron N4500,4,128,0,Windows,Intel Integrated UHD Graphics,11.6,1366 x 768,2,0,49,29990


## Preprocessing

In [4]:
cpus = []

def extract_cpu_brand(model_name):
    cpu_brand = "unknown"

    if "i3" in model_name or "i5" in model_name or "i7" in model_name or "i9" in model_name or "Celeron" in model_name or "Silver" in model_name or "Pentium" in model_name:
        cpu_brand = "Intel"
    elif "AMD" in model_name or "Ryzen" in model_name or "Athlon" in model_name:
        cpu_brand = "AMD"
    elif "Snapdragon" in model_name:
        cpu_brand = "Snapdragon"
    elif "Apple" in model_name or "M1" in model_name or "M2" in model_name:
        cpu_brand = "Apple"

    return cpu_brand

for index, row in df.iterrows():
    model_name = row["cpu_name"]

    cpu = {
        "cpu_brand_id": len(cpus) + 1,
        "cpu_brand": extract_cpu_brand(model_name),
    }

    if not any(d['cpu_brand'] == cpu['cpu_brand'] for d in cpus):
        cpus.append(cpu)

cpu_df = pd.DataFrame(cpus)
cpu_df

Unnamed: 0,cpu_brand_id,cpu_brand
0,1,Intel
1,2,AMD
2,3,Apple
3,4,unknown
4,5,Snapdragon


In [5]:
gpus = [
    {
        "gpu_brand_id": 1,
        "gpu_brand": "None",
    }
]

def extract_gpu_brand(model_name):
    gpu_brand = "Unknown"

    if "Intel" in model_name or "UHD" in model_name or "Iris" in model_name:
        gpu_brand = "Intel"
    elif "AMD" in model_name or "Radeon" in model_name:
        gpu_brand = "AMD"
    elif "Snapdragon" in model_name:
        gpu_brand = "Snapdragon"
    elif "Apple" in row["cpu_name"] or "M1" in row["cpu_name"] or "M2" in row["cpu_name"]:
        gpu_brand = "Apple"
    elif "NVIDIA" in model_name or "Nvidia" in model_name or "GeForce" in model_name:
        gpu_brand = "NVIDIA"
    elif "Qualcomm" in model_name:
        gpu_brand = "Qualcomm"

    return gpu_brand

for index, row in df.iterrows():
    model_name = row["gpu_name"]

    gpu = {
        "gpu_brand_id": len(gpus) + 1,
        "gpu_brand": extract_gpu_brand(model_name),
    }

    if not any(d['gpu_brand'] == gpu['gpu_brand'] for d in gpus):
        gpus.append(gpu)

gpu_df = pd.DataFrame(gpus)
gpu_df

Unnamed: 0,gpu_brand_id,gpu_brand
0,1,
1,2,Intel
2,3,NVIDIA
3,4,AMD
4,5,Unknown
5,6,Apple
6,7,Qualcomm


In [6]:
brand_list = []
os_list = []
size_list = []
reso_list = []

for index, row in df.iterrows():
    if not any(row['brand'] == d['brand'] for d in brand_list):
        brand_list.append({
            "brand_id": len(brand_list) + 1,
            "brand": row['brand'],
        })

    if not any(row['os'] == d['os'] for d in os_list):
        os_list.append({
            "os_id": len(os_list) + 1,
            "os": row['os'],
        })

    if not any(row['screen_size'] == d['size'] for d in size_list):
        size_list.append({
            "size_id": len(size_list) + 1,
            "size": row['screen_size'],
        })

    if not any(row['screen_resolution'] == d['reso'] for d in reso_list):
        reso_list.append({
            "reso_id": len(reso_list) + 1,
            "reso": row['screen_resolution'],
        })

brand_df = pd.DataFrame(brand_list)
os_df = pd.DataFrame(os_list)
size_df = pd.DataFrame(size_list)
reso_df = pd.DataFrame(reso_list)

In [7]:
brand_df.head()

Unnamed: 0,brand_id,brand
0,1,Lenovo
1,2,HP
2,3,Asus
3,4,Infinix
4,5,Dell


In [8]:
os_df.head()

Unnamed: 0,os_id,os
0,1,Windows
1,2,Mac
2,3,DOS
3,4,Chrome
4,5,Ubuntu


In [9]:
size_df.head()

Unnamed: 0,size_id,size
0,1,15.6
1,2,16.0
2,3,15.56
3,4,13.3
4,5,14.0


In [10]:
reso_df.head()

Unnamed: 0,reso_id,reso
0,1,1920 x 1080
1,2,1200 x 1920
2,3,1366 x 768
3,4,2560 x 1600
4,5,1440 x 2160


In [11]:
df_processed = df.copy()

df_processed['cpu_brand'] = df['cpu_name'].apply(extract_cpu_brand)
df_processed['gpu_brand'] = df['gpu_name'].apply(extract_gpu_brand)

df_processed

Unnamed: 0,item_id,model_name,brand,cpu_name,ram,ssd,hdd,os,gpu_name,screen_size,screen_resolution,no_of_cores,no_of_threads,spec_score,price,cpu_brand,gpu_brand
0,1,Lenovo V15 ITL G2 82KBA033IH Laptop,Lenovo,11th Gen Core i3,8,512,0,Windows,Intel Integrated UHD,15.6,1920 x 1080,2,4,62,33921,Intel,Intel
1,2,HP Pavilion 15-ec2004AX Gaming Laptop,HP,AMD Ryzen 5 5600H,8,512,0,Windows,4 GB NVIDIA GeForce GTX 1650,15.6,1920 x 1080,6,12,66,56150,AMD,NVIDIA
2,3,Lenovo V15 82KBA03HIH Laptop,Lenovo,11th Gen Core i3,8,256,1000,Windows,Intel Integrated UHD,15.6,1920 x 1080,2,4,66,35499,Intel,Intel
3,4,Asus Vivobook 16X 2022 M1603QA-MB502WS Laptop,Asus,Ryzen 5-5600H,8,512,0,Windows,AMD Radeon Vega 7,16.0,1200 x 1920,6,12,66,48990,AMD,AMD
4,5,HP 15s-fr4000TU Laptop,HP,11th Gen Core i5,8,512,0,Windows,Intel Integrated Iris Xe,15.6,1920 x 1080,4,8,63,52990,Intel,Intel
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1009,1010,Fujitsu UH-X 4ZR1F38024 Laptop,Fujitsu,11th Gen Core i7,16,512,0,Windows,Intel Iris Xe Graphics,13.3,1920 x 1080,4,8,63,84990,Intel,Intel
1010,1011,Asus BR1100CKA-GJ0722W Laptop,Asus,Celeron N4500,4,128,0,Windows,Intel UHD Graphics,11.6,1366 x 768,2,0,45,22990,Intel,Intel
1011,1012,Lenovo Yoga 6 13ALC6 82ND00DPIN Laptop,Lenovo,Ryzen 7 5700U,16,1000,0,Windows,AMD Radeon AMD,13.3,1920 x 1080,8,16,68,92999,AMD,AMD
1012,1013,Asus BR1100FKA-BP1104W Laptop,Asus,Celeron N4500,4,128,0,Windows,Intel Integrated UHD Graphics,11.6,1366 x 768,2,0,49,29990,Intel,Intel


In [12]:
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1011 entries, 0 to 1013
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   item_id            1011 non-null   int64  
 1   model_name         1011 non-null   object 
 2   brand              1011 non-null   object 
 3   cpu_name           1011 non-null   object 
 4   ram                1011 non-null   int64  
 5   ssd                1011 non-null   int64  
 6   hdd                1011 non-null   int64  
 7   os                 1011 non-null   object 
 8   gpu_name           1011 non-null   object 
 9   screen_size        1011 non-null   float64
 10  screen_resolution  1011 non-null   object 
 11  no_of_cores        1011 non-null   int64  
 12  no_of_threads      1011 non-null   int64  
 13  spec_score         1011 non-null   int64  
 14  price              1011 non-null   int64  
 15  cpu_brand          1011 non-null   object 
 16  gpu_brand          1011 non-n

In [13]:
feature_names = ["brand", "ram", "ssd", "hdd", "no_of_cores", "no_of_threads", "cpu_brand", "gpu_brand", "screen_size", "screen_resolution", "os"]
target = "price"

In [14]:
# Create a dictionary to map original values to their corresponding IDs
gpu_id_map = dict(zip(gpu_df['gpu_brand'], gpu_df['gpu_brand_id']))
brand_id_map = dict(zip(brand_df['brand'], brand_df['brand_id']))
cpu_id_map = dict(zip(cpu_df['cpu_brand'], cpu_df['cpu_brand_id']))
os_id_map = dict(zip(os_df['os'], os_df['os_id']))

reso_id_map = dict(zip(reso_df['reso'], reso_df['reso_id']))
size_id_map = dict(zip(size_df['size'], size_df['size_id']))

df_exp = df_processed.copy()

# Replace the original column with the IDs
df_exp['gpu_brand'] = df_exp['gpu_brand'].map(gpu_id_map)
df_exp['brand'] = df_exp['brand'].map(brand_id_map)
df_exp['cpu_brand'] = df_exp['cpu_brand'].map(cpu_id_map)
df_exp['os'] = df_exp['os'].map(os_id_map)

df_exp['screen_resolution'] = df_exp['screen_resolution'].map(reso_id_map)
df_exp['screen_size'] = df_exp['screen_size'].map(size_id_map)

df_exp = df_exp[feature_names]
df_target = df_processed['price']

print(f"{df_exp.head()}\n")
print(f"{df_target.head()}\n")

   brand  ram  ssd   hdd  no_of_cores  no_of_threads  cpu_brand  gpu_brand  \
0      1    8  512     0            2              4          1          2   
1      2    8  512     0            6             12          2          3   
2      1    8  256  1000            2              4          1          2   
3      3    8  512     0            6             12          2          4   
4      2    8  512     0            4              8          1          2   

   screen_size  screen_resolution  os  
0            1                  1   1  
1            1                  1   1  
2            1                  1   1  
3            2                  2   1  
4            1                  1   1  

0    33921
1    56150
2    35499
3    48990
4    52990
Name: price, dtype: int64



In [15]:
feature_set_a = ["brand", "ram", "ssd", "hdd", "no_of_cores", "no_of_threads", "cpu_brand", "gpu_brand", "screen_size", "screen_resolution", "os"]
feature_set_b = ["brand", "ram", "ssd", "hdd", "no_of_cores", "cpu_brand", "gpu_brand", "screen_size", "screen_resolution"]

## Model

In [16]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score

# Load the data
X = df_exp
y = df_target

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models and hyperparameters for tuning
models = [
    {
        'name': 'Linear Regression',
        'model': LinearRegression(),
        'params': {}
    },
    {
        'name': 'Random Forest',
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 5, 10],
            'min_samples_split': [2, 5, 10]
        }
    },
    {
        'name': 'Gradient Boosting',
        'model': GradientBoostingRegressor(),
        'params': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.05, 0.1, 0.2],
            'max_depth': [3, 4, 5]
        }
    }
]

# Perform grid search and evaluation for each model
results = []

for model_info in models:
    print(f"Training {model_info['name']}...")
    model = model_info['model']
    param_grid = model_info['params']
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    
    mape = mean_absolute_percentage_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Save the best model
    model_name = model_info['name'].replace(' ', '_').lower() + '.pkl'
    joblib.dump(best_model, f"models/{model_name}")
    
    results.append({
        'model': model_info['name'],
        'best_params': grid_search.best_params_,
        'mae': mae,
        'mse': mse,
        'mape': mape,
        'r2': r2,
        'model_file': model_name  # Include the filename of the saved model
    })

# Display results
print("\nResults:")
for result in results:
    print(f"Model: {result['model']}")
    print(f"Best Parameters: {result['best_params']}")
    print(f"Mean Absolute Error: {result['mae']}")
    print(f"Mean Squared Error: {result['mse']}")
    print(f"Mean Absolute Percentage Error: {result['mape']}")
    print(f"R-squared: {result['r2']}")
    print(f"Saved Model: {result['model_file']}")
    print()


Training Linear Regression...
Training Random Forest...
Training Gradient Boosting...

Results:
Model: Linear Regression
Best Parameters: {}
Mean Absolute Error: 19559.329058843607
Mean Squared Error: 679418084.0147741
Mean Absolute Percentage Error: 0.25161952764962264
R-squared: 0.7607080543541955
Saved Model: linear_regression.pkl

Model: Random Forest
Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
Mean Absolute Error: 14962.726861899515
Mean Squared Error: 521427060.65412897
Mean Absolute Percentage Error: 0.16868862292464679
R-squared: 0.8163527012425148
Saved Model: random_forest.pkl

Model: Gradient Boosting
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}
Mean Absolute Error: 14985.440505090335
Mean Squared Error: 511603115.29240274
Mean Absolute Percentage Error: 0.17138142995043795
R-squared: 0.8198127077610848
Saved Model: gradient_boosting.pkl



### Feature Set Comparison

#### feature_set_0

##### Model: Linear Regression
- Best Parameters: {}
- Mean Absolute Error: 20935.84058672177
- Mean Squared Error: 772414613.880194
- Mean Absolute Percentage Error: 0.2708126182449002
- R-squared: 0.7279545538316504
- Saved Model: linear_regression.pkl

##### Model: Random Forest
- Best Parameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 100}
- Mean Absolute Error: 15882.951234359563
- Mean Squared Error: 537539648.0166775
- Mean Absolute Percentage Error: 0.17486507258144607
- R-squared: 0.8106778267137285
- Saved Model: random_forest.pkl

##### Model: Gradient Boosting
- Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
- Mean Absolute Error: 15724.002185247467
- Mean Squared Error: 554644534.361942
- Mean Absolute Percentage Error: 0.1663760865413565
- R-squared: 0.8046534631739442
- Saved Model: gradient_boosting.pkl

#### feature_set_a

##### Model: Linear Regression
- Best Parameters: {}
- Mean Absolute Error: 19559.329058843607
- Mean Squared Error: 679418084.0147741
- Mean Absolute Percentage Error: 0.25161952764962264
- R-squared: 0.7607080543541955
- Saved Model: linear_regression.pkl

##### Model: Random Forest
- Best Parameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 300}
- Mean Absolute Error: 14748.402459093777
- Mean Squared Error: 526816136.0826891
- Mean Absolute Percentage Error: 0.16517094376821118
- R-squared: 0.8144546617659792
- Saved Model: random_forest.pkl

##### Model: Gradient Boosting
- Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}
- Mean Absolute Error: 14966.710749284093
- Mean Squared Error: 512410988.88671696
- Mean Absolute Percentage Error: 0.17104070987171976
- R-squared: 0.8195281736152215
- Saved Model: gradient_boosting.pkl

#### feature_set_b

##### Model: Linear Regression
- Best Parameters: {}
- Mean Absolute Error: 19141.844169972268
- Mean Squared Error: 694658127.306757
- Mean Absolute Percentage Error: 0.24212697977341346
- R-squared: 0.7553404909983377
- Saved Model: linear_regression.pkl

##### Model: Random Forest
- Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}
- Mean Absolute Error: 15683.921496638064
- Mean Squared Error: 563865909.9158401
- Mean Absolute Percentage Error: 0.18376758424628656
- R-squared: 0.8014056825367497
- Saved Model: random_forest.pkl

##### Model: Gradient Boosting
- Best Parameters: {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 100}
- Mean Absolute Error: 15348.746939352897
- Mean Squared Error: 526346752.6869782
- Mean Absolute Percentage Error: 0.1791266346601128
- R-squared: 0.8146199792172749
- Saved Model: gradient_boosting.pkl

## Serialized Model Testing

In [4]:
import joblib
import pandas as pd

# Load the saved model from the pickle file
loaded_rf_model = joblib.load('models/random_forest.pkl')
loaded_gb_model = joblib.load('models/gradient_boosting.pkl')

# Using the loaded model for prediction
X_new = pd.DataFrame([[1, 8, 512, 0, 2, 4, 1, 2, 1, 1, 1]], columns=X.columns)
y_pred_rf = loaded_rf_model.predict(X_new)
y_pred_gb = loaded_gb_model.predict(X_new)

print("RF:", y_pred_rf)
print("GB:", y_pred_gb)

RF: [37864.76114345]
GB: [37897.50055898]


In [36]:
import json

# Combine all dictionaries into one
combined_dict = {
    'brand': brand_df.set_index('brand_id').to_dict(orient='index'),
    'resolution': reso_df.set_index('reso_id').to_dict(orient='index'),
    'size': size_df.set_index('size_id').to_dict(orient='index'),
    'gpu_brand': gpu_df.set_index('gpu_brand_id').to_dict(orient='index'),
    'cpu_brand': cpu_df.set_index('cpu_brand_id').to_dict(orient='index'),
    'os': cpu_df.set_index('os_id').to_dict(orient='index')
}

transformed_dict = {'laptop_brand': {key: value['brand'] for key, value in combined_dict['brand'].items()},
                    'gpu_brand': {key: value['gpu_brand'] for key, value in combined_dict['gpu_brand'].items()},
                    'cpu_brand': {key: value['cpu_brand'] for key, value in combined_dict['cpu_brand'].items()},
                    'resolution': {key: value['reso'] for key, value in combined_dict['resolution'].items()},
                    'size': {key: value['size'] for key, value in combined_dict['size'].items()},
                    'os': {key: value['os'] for key, value in combined_dict['os'].items()}}

file_path = 'laptop_options.json'
with open(file_path, 'w') as json_file:
    json.dump(transformed_dict, json_file)

print("JSON file saved successfully.")

JSON file saved successfully.
