# Laptop price prediction
Source: https://platform.stratascratch.com/data-projects/laptop-price-prediction

## (1) Imports and setup

In [4]:
import pandas as pd
import numpy as np
import json

In [168]:
# Read the train / validation / test splits, in that order, from their JSON files.
train_dt, val_dt, test_dt = map(
    pd.read_json,
    (
        "./datasets/train_dataset.json",
        "./datasets/val_dataset.json",
        "./datasets/test_dataset.json",
    ),
)

In [6]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
train_dt.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4711 entries, 7233 to 6037
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   graphic card type       4417 non-null   object 
 1   communications          4261 non-null   object 
 2   resolution (px)         4361 non-null   object 
 3   CPU cores               4711 non-null   object 
 4   RAM size                4457 non-null   object 
 5   operating system        4335 non-null   object 
 6   drive type              4454 non-null   object 
 7   input devices           4321 non-null   object 
 8   multimedia              4310 non-null   object 
 9   RAM type                4212 non-null   object 
 10  CPU clock speed (GHz)   4181 non-null   float64
 11  CPU model               4389 non-null   object 
 12  state                   4711 non-null   object 
 13  drive memory size (GB)  4439 non-null   float64
 14  warranty                4711 non-null   ob

In [7]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
val_dt.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1571 entries, 3849 to 4277
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   graphic card type       1471 non-null   object 
 1   communications          1409 non-null   object 
 2   resolution (px)         1442 non-null   object 
 3   CPU cores               1571 non-null   object 
 4   RAM size                1478 non-null   object 
 5   operating system        1437 non-null   object 
 6   drive type              1485 non-null   object 
 7   input devices           1426 non-null   object 
 8   multimedia              1420 non-null   object 
 9   RAM type                1394 non-null   object 
 10  CPU clock speed (GHz)   1375 non-null   float64
 11  CPU model               1467 non-null   object 
 12  state                   1571 non-null   object 
 13  drive memory size (GB)  1467 non-null   float64
 14  warranty                1571 non-null   ob

In [8]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
test_dt.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1571 entries, 5124 to 1371
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   graphic card type       1469 non-null   object 
 1   communications          1401 non-null   object 
 2   resolution (px)         1442 non-null   object 
 3   CPU cores               1571 non-null   object 
 4   RAM size                1468 non-null   object 
 5   operating system        1431 non-null   object 
 6   drive type              1479 non-null   object 
 7   input devices           1428 non-null   object 
 8   multimedia              1415 non-null   object 
 9   RAM type                1383 non-null   object 
 10  CPU clock speed (GHz)   1361 non-null   float64
 11  CPU model               1464 non-null   object 
 12  state                   1571 non-null   object 
 13  drive memory size (GB)  1466 non-null   float64
 14  warranty                1571 non-null   ob

## (2) Data manipulation

In [9]:
def explode_and_pivot(df0, varname, fill_value=None):
    """Add one indicator column per distinct item found in a list-valued column.

    Each row's list in ``df0[varname]`` is exploded into one row per item, the
    items are pivoted into columns (one column per distinct item, value 1.0
    where the row contains the item), and those columns are left-joined back
    onto ``df0`` by index. ``df0`` itself is not modified and the original
    ``varname`` column is kept.

    Parameters
    ----------
    df0 : pandas.DataFrame
        Input frame. Its index must be unique: the join is validated as
        one-to-one and pandas raises ``MergeError`` otherwise.
    varname : str
        Name of the column whose values are lists of items.
    fill_value : scalar, optional
        Value written where a row has at least one item but not the item of a
        given column (e.g. ``0`` for a 0/1 indicator). With the default
        ``None`` those cells stay NaN, which is the historical behaviour.

    Returns
    -------
    pandas.DataFrame
        ``df0`` plus the indicator columns. Rows whose ``varname`` value is
        missing do not appear in the pivot, so all of their indicator columns
        are NaN regardless of ``fill_value``.

    Notes
    -----
    If an item has the same name as an existing column of ``df0``, the merge
    disambiguates both with its default ``_x`` / ``_y`` suffixes.
    """
    # Only the exploded column and the row identifier are needed for the
    # pivot, so select just that column instead of copying the whole frame.
    long_form = (
        df0[[varname]]
        .assign(id=df0.index)
        .explode(varname)
        .assign(flag=1)
    )
    # Default aggregation (mean) over the constant flag gives 1.0 per (row, item).
    indicators = long_form.pivot_table(index="id", columns=varname, values="flag")
    if fill_value is not None:
        # Filling before the join keeps rows with a missing list as NaN.
        indicators = indicators.fillna(fill_value)
    return df0.merge(
        indicators, how="left", left_index=True, right_index=True, validate="1:1"
    )

In [97]:
# Expand each list-valued column of the training split into indicator columns.
for list_col in ("input devices", "multimedia", "communications", "operating system"):
    train_dt = explode_and_pivot(train_dt, list_col)


In [98]:
# Expand each list-valued column of the validation split into indicator columns.
for list_col in ("input devices", "multimedia", "communications", "operating system"):
    val_dt = explode_and_pivot(val_dt, list_col)

In [99]:
# Expand each list-valued column of the test split into indicator columns.
for list_col in ("input devices", "multimedia", "communications", "operating system"):
    test_dt = explode_and_pivot(test_dt, list_col)

In [169]:
def _leading_number(column, sep):
    """Parse the text before the first ``sep`` in each value as a number.

    Values that cannot be parsed become NaN. A column that is already numeric
    has no ``.str`` accessor, so it is coerced directly; this keeps the
    cleaning step safe to apply to an already-cleaned frame.
    """
    if pd.api.types.is_numeric_dtype(column):
        return pd.to_numeric(column, errors="coerce")
    return pd.to_numeric(column.str.split(sep).str[0], errors="coerce")


def clean_df(df):
    """Return a cleaned copy of a laptop dataset with numeric feature columns.

    The input frame is not modified. The following columns are rewritten:

    - ``graphic card type``: missing values become the string ``"None"``.
    - ``resolution (px)``: the part before the first ``"x"`` (the first
      dimension, e.g. 1920 from ``"1920 x 1080"``) as a number.
    - ``CPU cores``: converted to a number.
    - ``RAM size``: the part before the first space (e.g. 8 from ``"8 gb"``).
    - ``screen size``: the part before the first ``"`` (e.g. 15 from
      ``15" - 15.9"``).

    Anything that cannot be parsed as a number (e.g. ``"other"`` or
    ``"not applicable"``) becomes NaN. All other columns, including the raw
    ``input devices`` / ``multimedia`` / ``communications`` /
    ``operating system`` columns, are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned columns.
    """
    # Work on a copy so the caller's frame (and any earlier cell output that
    # displayed it) is not silently changed.
    df = df.copy()
    df["graphic card type"] = df["graphic card type"].fillna("None")
    df["resolution (px)"] = _leading_number(df["resolution (px)"], "x")
    df["CPU cores"] = pd.to_numeric(df["CPU cores"], errors="coerce")
    df["RAM size"] = _leading_number(df["RAM size"], " ")
    df["screen size"] = _leading_number(df["screen size"], '"')
    return df


# Clean the three splits in the same order: train, validation, test.
train_dt, val_dt, test_dt = map(clean_df, (train_dt, val_dt, test_dt))

In [170]:
# Frequency of each parsed screen size in the training split (clean_df keeps
# the number before the first '"' of each size label).
train_dt["screen size"].value_counts()

screen size
15.0    3177
17.0     540
14.0     534
13.0     113
11.9      94
12.0      56
Name: count, dtype: int64

## (3) EDA

In [128]:
# Frequency of each value in the resolution column of the training split.
# NOTE(review): the saved output still lists "other", which clean_df's
# errors="coerce" would turn into NaN - the output appears to predate the
# current cleaning code; re-run on a fresh kernel to confirm.
train_dt["resolution (px)"].value_counts()

resolution (px)
1920     2785
1366     1234
1600      221
3840       84
2560       18
1280        7
3200        6
2880        3
2160        2
other       1
Name: count, dtype: int64

In [103]:
# Split the training columns by dtype: numeric columns vs. object columns.
numeric_part = train_dt.select_dtypes(include=[np.number])
object_part = train_dt.select_dtypes(include=[object])
numvars = list(numeric_part.columns)
catvars = list(object_part.columns)
print("Numerical variables: ", numvars)
print("Categorical variables: ", catvars)

Numerical variables:  ['CPU clock speed (GHz)', 'drive memory size (GB)', 'buynow_price', 'illuminated keyboard', 'keyboard', 'numeric keyboard', 'touchpad', 'SD card reader', 'camera', 'microphone', 'speakers', 'bluetooth', 'gps', 'intel wireless display (widi)', 'lan 10/100 mbps', 'lan 10/100/1000 mbps', 'modem 3g (wwan)', 'modem 4g (lte)', 'nfc (near field communication)', 'wi-fi', 'wi-fi 802.11 a/b/g/n', 'wi-fi 802.11 a/b/g/n/ac', 'wi-fi 802.11 b/g/n', 'wi-fi 802.11 b/g/n/ac', 'linux', 'no system', 'other', 'windows 10 home', 'windows 10 professional', 'windows 7 professional 32-bit', 'windows 7 professional 64-bit', 'windows 8.1 home 32-bit', 'windows 8.1 home 64-bit', 'windows 8.1 professional 32-bit', 'windows 8.1 professional 64-bit']
Categorical variables:  ['graphic card type', 'communications', 'resolution (px)', 'CPU cores', 'RAM size', 'operating system', 'drive type', 'input devices', 'multimedia', 'RAM type', 'CPU model', 'state', 'warranty', 'screen size']


Check for missing values

In [67]:
# Per-column number and share of missing values in the training split.
missing_mask = train_dt.isna()
pd.concat(
    [missing_mask.sum(), missing_mask.mean()],
    axis=1,
    keys=["Count", "Proportion"],
)

Unnamed: 0,Count,Proportion
graphic card type,294,0.062407
communications,450,0.095521
resolution (px),350,0.074294
CPU cores,0,0.0
RAM size,254,0.053916
operating system,376,0.079813
drive type,257,0.054553
input devices,390,0.082785
multimedia,401,0.08512
RAM type,499,0.105922


Univariate Analysis

In [112]:
# Summary statistics (with the 95th percentile) for the numeric columns,
# printed in three batches: the first 12, the next 12, and the remainder.
for batch in (slice(None, 12), slice(12, 24), slice(24, None)):
    print(train_dt[numvars[batch]].describe(percentiles=[0.95]))

       CPU clock speed (GHz)  drive memory size (GB)  buynow_price  \
count            4181.000000             4439.000000   4711.000000   
mean                2.342057              652.619284   3495.831195   
std                 0.386298              467.657354   1727.933306   
min                 0.800000                0.000000    429.000000   
50%                 2.500000              500.000000   3184.000000   
95%                 2.800000             1500.000000   6779.000000   
max                 3.900000             2960.000000  15472.650000   

       illuminated keyboard  keyboard  numeric keyboard  touchpad  \
count                1597.0    4084.0            2086.0    4305.0   
mean                    1.0       1.0               1.0       1.0   
std                     0.0       0.0               0.0       0.0   
min                     1.0       1.0               1.0       1.0   
50%                     1.0       1.0               1.0       1.0   
95%                     1

Unnamed: 0,gps,intel wireless display (widi),lan 10/100 mbps,lan 10/100/1000 mbps,modem 3g (wwan),modem 4g (lte),nfc (near field communication),wi-fi,wi-fi 802.11 a/b/g/n,wi-fi 802.11 a/b/g/n/ac,wi-fi 802.11 b/g/n,wi-fi 802.11 b/g/n/ac
count,166.0,595.0,1657.0,2399.0,598.0,166.0,875.0,1760.0,12.0,79.0,44.0,103.0
mean,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
95%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [77]:
# Display the list of object-dtype (categorical) column names collected earlier.
catvars

['graphic card type',
 'communications',
 'resolution (px)',
 'CPU cores',
 'RAM size',
 'operating system',
 'drive type',
 'input devices',
 'multimedia',
 'RAM type',
 'CPU model',
 'state',
 'warranty',
 'screen size']

In [92]:
# For every categorical column, print its name, the number of distinct
# non-null values, and the array of distinct values.
for i in catvars:
    column = train_dt[i]
    n_distinct = column.nunique()
    distinct_values = column.unique()
    print(f"{i}: {n_distinct} {distinct_values}")

graphic card type: 2 ['dedicated graphics' None 'integrated graphics']
resolution (px): 12 ['1920 x 1080' '1366 x 768' None '2560 x 1440' '1600 x 900' '3840 x 2160'
 'other' '1920 x 1280' '1280 x 800' '3200 x 1800' '2880 x 1620'
 '2160 x 1440' '1920 x 1200']
CPU cores: 6 ['4' '2' 'not applicable' '3' '1' '8']
RAM size: 9 ['32 gb' '8 gb' None '12 gb' '4 gb' '16 gb' '2 gb' '20 gb' '6 gb' '64 gb']
drive type: 5 ['ssd + hdd' 'ssd' 'hdd' None 'emmc' 'hybrid']
RAM type: 3 ['ddr4' 'ddr3' None 'ddr3l']
CPU model: 18 ['intel core i7' None 'other CPU' 'intel core i3' 'intel core i5'
 'intel celeron dual-core' 'intel pentium 4' 'intel celeron quad core '
 'amd a6' 'intel celeron ' 'intel pentium dual-core' 'amd a12' 'amd a8'
 'intel pentium quad-core ' 'intel core m' 'amd a4' 'intel celeron m'
 'amd a10' 'amd e1']
state: 1 ['new']
warranty: 3 ['producer warranty' 'seller warranty' 'no warranty']
screen size: 6 ['17" - 17.9"' '15" - 15.9"' None '12" - 12.9"' '14" - 14.9"'
 '13" - 13.9"' '11.9" and