# Price prediction for LapTops

<b> Our goals: </b>

      Clear data
      Transform to numerical data
      Visualize dependings
      Choose best features for prediction

<b> Import required modules </b>

In [36]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import scipy as sc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [37]:
data = pd.read_csv("items.csv")

In [38]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3826 entries, 0 to 3825
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Num of kernels  3822 non-null   object
 1   Base frequency  3687 non-null   object
 2   RAM             3826 non-null   object
 3   SSD             3822 non-null   object
 4   Card            3826 non-null   object
 5   HDD             3552 non-null   object
 6   Price           3826 non-null   object
 7   Name            3826 non-null   object
 8   Diagonal        3822 non-null   object
dtypes: object(9)
memory usage: 269.1+ KB


As we can see all of features have <b>Dtype</b> <b>object</b> - so we should transform them to <b>float</b> or <b>drop</b> them(if it does not need)

In [39]:
data.describe()

Unnamed: 0,Num of kernels,Base frequency,RAM,SSD,Card,HDD,Price,Name,Diagonal
count,3822,3687,3826,3822,3826,3552,3826,3826,3822
unique,9,62,13,37,468,10,2157,3826,29
top,4,26,16,512,Intel Iris Xe Graphics,немає,39999грн,HP Pavilion Gaming 17 Black (2H6K2EA),156
freq,1615,543,1886,1719,547,3307,55,1,1781


# Preproccesing and cleaning
    Let's clear data by every feature:

In [40]:
print(f"Start shape of dataset: {data.shape}")

Start shape of dataset: (3826, 9)


## SSD

In [41]:
num_of_SSD_distribution = pd.DataFrame(data["SSD"].value_counts())
num_of_SSD_distribution.index.name = "SSD"
num_of_SSD_distribution.rename(columns={"SSD":"Qty of rows"}, inplace=True)
display(num_of_kernels_distribution)

Unnamed: 0_level_0,Qty of rows
SSD,Unnamed: 1_level_1
512,1722
256,894
1000,821
2000,107
128,100
0,65
4000,32
8000,23
64,21
1024,14


In [42]:
data["SSD"] = data["SSD"].fillna(0)
data["SSD"] = data["SSD"].apply(lambda x: str(x)).str.split().apply(lambda x: x[0]).str.split("+").apply(lambda x: x[0]).str.split("х").apply(lambda x: x[0])

data["SSD"] = data["SSD"].str.replace("тисячу", "1000")
data["SSD"] = data["SSD"].str.replace("немає", "0")

data.drop(data[(data["SSD"] == "2") | (data["SSD"] == "1")].index, inplace=True)

In [43]:
num_of_SSD_distribution = pd.DataFrame(data["SSD"].value_counts())
num_of_SSD_distribution.index.name = "SSD"
num_of_SSD_distribution.rename(columns={"SSD":"Qty of rows"}, inplace=True)
display(num_of_kernels_distribution)

Unnamed: 0_level_0,Qty of rows
SSD,Unnamed: 1_level_1
512,1722
256,894
1000,821
2000,107
128,100
0,65
4000,32
8000,23
64,21
1024,14


## Num of kernels
    Delete rows with values "немає даних" because of low number of these values

In [44]:
num_of_kernels_distribution = pd.DataFrame(data["Num of kernels"].value_counts())
num_of_kernels_distribution.index.name = "Num of kernels"
num_of_kernels_distribution.rename(columns={"Num of kernels":"Qty of rows"}, inplace=True)
display(num_of_kernels_distribution)

Unnamed: 0_level_0,Qty of rows
Num of kernels,Unnamed: 1_level_1
4,1614
8,840
6,830
2,432
10,71
14,20
немає даних,6
16,1
5,1


In [45]:
data.drop(data[(data["Num of kernels"] == "немає даних")].index,
          inplace=True)

In [46]:
num_of_kernels_distribution = pd.DataFrame(data["Num of kernels"].value_counts())
num_of_kernels_distribution.index.name = "Num of kernels"
num_of_kernels_distribution.rename(columns={"Num of kernels":"Qty of rows"}, inplace=True)
display(num_of_kernels_distribution)

Unnamed: 0_level_0,Qty of rows
Num of kernels,Unnamed: 1_level_1
4,1614
8,840
6,830
2,432
10,71
14,20
16,1
5,1


## Base frequency 
    Delete rows with values "немає" and replace "," to "." for datatype transformation

In [47]:
num_of_base_frequencies_distribution = pd.DataFrame(data["Base frequency"].value_counts())
num_of_base_frequencies_distribution.index.name = "Base frequency"
num_of_base_frequencies_distribution.rename(columns={"Base frequency":"Qty of rows"}, inplace=True)
print(num_of_kernels_distribution.to_markdown())

|   Num of kernels |   Qty of rows |
|-----------------:|--------------:|
|                4 |          1614 |
|                8 |           840 |
|                6 |           830 |
|                2 |           432 |
|               10 |            71 |
|               14 |            20 |
|               16 |             1 |
|                5 |             1 |


In [48]:
data = data.drop(data[(data["Base frequency"] == "немає")].index)
data["Base frequency"] = data["Base frequency"].str.replace(",",".")

In [49]:
num_of_base_frequencies_distribution = pd.DataFrame(data["Base frequency"].value_counts())
num_of_base_frequencies_distribution.index.name = "Base frequency"
num_of_base_frequencies_distribution.rename(columns={"Base frequency":"Qty of rows"}, inplace=True)
print(num_of_kernels_distribution.to_markdown())

|   Num of kernels |   Qty of rows |
|-----------------:|--------------:|
|                4 |          1614 |
|                8 |           840 |
|                6 |           830 |
|                2 |           432 |
|               10 |            71 |
|               14 |            20 |
|               16 |             1 |
|                5 |             1 |


## RAM
     Delete rows with values "немає"

In [50]:
num_of_kernels_distribution = pd.DataFrame(data["RAM"].value_counts())
num_of_kernels_distribution.index.name = "RAM"
num_of_kernels_distribution.rename(columns={"RAM":"Qty of rows"}, inplace=True)
display(num_of_kernels_distribution)

Unnamed: 0_level_0,Qty of rows
RAM,Unnamed: 1_level_1
16,1878
8,1250
32,433
4,148
64,65
12,25
128,2
6,2
24,2
немає даних,1


In [51]:
data = data.drop(data[(data["RAM"] == 'немає даних')].index)

In [52]:
num_of_kernels_distribution = pd.DataFrame(data["RAM"].value_counts())
num_of_kernels_distribution.index.name = "RAM"
num_of_kernels_distribution.rename(columns={"RAM":"Qty of rows"}, inplace=True)
display(num_of_kernels_distribution)

Unnamed: 0_level_0,Qty of rows
RAM,Unnamed: 1_level_1
16,1878
8,1250
32,433
4,148
64,65
12,25
128,2
6,2
24,2
48,1


## HDD 
    Replace rows with values Nan, "немає даних", "ні" and "немає" to 0. Delete rows with value "32eMMC".

In [53]:
num_of_kernels_distribution = pd.DataFrame(data["HDD"].value_counts())
num_of_kernels_distribution.index.name = "HDD"
num_of_kernels_distribution.rename(columns={"HDD":"Qty of rows"}, inplace=True)
display(num_of_kernels_distribution)

Unnamed: 0_level_0,Qty of rows
HDD,Unnamed: 1_level_1
немає,3289
1000,179
1024,42
500,10
2000,4
ні,4
2048,2
немає даних,2
1000 + 8 (SSD-cache),1
32eMMC,1


In [54]:
data["HDD"] = data["HDD"].fillna(0)
data["HDD"] = data["HDD"].apply(lambda x: str(x)).str.split().apply(lambda x: x[0])

data.drop(data[(data["HDD"] == "32eMMC")].index, inplace=True)

data["HDD"] = data["HDD"].replace("немає", 0)
data["HDD"] = data["HDD"].replace("ні", 0)

data["HDD"] = pd.to_numeric(data["HDD"])


In [55]:
num_of_kernels_distribution = pd.DataFrame(data["HDD"].value_counts())
num_of_kernels_distribution.index.name = "HDD"
num_of_kernels_distribution.rename(columns={"HDD":"Qty of rows"}, inplace=True)
display(num_of_kernels_distribution)

Unnamed: 0_level_0,Qty of rows
HDD,Unnamed: 1_level_1
0,3569
1000,180
1024,42
500,10
2000,4
2048,2


## Price
    Delete "грн" after price.

In [56]:
num_of_kernels_distribution = pd.DataFrame(data["Price"].value_counts())
num_of_kernels_distribution.index.name = "Price"
num_of_kernels_distribution.rename(columns={"Price":"Qty of rows"}, inplace=True)
print(num_of_kernels_distribution.to_markdown())

| Price     |   Qty of rows |
|:----------|--------------:|
| 39999грн  |            55 |
| 45999грн  |            53 |
| 33999грн  |            45 |
| 36999грн  |            44 |
| 28999грн  |            42 |
| 43999грн  |            41 |
| 42999грн  |            41 |
| 49999грн  |            40 |
| 29999грн  |            38 |
| 46999грн  |            38 |
| 26999грн  |            36 |
| 40999грн  |            36 |
| 24999грн  |            35 |
| 25999грн  |            35 |
| 35999грн  |            34 |
| 27999грн  |            34 |
| 37999грн  |            34 |
| 34999грн  |            33 |
| 23999грн  |            32 |
| 44999грн  |            31 |
| 38999грн  |            29 |
| 21999грн  |            29 |
| 59999грн  |            28 |
| 55999грн  |            27 |
| 47999грн  |            25 |
| 52999грн  |            25 |
| 32999грн  |            24 |
| 19999грн  |            23 |
| 41999грн  |            22 |
| 79999грн  |            22 |
| 53999грн  |            20 |
| 69999грн

In [57]:
data["Price"] = data["Price"].str.replace("грн","")
data["Price"] = pd.to_numeric(data["Price"])

In [58]:
num_of_kernels_distribution = pd.DataFrame(data["Price"].value_counts())
num_of_kernels_distribution.index.name = "Price"
num_of_kernels_distribution.rename(columns={"Price":"Qty of rows"}, inplace=True)
print(num_of_kernels_distribution.to_markdown())

|   Price |   Qty of rows |
|--------:|--------------:|
|   39999 |            55 |
|   45999 |            53 |
|   33999 |            45 |
|   36999 |            44 |
|   28999 |            42 |
|   42999 |            41 |
|   43999 |            41 |
|   49999 |            40 |
|   46999 |            38 |
|   29999 |            38 |
|   26999 |            36 |
|   40999 |            36 |
|   24999 |            35 |
|   25999 |            35 |
|   35999 |            34 |
|   27999 |            34 |
|   37999 |            34 |
|   34999 |            33 |
|   23999 |            32 |
|   44999 |            31 |
|   38999 |            29 |
|   21999 |            29 |
|   59999 |            28 |
|   55999 |            27 |
|   47999 |            25 |
|   52999 |            25 |
|   32999 |            24 |
|   19999 |            23 |
|   41999 |            22 |
|   79999 |            22 |
|   69999 |            20 |
|   53999 |            20 |
|   20999 |            18 |
|   22999 |         

## Diagonal
    Replace "," to "." and remove row with "13.3 + 10.8" diagonal

In [59]:
num_of_kernels_distribution = pd.DataFrame(data["Diagonal"].value_counts())
num_of_kernels_distribution.index.name = "Diagonal"
num_of_kernels_distribution.rename(columns={"Diagonal":"Qty of rows"}, inplace=True)
display(num_of_kernels_distribution)

Unnamed: 0_level_0,Qty of rows
Diagonal,Unnamed: 1_level_1
156,1779
14,768
173,418
133,385
16,141
142,58
161,50
154,34
135,33
162,28


In [60]:
data.drop(data[data["Diagonal"] == "13,3 + 10,8"].index, inplace=True)

data["Diagonal"] = data["Diagonal"].str.replace(",",".")

In [61]:
num_of_kernels_distribution = pd.DataFrame(data["Diagonal"].value_counts())
num_of_kernels_distribution.index.name = "Diagonal"
num_of_kernels_distribution.rename(columns={"Diagonal":"Qty of rows"}, inplace=True)
display(num_of_kernels_distribution)

Unnamed: 0_level_0,Qty of rows
Diagonal,Unnamed: 1_level_1
15.6,1783
14.0,768
17.3,421
13.3,387
16.0,141
14.2,58
16.1,50
15.4,34
13.5,33
16.2,28


## Name 
    Is not nessesary field

In [62]:
data.drop(["Name"],axis=1, inplace=True)

## Card
    This field is not enoght for prediction, so we need to add more information about this videocards

In [63]:
print((data["Card"].str.lower().str.split(",").apply(lambda x: x[0]).str.split(" ").apply(lambda x: " ".join(x[:4])).value_counts()).shape)
print(data["Card"].str.lower().str.split(",").apply(lambda x: x[0]).str.split(" ").apply(lambda x: " ".join(x[:4])).value_counts().to_markdown())

(141,)
|                               |   Card |
|:------------------------------|-------:|
| intel iris xe graphics        |    553 |
| intel uhd graphics            |    437 |
| amd radeon graphics           |    283 |
| nvidia geforce rtx 3060       |    244 |
| nvidia geforce rtx 3050       |    222 |
| nvidia geforce rtx 2070       |    167 |
| nvidia geforce gtx 1650       |    166 |
| nvidia geforce rtx 3070       |    164 |
| nvidia geforce rtx 2060       |    162 |
| intel uhd graphics 620        |    127 |
| intel iris plus graphics      |    117 |
| nvidia geforce gtx 1660       |     81 |
| nvidia geforce rtx 3080       |     73 |
| nvidia geforce rtx 2080       |     65 |
| nvidia geforce gtx 1660ti     |     62 |
| amd radeon pro 5500m          |     50 |
| nvidia geforce mx450          |     47 |
| nvidia geforce mx350          |     43 |
| nvidia geforce mx330          |     40 |
| nvidia geforce gtx 1650ti     |     33 |
| intel uhd graphics 600        |     32 |
| nv

## Data Cleaning
    As we can see lots of missing is Num of kernels and Base frequency that we can not fill. But it's not so a lot so we can drop them.

In [64]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3806 entries, 0 to 3825
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Num of kernels  3802 non-null   object
 1   Base frequency  3667 non-null   object
 2   RAM             3806 non-null   object
 3   SSD             3806 non-null   object
 4   Card            3806 non-null   object
 5   HDD             3806 non-null   int64 
 6   Price           3806 non-null   int64 
 7   Diagonal        3802 non-null   object
dtypes: int64(2), object(6)
memory usage: 267.6+ KB


In [65]:
data.dropna(inplace=True)

In [66]:
print(f"End shape of dataset: {data.shape}")

End shape of dataset: (3660, 8)


# Visualization