In [18]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import plot_tree
sb.set()
import re

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score

In [19]:
# Load the raw laptop listings (decoded as Latin-1) and preview the first rows.
laptop_data = pd.read_csv(filepath_or_buffer='./laptop_price.csv', encoding='latin-1')
laptop_data.head(n=5)

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [20]:
# Report the container type and the (rows, columns) shape of the raw data.
print(f"Data type :  {type(laptop_data)}")
print(f"Data dims :  {laptop_data.shape}")

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (1303, 13)


In [33]:
# Count rows that are exact duplicates of an earlier row.
duplicate = laptop_data.duplicated().sum()
print("Duplications: " + str(duplicate))

# Only state that nothing needs removing when the count really is zero;
# otherwise make the problem visible instead of printing a false reassurance.
if duplicate == 0:
    print("Since there are not duplicated sets, there is no need to remove")
else:
    print("Duplicated rows found: remove them (e.g. with drop_duplicates) before continuing")

Duplications: 0
Since there are not duplicated sets, there is no need to remove


In [22]:
# Work on a copy so the raw frame stays untouched, then drop the laptop_ID column.
# pop() returns the removed column, which is what this cell displays.
laptop_data_clean = laptop_data.copy()
laptop_data_clean.pop(item='laptop_ID')

0          1
1          2
2          3
3          4
4          5
        ... 
1298    1316
1299    1317
1300    1318
1301    1319
1302    1320
Name: laptop_ID, Length: 1303, dtype: int64

In [23]:
# Remove every space so each entry is a single token that ends with the resolution.
laptop_data_clean["ScreenResolution"] = (
    laptop_data_clean["ScreenResolution"].str.replace(" ", "", regex=False)
)
laptop_data_clean["ScreenResolution"]

0             IPSPanelRetinaDisplay2560x1600
1                                   1440x900
2                            FullHD1920x1080
3             IPSPanelRetinaDisplay2880x1800
4             IPSPanelRetinaDisplay2560x1600
                        ...                 
1298     IPSPanelFullHD/Touchscreen1920x1080
1299    IPSPanelQuadHD+/Touchscreen3200x1800
1300                                1366x768
1301                                1366x768
1302                                1366x768
Name: ScreenResolution, Length: 1303, dtype: object

In [24]:
# Keep only the trailing "<width>x<height>" token of each ScreenResolution entry.
# A fixed-width slice such as .str[-9:] is only right for 9-character resolutions
# like "1920x1080"; shorter ones keep a stray leading character whenever a
# descriptor precedes them (e.g. "IPSPanel1366x768" -> "l1366x768").
# Entries that do not end in a resolution become NaN instead of arbitrary text.
laptop_data_clean["ScreenResolution"] = (
    laptop_data_clean["ScreenResolution"]
    .astype(str)
    .str.extract(r"(\d+x\d+)\s*$", expand=False)
)
laptop_data_clean["ScreenResolution"]

0       2560x1600
1        1440x900
2       1920x1080
3       2880x1800
4       2560x1600
          ...    
1298    1920x1080
1299    3200x1800
1300     1366x768
1301     1366x768
1302     1366x768
Name: ScreenResolution, Length: 1303, dtype: object

In [25]:
# Generic helper: keep the text that precedes a marker character.
def clean_string(string, splitchar):
    """Return the part of ``string`` before the first ``splitchar``, trimmed.

    Parameters
    ----------
    string : str
        Text to shorten.
    splitchar : str
        Separator; everything from its first occurrence onwards is dropped.

    Returns
    -------
    str
        The leading segment with surrounding whitespace removed. When
        ``splitchar`` does not occur, the whole stripped ``string`` is returned.
    """
    leading_part = string.split(splitchar, 1)[0]
    return leading_part.strip()

# Trim three columns at a marker character with clean_string. The marker is
# handed to the helper through apply's ``args`` instead of a wrapping lambda.

# Product: drop the bracketed specs that follow the model name
laptop_data_clean['Product'] = laptop_data_clean['Product'].apply(clean_string, args=('(',))

# Ram: cut at the 'G' of the unit; the value stays a string until it is cast in a later cell
laptop_data_clean['Ram'] = laptop_data_clean['Ram'].apply(clean_string, args=('G',))

# Weight: cut at the 'k' of the unit; the value stays a string until it is cast in a later cell
laptop_data_clean['Weight'] = laptop_data_clean['Weight'].apply(clean_string, args=('k',))

In [26]:
# Fixed EUR -> SGD multiplier used for every price in the notebook.
convertion_rate = 1.57


def convert_eur_to_sgd(amount_eur):
    """Convert an amount in euros to Singapore dollars.

    The amount is multiplied by the module-level ``convertion_rate`` and the
    result is rounded to 2 decimal places.
    """
    return round(amount_eur * convertion_rate, 2)

# Replace the euro prices with their SGD equivalents, then rename the column to match.
# The rename removes 'Price_euros', so this cell can only run once per fresh copy of the data.
laptop_data_clean['Price_euros'] = laptop_data_clean['Price_euros'].map(convert_eur_to_sgd)
laptop_data_clean = laptop_data_clean.rename(columns={'Price_euros': 'Price_sgd'})

In [27]:
# Cast the unit-stripped string columns to numeric dtypes: Ram -> int, Weight -> float.
laptop_data_clean = laptop_data_clean.astype({'Ram': 'int', 'Weight': 'float'})

In [28]:
# Bucket a raw CPU description into a coarse processor family.
def clean_cpu(cpu):
    """Map a CPU description string to a short family label.

    Checks are applied in this order:
      1. contains "Intel Core i<digit>" -> that matched text (e.g. "Intel Core i5")
      2. starts with "Intel" followed by at least one more character -> "Intel Others"
      3. contains "AMD" -> "AMD"
      4. anything else -> "Others"
    """
    core_match = re.search(r'Intel Core i\d', cpu)
    if core_match is not None:
        return core_match.group()

    if re.search(r'^Intel.+', cpu) is not None:
        return 'Intel Others'

    amd_match = re.search(r'AMD', cpu)
    if amd_match is not None:
        return amd_match.group()

    return 'Others'

# Replace each raw CPU description with its family label from clean_cpu.
laptop_data_clean['Cpu'] = laptop_data_clean['Cpu'].map(clean_cpu)

In [29]:
# NOTE(review): this whole cell is disabled, so none of it runs. No visible cell
# buckets the Gpu column by vendor; it keeps the raw GPU description.
# If this is ever re-enabled, note that the function has no fallback branch: a GPU
# matching none of the patterns would map to None.
# def filter_intel_gpu(gpu):
#     intel_pattern = r'\bIntel\b'
#     amd_pattern = r'\bAMD\b'
#     arm_pattern = r'\bARM\b'
#     nvidia_pattern = r'\bNvidia GeForce\b'
#     nvidia_gtx_pattern = r'\bNvidia GeForce GTX\b'
#     nvidia_quadro_pattern = r'\bNvidia Quadro\b'
    
#     if re.search(intel_pattern, gpu):
#         return 'Intel'
#     elif re.search(amd_pattern, gpu):
#         return 'AMD'
#     elif re.search(arm_pattern, gpu):
#         return 'ARM'
#     elif re.search(nvidia_gtx_pattern, gpu):
#         return 'Nvidia GeForce GTX'
#     elif re.search(nvidia_pattern, gpu):
#         return 'Nvidia GeForce'
#     elif re.search(nvidia_quadro_pattern, gpu):
#         return 'Nvidia Quadro'

# laptop_data_clean['Gpu'] = laptop_data_clean['Gpu'].apply(filter_intel_gpu)

In [30]:
# Preview the cleaned frame after all the transformations above.
laptop_data_clean.head(n=5)

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_sgd
0,Apple,MacBook Pro,Ultrabook,13.3,2560x1600,Intel Core i5,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,2103.31
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,1411.34
2,HP,250 G6,Notebook,15.6,1920x1080,Intel Core i5,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,902.75
3,Apple,MacBook Pro,Ultrabook,15.4,2880x1800,Intel Core i7,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,3983.8
4,Apple,MacBook Pro,Ultrabook,13.3,2560x1600,Intel Core i5,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,2831.65


In [31]:
# Save the cleaned data next to the notebook, without writing the row index.
laptop_data_clean.to_csv(path_or_buf='laptop_data_clean.csv', index=False)