# Import Necessary modules

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, root_mean_squared_error, r2_score

# Import Dataset file

In [2]:
df = pd.read_csv('../data-sets/laptop_price.csv', encoding='ISO-8859-1')
print(df.head().to_string())

   laptop_ID Company      Product   TypeName  Inches                    ScreenResolution                         Cpu   Ram               Memory                           Gpu  OpSys  Weight  Price_euros
0          1   Apple  MacBook Pro  Ultrabook    13.3  IPS Panel Retina Display 2560x1600        Intel Core i5 2.3GHz   8GB            128GB SSD  Intel Iris Plus Graphics 640  macOS  1.37kg      1339.69
1          2   Apple  Macbook Air  Ultrabook    13.3                            1440x900        Intel Core i5 1.8GHz   8GB  128GB Flash Storage        Intel HD Graphics 6000  macOS  1.34kg       898.94
2          3      HP       250 G6   Notebook    15.6                   Full HD 1920x1080  Intel Core i5 7200U 2.5GHz   8GB            256GB SSD         Intel HD Graphics 620  No OS  1.86kg       575.00
3          4   Apple  MacBook Pro  Ultrabook    15.4  IPS Panel Retina Display 2880x1800        Intel Core i7 2.7GHz  16GB            512GB SSD            AMD Radeon Pro 455  macOS  1.83kg    

# Get the Info of Data

In [3]:
df.info()
df.isnull().sum()  # Check for missing values
df.dropna()
df.drop(labels=['laptop_ID', 'Product', 'Inches', 'OpSys', 'Company'], axis=1, inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   Product           1303 non-null   object 
 3   TypeName          1303 non-null   object 
 4   Inches            1303 non-null   float64
 5   ScreenResolution  1303 non-null   object 
 6   Cpu               1303 non-null   object 
 7   Ram               1303 non-null   object 
 8   Memory            1303 non-null   object 
 9   Gpu               1303 non-null   object 
 10  OpSys             1303 non-null   object 
 11  Weight            1303 non-null   object 
 12  Price_euros       1303 non-null   float64
dtypes: float64(2), int64(1), object(10)
memory usage: 132.5+ KB


# Change All Feature Data into Numerical Value using OneHotEncoder

In [4]:
# Initialize OneHotEncoder with the correct parameter
# Select the columns to encode
columns_to_encode = [ 'TypeName', 'Cpu', 'Memory', 'Gpu', 'Ram', 'Weight']
def map_to_replace(columns):
    # Iterate through each column in the list
    for column in columns:
        if column == 'Ram':
            df[column] = df[column].str.replace('GB', '')
            df[column] = df[column].astype(int)
        elif column == 'Weight':
            df[column] = df[column].str.replace('kg', '')
            df[column] = df[column].astype(float)
        elif(df[column].dtype == 'object'):
            unique_map = {} 
            count = 0
            for i in df[column].unique():
                unique_map[str(i)] = count 
                count += 1 
            df[column] = df[column].map(unique_map)

map_to_replace(columns_to_encode)
print(df.head().to_string())

   TypeName                    ScreenResolution  Cpu  Ram  Memory  Gpu  Weight  Price_euros
0         0  IPS Panel Retina Display 2560x1600    0    8       0    0    1.37      1339.69
1         0                            1440x900    1    8       1    1    1.34       898.94
2         1                   Full HD 1920x1080    2    8       2    2    1.86       575.00
3         0  IPS Panel Retina Display 2880x1800    3   16       3    3    1.83      2537.45
4         0  IPS Panel Retina Display 2560x1600    4    8       2    4    1.37      1803.60


In [5]:
# Ensure that the 'ScreenResolution' column is of string type
df['ScreenResolution'] = df['ScreenResolution'].astype(str)

# Split the 'ScreenResolution' into width and height
df[['Resolution_Width', 'Resolution_Height']] = df['ScreenResolution'].str.split('x', expand=True)

# Ensure that the 'Cpu' column is of string type
df['Cpu'] = df['Cpu'].astype(str)

# Extract CPU brand (e.g., Intel, AMD)
df['Cpu_Brand'] = df['Cpu'].apply(lambda x: x.split()[0])  

# Extract CPU speed and remove 'GHz'
df['Cpu_Speed'] = df['Cpu'].apply(lambda x: float(x.split()[-1].replace('GHz', '').strip()))
df.drop(labels='ScreenResolution', axis=1, inplace=True)
map_to_replace(np.array(['Resolution_Width']))
# View the updated DataFrame
print(df.head())


   TypeName Cpu  Ram  Memory  Gpu  Weight  Price_euros  Resolution_Width  \
0         0   0    8       0    0    1.37      1339.69                 0   
1         0   1    8       1    1    1.34       898.94                 1   
2         1   2    8       2    2    1.86       575.00                 2   
3         0   3   16       3    3    1.83      2537.45                 3   
4         0   4    8       2    4    1.37      1803.60                 0   

  Resolution_Height Cpu_Brand  Cpu_Speed  
0              1600         0        0.0  
1               900         1        1.0  
2              1080         2        2.0  
3              1800         3        3.0  
4              1600         4        4.0  


In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[[ 'Weight', 'Resolution_Width', 'Resolution_Height']] = scaler.fit_transform(df[['Weight', 'Resolution_Width', 'Resolution_Height']])


In [7]:
df[['Gpu', 'Price_euros']].corr()


Unnamed: 0,Gpu,Price_euros
Gpu,1.0,0.139455
Price_euros,0.139455,1.0


In [8]:
X = df.drop(labels="Price_euros", axis=1)
y = df['Price_euros']
print(X.shape, y.shape)

(1303, 10) (1303,)


In [9]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(x_test.shape, x_train.shape, y_test.shape, y_train.shape)

(391, 10) (912, 10) (391,) (912,)


In [10]:
model = LinearRegression()
model.fit(x_train, y_train)
print(f'Coeff : {model.coef_} & intercept : {model.intercept_}')

Coeff : [ 68.51400352   0.52409362  88.85428612  -8.69040163  -0.77216897
 -34.75609229 -27.05793726 201.8538863    0.52409362   0.52409362] & intercept : 296.80727407267807


In [11]:
y_pred = model.predict(x_test)

In [12]:
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2Score = r2_score(y_test, y_pred)

print(f'MSE : {mse} & RMSE : {rmse} & R2 SCORE : {r2Score}')

MSE : 199659.2686884051 & RMSE : 446.8324839225603 & R2 SCORE : 0.6271395459886031
