## Analysing the Dataset laptop train dataset.

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the dataset
lap_test = pd.read_csv('laptops_test.csv')


lap_train = pd.read_csv('laptops_train.csv')
# Train and the teach the model

### (EDA) Exploratory Data Analysis of Laptop Test Dataset

In [23]:
lap_test.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
0,HP,15-bs053od (i7-7500U/6GB/1TB/W10),Notebook,"15.6""",1366x768,Intel Core i7 7500U 2.7GHz,6GB,1TB HDD,Intel HD Graphics 620,Windows,10,2.04kg,5148468.0
1,Asus,Rog GL753VE-DS74,Gaming,"17.3""",Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,10,2.99kg,15552108.0
2,Dell,Inspiron 7579,2 in 1 Convertible,"15.6""",IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,12GB,512GB SSD,Intel HD Graphics 620,Windows,10,2.19kg,11550708.0
3,Toshiba,Portege Z30-C-1CV,Notebook,"13.3""",Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows,7,1.2kg,10625940.0
4,Lenovo,IdeaPad 320-15ABR,Notebook,"15.6""",Full HD 1920x1080,AMD A12-Series 9720P 3.6GHz,6GB,256GB SSD,AMD Radeon 530,Windows,10,2.2kg,4881708.0


In [24]:
lap_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Manufacturer              325 non-null    object 
 1   Model Name                325 non-null    object 
 2   Category                  325 non-null    object 
 3   Screen Size               325 non-null    object 
 4   Screen                    325 non-null    object 
 5   CPU                       325 non-null    object 
 6   RAM                       325 non-null    object 
 7    Storage                  325 non-null    object 
 8   GPU                       325 non-null    object 
 9   Operating System          325 non-null    object 
 10  Operating System Version  291 non-null    object 
 11  Weight                    325 non-null    object 
 12  Price                     325 non-null    float64
dtypes: float64(1), object(12)
memory usage: 33.1+ KB


In [25]:
lap_test.dtypes

Manufacturer                 object
Model Name                   object
Category                     object
Screen Size                  object
Screen                       object
CPU                          object
RAM                          object
 Storage                     object
GPU                          object
Operating System             object
Operating System Version     object
Weight                       object
Price                       float64
dtype: object

In [26]:
lap_test.nunique()

Manufacturer                 12
Model Name                  197
Category                      6
Screen Size                  10
Screen                       21
CPU                          50
RAM                           8
 Storage                     22
GPU                          54
Operating System              6
Operating System Version      3
Weight                       98
Price                       252
dtype: int64

In [27]:
lap_test.isnull().sum()

Manufacturer                 0
Model Name                   0
Category                     0
Screen Size                  0
Screen                       0
CPU                          0
RAM                          0
 Storage                     0
GPU                          0
Operating System             0
Operating System Version    34
Weight                       0
Price                        0
dtype: int64

In [28]:
# Fill null values with a default value
lap_test['Operating System Version'] = lap_test['Operating System Version'].fillna('Unknown')

# Verify the null values have been filled
lap_test.isnull().sum()


Manufacturer                0
Model Name                  0
Category                    0
Screen Size                 0
Screen                      0
CPU                         0
RAM                         0
 Storage                    0
GPU                         0
Operating System            0
Operating System Version    0
Weight                      0
Price                       0
dtype: int64

In [29]:
lap_test.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
0,HP,15-bs053od (i7-7500U/6GB/1TB/W10),Notebook,"15.6""",1366x768,Intel Core i7 7500U 2.7GHz,6GB,1TB HDD,Intel HD Graphics 620,Windows,10,2.04kg,5148468.0
1,Asus,Rog GL753VE-DS74,Gaming,"17.3""",Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,10,2.99kg,15552108.0
2,Dell,Inspiron 7579,2 in 1 Convertible,"15.6""",IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,12GB,512GB SSD,Intel HD Graphics 620,Windows,10,2.19kg,11550708.0
3,Toshiba,Portege Z30-C-1CV,Notebook,"13.3""",Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows,7,1.2kg,10625940.0
4,Lenovo,IdeaPad 320-15ABR,Notebook,"15.6""",Full HD 1920x1080,AMD A12-Series 9720P 3.6GHz,6GB,256GB SSD,AMD Radeon 530,Windows,10,2.2kg,4881708.0


In [30]:
# Clean the 'Weight' column by removing non-numeric characters and convert to float
lap_test['Weight'] = lap_test['Weight'].str.replace('kg', '').str.extract(r'(\d+\.\d+|\d+)').astype(float)

In [31]:
# Convert 'RAM' to numeric values
lap_test['RAM'] = lap_test['RAM'].astype(str).str.replace('GB', '').astype(int)

In [32]:
# Strip whitespace from column names to ensure consistency
lap_test.columns = lap_test.columns.str.strip()

In [33]:
# Ensure 'Storage' column exists and is properly named
if 'Storage' in lap_test.columns:
    # Convert 'RAM' to numeric values
    lap_test['RAM'] = lap_test['RAM'].astype(str).str.replace('GB', '').astype(int)

    # Function to convert storage to GB
    def convert_storage_to_gb(storage):
        storage = storage.replace('GB', '').replace('TB', '000')
        storage_values = storage.split('+')
        total_storage = sum(int(value.strip()) for value in storage_values if value.strip().isdigit())
        return total_storage

    # Apply the conversion function to the 'Storage' column
    lap_test['Storage'] = lap_test['Storage'].astype(str).apply(convert_storage_to_gb)

In [34]:
# Define features and target variable
X = lap_test.drop(['Price', 'Model Name'], axis=1)
y = lap_test['Price']


In [35]:
from sklearn.impute import SimpleImputer
# Preprocess the data
numeric_features = ['RAM', 'Storage', 'Weight']
categorical_features = ['Manufacturer', 'Category', 'Screen Size', 'Screen', 'CPU', 'GPU', 'Operating System', 'Operating System Version']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [36]:
import pickle

random_forest_model = pickle.load(open('random_forest_model.pkl', 'rb'))


In [37]:
y_test = lap_test['Price']
y_test

0       5148468.0
1      15552108.0
2      11550708.0
3      10625940.0
4       4881708.0
          ...    
320     5673096.0
321    13329108.0
322     2036268.0
323     6793488.0
324     3281148.0
Name: Price, Length: 325, dtype: float64

In [38]:
# x_test = lap_test.drop('Price')
X

Unnamed: 0,Manufacturer,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight
0,HP,Notebook,"15.6""",1366x768,Intel Core i7 7500U 2.7GHz,6,0,Intel HD Graphics 620,Windows,10,2.04
1,Asus,Gaming,"17.3""",Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16,0,Nvidia GeForce GTX 1050 Ti,Windows,10,2.99
2,Dell,2 in 1 Convertible,"15.6""",IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,12,0,Intel HD Graphics 620,Windows,10,2.19
3,Toshiba,Notebook,"13.3""",Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4,0,Intel HD Graphics 520,Windows,7,1.20
4,Lenovo,Notebook,"15.6""",Full HD 1920x1080,AMD A12-Series 9720P 3.6GHz,6,0,AMD Radeon 530,Windows,10,2.20
...,...,...,...,...,...,...,...,...,...,...,...
320,Lenovo,2 in 1 Convertible,"14.0""",IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4,0,Intel HD Graphics 520,Windows,10,1.80
321,Lenovo,2 in 1 Convertible,"13.3""",IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16,0,Intel HD Graphics 520,Windows,10,1.30
322,Lenovo,Notebook,"14.0""",1366x768,Intel Celeron Dual Core N3050 1.6GHz,2,0,Intel HD Graphics,Windows,10,1.50
323,HP,Notebook,"15.6""",1366x768,Intel Core i7 6500U 2.5GHz,6,0,AMD Radeon R5 M330,Windows,10,2.19


In [39]:
preprocessor = pickle.load(open('preprocessor.pkl', 'rb'))
x_process = preprocessor.transform(X)

In [40]:
pred = random_forest_model.predict(x_process)
pred


array([ 6305892.5124    , 13908214.32      ,  8750994.8136    ,
        8960112.72      ,  4576694.616     ,  5804841.354     ,
        8363340.96      ,  5383417.7592    ,  7224236.42772   ,
       11956122.8826    ,  5511651.0696    ,  8804244.22956   ,
       15565893.564     ,  4203640.6854    ,  2429246.3832    ,
        8772621.936     ,  8039104.2576    ,  8945488.344     ,
       12091377.4644    ,  3760091.06772   , 12135051.1152    ,
       12925776.6612    ,  9268529.3688    , 11181017.172     ,
        5322373.28999999,  7291971.7416    ,  8944751.79      ,
        4193468.0892    ,  9846035.1288    , 15089844.9312    ,
        9313340.3064    ,  6095454.4404    ,  8738867.05714286,
       11837252.7       ,  8006267.88      , 12884002.9344    ,
        5677349.6364    ,  8683669.33200001,  5716392.9264    ,
       18692501.8644    , 10599076.3788    , 11681020.26      ,
        4812463.6248    , 11662276.5168    , 11896641.4788    ,
       10307778.6084    ,  9978896.1324 

In [41]:
random_forest_results = {
    'Model': 'Random Forest',
    'Mean Squared Error': mean_squared_error(y_test, pred),
    'Mean Absolute Error': mean_absolute_error(y_test, pred),
    'R^2 Score': r2_score(y_test, pred)
}

print(random_forest_results)

{'Model': 'Random Forest', 'Mean Squared Error': 8351388135915.046, 'Mean Absolute Error': 1866906.35735054, 'R^2 Score': 0.7628332644819292}
