## Analysing the Dataset laptop train dataset.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


lap_train = pd.read_csv('laptops_train.csv')
# Train and the teach the model

### (EDA) Exploratory Data Analysis of Laptop Test Dataset

In [2]:
lap_train.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,11912523.48
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,7993374.48
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,5112900.0
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,22563005.4
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,16037611.2


In [3]:
lap_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Manufacturer              977 non-null    object 
 1   Model Name                977 non-null    object 
 2   Category                  977 non-null    object 
 3   Screen Size               977 non-null    object 
 4   Screen                    977 non-null    object 
 5   CPU                       977 non-null    object 
 6   RAM                       977 non-null    object 
 7    Storage                  977 non-null    object 
 8   GPU                       977 non-null    object 
 9   Operating System          977 non-null    object 
 10  Operating System Version  841 non-null    object 
 11  Weight                    977 non-null    object 
 12  Price                     977 non-null    float64
dtypes: float64(1), object(12)
memory usage: 99.4+ KB


In [4]:
lap_train.dtypes

Manufacturer                 object
Model Name                   object
Category                     object
Screen Size                  object
Screen                       object
CPU                          object
RAM                          object
 Storage                     object
GPU                          object
Operating System             object
Operating System Version     object
Weight                       object
Price                       float64
dtype: object

In [5]:
lap_train.nunique()

Manufacturer                 19
Model Name                  488
Category                      6
Screen Size                  18
Screen                       38
CPU                         106
RAM                           8
 Storage                     36
GPU                          98
Operating System              7
Operating System Version      4
Weight                      166
Price                       639
dtype: int64

In [6]:
lap_train.isnull().sum()

Manufacturer                  0
Model Name                    0
Category                      0
Screen Size                   0
Screen                        0
CPU                           0
RAM                           0
 Storage                      0
GPU                           0
Operating System              0
Operating System Version    136
Weight                        0
Price                         0
dtype: int64

In [7]:
# Fill null values with a default value
lap_train['Operating System Version'] = lap_train['Operating System Version'].fillna('Unknown')

# Verify the null values have been filled
lap_train.isnull().sum()


Manufacturer                0
Model Name                  0
Category                    0
Screen Size                 0
Screen                      0
CPU                         0
RAM                         0
 Storage                    0
GPU                         0
Operating System            0
Operating System Version    0
Weight                      0
Price                       0
dtype: int64

In [8]:
lap_train.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,Unknown,1.37kg,11912523.48
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,Unknown,1.34kg,7993374.48
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,Unknown,1.86kg,5112900.0
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,Unknown,1.83kg,22563005.4
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,Unknown,1.37kg,16037611.2


In [9]:
# Clean the 'Weight' column by removing non-numeric characters and convert to float
lap_train['Weight'] = lap_train['Weight'].str.replace('kg', '').str.extract(r'(\d+\.\d+|\d+)').astype(float)

In [10]:
# Convert 'RAM' to numeric values
lap_train['RAM'] = lap_train['RAM'].astype(str).str.replace('GB', '').astype(int)

In [11]:
# Strip whitespace from column names to ensure consistency
lap_train.columns = lap_train.columns.str.strip()

In [12]:
# Ensure 'Storage' column exists and is properly named
if 'Storage' in lap_train.columns:
    # Convert 'RAM' to numeric values
    lap_train['RAM'] = lap_train['RAM'].astype(str).str.replace('GB', '').astype(int)

    # Function to convert storage to GB
    def convert_storage_to_gb(storage):
        storage = storage.replace('GB', '').replace('TB', '000')
        storage_values = storage.split('+')
        total_storage = sum(int(value.strip()) for value in storage_values if value.strip().isdigit())
        return total_storage

    # Apply the conversion function to the 'Storage' column
    lap_train['Storage'] = lap_train['Storage'].astype(str).apply(convert_storage_to_gb)

In [13]:
# Define features and target variable
X = lap_train.drop(['Price', 'Model Name'], axis=1)
y = lap_train['Price']


In [14]:
from sklearn.impute import SimpleImputer
# Preprocess the data
numeric_features = ['RAM', 'Storage', 'Weight']
categorical_features = ['Manufacturer', 'Category', 'Screen Size', 'Screen', 'CPU', 'GPU', 'Operating System', 'Operating System Version']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


## Regression Models

In [15]:
# Fit the preprocessor to the data and transform it
X_preprocessed = preprocessor.fit_transform(X)

### K-Nearest Neighbors

In [16]:
from sklearn.neighbors import KNeighborsRegressor

# Train and evaluate KNN
knn_model = KNeighborsRegressor()
knn_model.fit(X_preprocessed, y)
y_pred_knn = knn_model.predict(X_preprocessed)

knn_results = {
    'Model': 'KNN',
    'Mean Squared Error': mean_squared_error(y, y_pred_knn),
    'Mean Absolute Error': mean_absolute_error(y, y_pred_knn),
    'R^2 Score': r2_score(y, y_pred_knn)
}

print(knn_results)


{'Model': 'KNN', 'Mean Squared Error': 5092063222672.665, 'Mean Absolute Error': 1375112.0317707267, 'R^2 Score': 0.8718344303561673}


### Lineal Regression

In [17]:
# Train and evaluate Linear Regression
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_preprocessed, y)
y_pred_lr = linear_regression_model.predict(X_preprocessed)

linear_regression_results = {
    'Model': 'Linear Regression',
    'Mean Squared Error': mean_squared_error(y, y_pred_lr),
    'Mean Absolute Error': mean_absolute_error(y, y_pred_lr),
    'R^2 Score': r2_score(y, y_pred_lr)
}

print(linear_regression_results)


{'Model': 'Linear Regression', 'Mean Squared Error': 3041503683489.974, 'Mean Absolute Error': 1203118.3826481747, 'R^2 Score': 0.9234463448072222}


### Random Forest

In [18]:
# Train and evaluate Random Forest
random_forest_model = RandomForestRegressor()
random_forest_model.fit(X_preprocessed, y)
y_pred_rf = random_forest_model.predict(X_preprocessed)

random_forest_results = {
    'Model': 'Random Forest',
    'Mean Squared Error': mean_squared_error(y, y_pred_rf),
    'Mean Absolute Error': mean_absolute_error(y, y_pred_rf),
    'R^2 Score': r2_score(y, y_pred_rf)
}

print(random_forest_results)


{'Model': 'Random Forest', 'Mean Squared Error': 1007089420347.268, 'Mean Absolute Error': 631486.2035689193, 'R^2 Score': 0.9746518879289684}


### Gradient Boosting

In [19]:
# Train and evaluate Gradient Boosting
gradient_boosting_model = GradientBoostingRegressor()
gradient_boosting_model.fit(X_preprocessed, y)
y_pred_gb = gradient_boosting_model.predict(X_preprocessed)

gradient_boosting_results = {
    'Model': 'Gradient Boosting',
    'Mean Squared Error': mean_squared_error(y, y_pred_gb),
    'Mean Absolute Error': mean_absolute_error(y, y_pred_gb),
    'R^2 Score': r2_score(y, y_pred_gb)
}

print(gradient_boosting_results)


{'Model': 'Gradient Boosting', 'Mean Squared Error': 4040249972207.7935, 'Mean Absolute Error': 1535678.5313377182, 'R^2 Score': 0.898308226636726}


In [20]:
import pickle
pickle.dump(random_forest_model, open('random_forest_model.pkl', 'wb'))

# random_forest_model = pickle.load(open('random_forest_model.pkl', 'rb'))


In [21]:
import pickle
pickle.dump(preprocessor, open('preprocessor.pkl', 'wb'))

# preprocessor = pickle.load(open('preprocessor.pkl', 'rb'))
