Here, we will try to predict prices, data has been explored already 

In [276]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import yeojohnson
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import shapiro
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [277]:
df = pd.read_csv('laptopData.csv')
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)
print(df.shape)

(1273, 12)


Column Selector

In [278]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.columns = cols
    def fit(self, X, y=None):
        return self 
    def transform(self, X, y=None):
        X = X[self.columns]
        return X 
    def fit_transform(self, X, y = None):
        return self.transform(X)
        

Fixing data type :

In [279]:
class DataTypeFixer(BaseEstimator, TransformerMixin):
    def __init__(self, cols =None):
        self.cols = cols 
    def fit(self, X , y=None):
        return self 
    def transform(self, X, y=None):
        X_copy = X.copy()
        for col in self.cols:
            if col == "Inches":
                X_copy['Inches']= pd.to_numeric(X_copy['Inches'], errors='coerce').astype('float32')
                X_copy['Inches']=X_copy['Inches'].fillna(X_copy['Inches'].mean())
                X_copy['Inches']=X_copy['Inches'].apply(lambda x : float(x/2.54) if x > 17.5 else x)
            elif col == "Ram" :
                X_copy['Ram']= pd.to_numeric(X_copy['Ram'].str.replace('GB','').str.strip(), errors='coerce')
                X_copy['Ram']= X_copy['Ram'].fillna(X_copy['Ram'].mean()).astype('int32')
            elif col == "Weight":
                X_copy['Weight']= pd.to_numeric(X_copy['Weight'].str.replace('kg','').str.strip(),errors='coerce').astype('float32')
                X_copy['Weight'].fillna(X_copy['Weight'].mean(), inplace=True)
        return X_copy
    def fit_transform(self, X, y = None):
        return self.transform(X)


Feature Extraction

In [280]:
class FeatureExtract(BaseEstimator, TransformerMixin):
    def __init__(self, cols =None):
        self.cols = cols 
    def fit(self, X , y=None):
        return self 
    
    def extract_memory_feat(self, df):
        import re
        import numpy as np
        # Helper function to extract storage sizes
        def parse_storage(mem_str):
            ssd = hdd = flash = 0
            if not isinstance(mem_str, str):
                return ssd, hdd, flash
            # Replace TB with 1000GB for easier parsing
            mem_str = mem_str.replace('TB', '000GB')
            # Find all (size, type) pairs
            matches = re.findall(r'(\d+)GB\s*(SSD|HDD|Flash|Flash Storage)', mem_str)
            for size, stype in matches:
                size = int(size)
                if 'SSD' in stype:
                    ssd += size
                elif 'HDD' in stype:
                    hdd += size
                elif 'Flash' in stype:
                    flash += size
            return ssd, hdd, flash
        
        df['SSD'], df['HDD'], df['Flash'] = zip(*df['Memory'].map(parse_storage))
        return df
    
    # ...existing code for other extractors...
    def fetch_processor(self, text):
        if text in ['Intel Core i7', 'Intel Core i5', 'Intel Core i3']:
            return text
        elif isinstance(text, str) and text.split()[0] == 'Intel':
            return 'Other Intel Processor'
        else:
            return 'AMD Processor'
    
    def extract_cpu_feat(self, df):
        cpu_speed_pattern = r'\b\d+(?:\.\d+)?(?:GHz|Hz)\b'
        df['CPU speed']= pd.to_numeric(df['Cpu'].str.findall(cpu_speed_pattern).str.get(0).str.split('G').str.get(0), errors='coerce')
        # Ensure column is string type before applying split
        df['Cpu'] = df['Cpu'].astype(str)
        df['Cpu Name'] = df['Cpu'].apply(lambda x: " ".join(x.split()[0:3]) if isinstance(x, str) else "Unknown")
        df['CPU type'] = df['Cpu Name'].apply(self.fetch_processor)
        df.drop(columns=[ 'Cpu Name'], inplace= True)
        return df 

    def extract_resolution_feat(self,df):
        pattern = r'(\d{3,4}x\d{2,4})'
        res = df['ScreenResolution'].str.extract(pattern)
        res_splt = res[0].str.split('x', n=1, expand=True)
        res_splt.columns = ["X res", "Y res"]
        df["X res"]= pd.to_numeric(res_splt['X res'], errors='coerce')
        df["X res"]= df["X res"].fillna(df["X res"].mean()).astype('int32')
        df["Y res"]= pd.to_numeric(res_splt['Y res'], errors='coerce')
        df["Y res"]= df["Y res"].fillna(df["Y res"].mean()).astype('int32')
        df['IPS']= df['ScreenResolution'].apply(lambda x : 1 if 'IPS' in x else 0)
        df['Touchscreen']= df['ScreenResolution'].apply(lambda x : 1 if 'Touchscreen' in x else 0)
        df['PPI']= (np.sqrt((df['X res']**2)+(df['Y res']**2))/df['Inches'])
        df.drop(columns=['X res', 'Y res'], inplace=True)
        return df 

    def fetch_OS(self , text):
        if text in ['macOS', 'Mac OS X']:
            return "Mac"
        elif 'Windows' in text :
            return 'Windows'
        else :
            return "Others"
     
    def extract_OS(self, df):
        df["Operating System"]= df['OpSys'].apply(self.fetch_OS)
        return df 
    
    def extract_gpu(self , df):
        df['Gpu'] = df['Gpu'].apply(lambda x: x.split()[0] if isinstance(x, str) else "Unknown")
        return df
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        for col in self.cols :
            if col == "Gpu":
                X_copy = self.extract_gpu(X_copy)
            elif col == "Memory":
                X_copy = self.extract_memory_feat(X_copy)
            elif col == "OpSys":
                X_copy = self.extract_OS(X_copy)
            elif col == "ScreenResolution":
                X_copy = self.extract_resolution_feat(X_copy)
            elif col == "Cpu":
                X_copy = self.extract_cpu_feat(X_copy)
        return X_copy

    def fit_transform(self, X, y = None):
        return self.transform(X)


Skew Fixer :

In [281]:
class SkewFixer(BaseEstimator, TransformerMixin):
    def __init__(self, cols =None, skew_threshold_left = -1, skew_threshold_right = 1):
        self.cols = cols 
        self.skew_threshold_left = skew_threshold_left 
        self.skew_threshold_right = skew_threshold_right
        self.skewed = []
    def fit(self, X , y=None):
        if self.cols is None :
            self.cols = X.select_dtypes(include=[np.number]).columns
        for col in self.cols:
            sk = X[col].skew()
            if sk > self.skew_threshold_right or sk < self.skew_threshold_left:
                self.skewed.append(col)

        return self ; 

        
    def transform(self, X, y=None):
        X_copy = X.copy()
        for col in self.skewed :
            X_copy[col], _ = yeojohnson(X_copy[col])
        return X_copy

        
    def fit_transform(self, X, y = None):
        return self.transform(X)


Company grouping :

In [282]:
# Merge sub-brands
df['Company'] = df['Company'].replace({'Vero': 'Acer'})

# Group companies with < 5 entries into 'Other'
company_counts = df['Company'].value_counts()
rare_companies = company_counts[company_counts < 5].index
df['Company'] = df['Company'].replace(rare_companies, 'Other')

Scaling :


In [283]:
class Scaler(BaseEstimator, TransformerMixin):
    def __init__(self, cols ):
        self.cols = cols
        self.scalers={} 
    
    def fit(self , X , y=None):
        for col in self.cols :
            stat , p = shapiro(X[col])
            if p > 0.05:
                scaler = StandardScaler()
            else :
                scaler = MinMaxScaler()
            scaler.fit(X[[col]])  # <-- Ensure DataFrame input
            self.scalers[col]=scaler 
        return self 
    
    def transform(self , X , y=None):
        X_copy = X.copy()
        for col in self.cols :
            X_copy[col]= self.scalers[col].transform(X_copy[[col]]).flatten()  # <-- Ensure DataFrame input
        return X_copy
    
    def fit_transform(self, X, y = None):
        self.fit(X, y)
        return self.transform(X)


One hot Encoding the categorical columns :

In [284]:
class OneHot(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.cols = cols
        self.encoder = None 
        self.columnNames = None

    def fit(self , X , y=None):
        self.encoder = OneHotEncoder(sparse_output=False, drop='first')
        self.encoder.fit(X[self.cols])
        self.column_names = self.encoder.get_feature_names_out(self.cols)
        return self
    
    def transform(self , X , y=None):
        X_copy = X.copy()
        encoded = self.encoder.transform(X_copy[self.cols])
        encodedDF = pd.DataFrame(encoded, columns=self.column_names, index=X_copy.index)
        X_copy.drop(columns=self.cols, inplace=True)

        X_copy= pd.concat([X_copy, encodedDF], axis=1)

        return X_copy
    
    def fit_transform(self, X, y = None):
        self.fit(X, y)
        return self.transform(X)


Dropper :

In [285]:
class DropColumnsTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, cols=None):
        self.cols = cols
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        if self.cols is None:
            return X
        else:
            return X.drop(self.cols,axis=1)
       
    def fit_transform(self, X, y = None):
        self.fit(X, y)
        return self.transform(X)

In [286]:
class Full_pipeline:
    def __init__(self):
        self.allcols= ['Company', 'TypeName', 'Inches', 'ScreenResolution', 'Cpu', 'Ram',
                         'Memory', 'Gpu', 'OpSys', 'Weight']
        
        # Remove 'Gpu' from drop_cols to avoid KeyError
        self.drop_cols = ['ScreenResolution', 'Cpu', 'Memory', 'OpSys', 'Inches']
        
        self.encode_cols = ['TypeName', 'Company', 'CPU type', 'Gpu', "Operating System"]
        scaling_features = [
            'Inches',
            'Weight',
            'Ram',
            'PPI',
            'HDD',
            'SSD',
            'Flash',
            'CPU speed']
        
        self.full_pipeline= Pipeline([
            ('Column selector', ColumnSelector(cols=self.allcols)),
            ("fix", DataTypeFixer(cols=['Inches', 'Weight', 'Ram'])),
            ("Extract", FeatureExtract(cols=['Gpu', 'Cpu', 'Memory', 'ScreenResolution','OpSys'])),
            ("fix skew",SkewFixer(cols=['PPI', 'Weight', 'Ram','HDD', 'SSD',"Flash",'CPU speed' ])),
            ('onehot', OneHot(cols=self.encode_cols)),
            ("scale", Scaler(cols=scaling_features )),
            ('dropper', DropColumnsTransformer(cols=self.drop_cols))
        ])
        
        self.y_pipeline = Pipeline([
            ('selector', ColumnSelector(cols=['Price'])),
            ('power_transformation', SkewFixer(cols=['Price'])),
            ('scaling', Scaler(cols=['Price']))
        ])
    
    def fit_transform(self, X_train, y_train):
        X_train = self.full_pipeline.fit_transform(X_train)
        y_train = self.y_pipeline.fit_transform(y_train)
        return X_train, y_train
    
    def transform(self, X_test, y_test):
        X_test = self.full_pipeline.transform(X_test)
        y_test = self.y_pipeline.transform(y_test)
        return X_test, y_test

Now we split and preprocess the data !!

In [287]:
X = df.drop(columns=['Price'])
Y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=42)

In [288]:
# Show sample Memory values before feature extraction
print(X_train['Memory'].head(10))
full_pipeline1 = Full_pipeline()
y_train_df = pd.DataFrame(y_train)
X_train_pre, y_train_pre = full_pipeline1.fit_transform(X_train, y_train_df)
# Show SSD, HDD, Flash after feature extraction
print(X_train_pre[['SSD', 'HDD', 'Flash']].head(10))

950                  8GB SSD
1121               256GB SSD
1292               500GB HDD
1247    256GB SSD +  1TB HDD
306                256GB SSD
838       16GB Flash Storage
863                  1TB HDD
1229    128GB SSD +  1TB HDD
1029               256GB SSD
361                256GB SSD
Name: Memory, dtype: object


           SSD   HDD    Flash
950   0.007812  0.00  0.00000
1121  0.250000  0.00  0.00000
1292  0.000000  0.25  0.00000
1247  0.250000  0.50  0.00000
306   0.250000  0.00  0.00000
838   0.000000  0.00  0.03125
863   0.000000  0.50  0.00000
1229  0.125000  0.50  0.00000
1029  0.250000  0.00  0.00000
361   0.250000  0.00  0.00000


In [289]:
print(X_train_pre.describe())


               Ram       Weight    CPU speed          SSD          HDD  \
count  1082.000000  1082.000000  1082.000000  1082.000000  1082.000000   
mean      0.119194     0.133377     0.518847     0.178424     0.205928   
std       0.090614     0.077881     0.187059     0.183263     0.257735   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       0.047619     0.081652     0.407407     0.000000     0.000000   
50%       0.111111     0.129683     0.592593     0.150391     0.000000   
75%       0.111111     0.155379     0.666667     0.250000     0.500000   
max       1.000000     1.000000     1.000000     1.000000     1.000000   

             Flash          IPS  Touchscreen          PPI  TypeName_Gaming  \
count  1082.000000  1082.000000  1082.000000  1082.000000      1082.000000   
mean      0.009993     0.284658     0.144177     0.214326         0.160813   
std       0.064361     0.451460     0.351432     0.165118         0.367529   
min       0.000000   

Data is cleaned , Now Need to make the models 