In [199]:
import re

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
# Display option: never truncate columns when rendering a DataFrame.
pd.set_option('display.max_columns', None)

In [200]:
# Read each source JSON file into its own DataFrame; the variable names
# mirror the file names and are consumed by the concat cell that follows.
hp, dell, lenovo, acer, asus, msi = (
    pd.read_json(path)
    for path in (
        'hp.json',
        'dell.json',
        'lenovo.json',
        'acer.json',
        'asus.json',
        'msi.json',
    )
)

In [201]:
# Stack the six frames row-wise; ignore_index=True renumbers the rows 0..n-1.
frames = [hp, dell, lenovo, acer, asus, msi]
df = pd.concat(frames, axis='index', ignore_index=True)

In [202]:
# Renumber rows 0..n-1 without materialising the old index as a column.
# (The concat above already used ignore_index=True, so this is only a
# safeguard; drop=True replaces the former add-then-drop of an 'index' column.)
df = df.reset_index(drop=True)

In [203]:
# Peek at the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,brand,processor_brand,processor_name,ram,ram_type,ssd,clock,gpu,gpu_memory,price
0,SAMSUNG,Intel,Core i7,16 GB,LPDDR4X,512 GB,Max Turbo Frequency up to 5.0 GHz,Intel Integrated Iris Xe,0,"₹86,990"
1,SAMSUNG,Intel,Core i5,8 GB,LPDDR4X,512 GB,Max Turbo Frequency up to 4.6 GHz,Intel Integrated Iris Xe,0,"₹71,990"
2,HP,AMD,Athlon Dual Core,8 GB,DDR4,512 GB,2.3 GHz with Turbo Boost Upto 3.2 GHz,AMD Radeon AMD,0,"₹26,990"
3,HP,Intel,Core i3,8 GB,DDR4,512 GB,"Base Frequency 1.70 Ghz, Max Turbo Boost Frequ...",Intel Integrated UHD,0,"₹35,990"
4,HP,Intel,Core i3,8 GB,DDR4,512 GB,0.9 GHz upto max turbo frequency at 4.4 Ghz,Intel Integrated UHD,0,"₹37,990"


In [204]:
# Summarise column dtypes and non-null counts of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1082 entries, 0 to 1081
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   brand            1082 non-null   object
 1   processor_brand  1082 non-null   object
 2   processor_name   1082 non-null   object
 3   ram              1082 non-null   object
 4   ram_type         1082 non-null   object
 5   ssd              1082 non-null   object
 6   clock            1082 non-null   object
 7   gpu              1082 non-null   object
 8   gpu_memory       1082 non-null   object
 9   price            1082 non-null   object
dtypes: object(10)
memory usage: 84.7+ KB


In [205]:
# Distinct processor brands, in order of first appearance.
pd.unique(df['processor_brand'])

array(['Intel', 'AMD', 'MediaTek', 'ARM'], dtype=object)

In [206]:
# Row count per processor brand, used to spot brands with too few rows.
df.groupby('processor_brand').size()

processor_brand
AMD         304
ARM           1
Intel       776
MediaTek      1
dtype: int64

In [207]:
# Discard the processor brands that have a single row each in the counts above.
excluded_brands = ['ARM', 'MediaTek']
df = df[~df['processor_brand'].isin(excluded_brands)]

In [208]:
# Confirm which processor brands remain after the filter.
pd.unique(df['processor_brand'])

array(['Intel', 'AMD'], dtype=object)

In [209]:
# Renumber rows 0..n-1 after the brand filter left gaps in the index.
# drop=True discards the old index directly instead of adding an 'index'
# column and deleting it again, and avoids mutating a filtered frame in place.
df = df.reset_index(drop=True)

In [210]:
# Row count per RAM type, used to spot RAM types with too few rows.
df.groupby('ram_type').size()

ram_type
DDR          3
DDR3         6
DDR4       731
DDR4X        2
DDR5       174
LPDDR3       1
LPDDR4X     52
LPDDR5     111
dtype: int64

In [211]:
# RAM types to discard; each has only a handful of rows in the counts above.
rare_ram_types = ['DDR', 'DDR3', 'DDR4X', 'LPDDR3']

# Collect index *labels* (not positions) of the rows to remove, grouped by RAM
# type in the order listed above. The next cell passes this list to df.drop,
# which interprets its argument as labels; np.where returned positions, which
# only matched the labels because the index had just been reset.
index = [
    label
    for ram_type in rare_ram_types
    for label in df.index[df['ram_type'] == ram_type].tolist()
]
index

[862, 961, 1014, 164, 356, 709, 717, 723, 749, 71, 679, 148]

In [212]:
# Remove the rows whose labels were collected in `index`, then renumber the
# remaining rows 0..n-1. Chaining without inplace=True keeps the step a single
# expression and discards the old index instead of adding and deleting an
# 'index' column.
df = df.drop(index=index).reset_index(drop=True)

In [213]:
# Confirm which RAM types remain after the rows were dropped.
pd.unique(df['ram_type'])

array(['LPDDR4X', 'DDR4', 'LPDDR5', 'DDR5'], dtype=object)

In [214]:
# Re-check row count, dtypes and non-null counts after the filtering so far.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1068 entries, 0 to 1067
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   brand            1068 non-null   object
 1   processor_brand  1068 non-null   object
 2   processor_name   1068 non-null   object
 3   ram              1068 non-null   object
 4   ram_type         1068 non-null   object
 5   ssd              1068 non-null   object
 6   clock            1068 non-null   object
 7   gpu              1068 non-null   object
 8   gpu_memory       1068 non-null   object
 9   price            1068 non-null   object
dtypes: object(10)
memory usage: 83.6+ KB


In [215]:
# Row count per processor name, used to choose which processors to keep.
df.groupby('processor_name').size()

processor_name
Athlon Dual Core       14
Celeron Dual Core      16
Core i3               148
Core i5               369
Core i7               201
Core i9                32
Dual Core               1
Pentium Quad Core       1
Ryzen 3 Dual Core      18
Ryzen 3 Hexa Core       1
Ryzen 3 Quad Core      21
Ryzen 5                 1
Ryzen 5 Dual Core       2
Ryzen 5 Hexa Core     120
Ryzen 5 Quad Core      27
Ryzen 7 Hexa Core       1
Ryzen 7 Octa Core      80
Ryzen 7 Quad Core       1
Ryzen 9 16 Core         1
Ryzen 9 Octa Core      11
Ryzen Z1 HexaCore       1
Ryzen Z1 Octa Core      1
dtype: int64

In [216]:
# Processor names retained for the rest of the notebook.
processors = [
    'Core i3', 'Core i5', 'Core i7', 'Core i9',
    'Ryzen 3 Dual Core', 'Ryzen 3 Quad Core',
    'Ryzen 5 Hexa Core', 'Ryzen 5 Quad Core',
    'Ryzen 7 Octa Core', 'Ryzen 9 Octa Core',
]

# `p` is a one-element tuple holding the array of row positions whose
# processor_name is in the list above; the next cell reads p[0].
keep_processor = df['processor_name'].isin(processors)
p = np.nonzero(keep_processor.to_numpy())

In [217]:
# Keep only the rows at the positions in p[0] and renumber them 0..n-1.
data = df.iloc[p[0]].reset_index(drop=True)

# Show which processor names survived the selection.
pd.unique(data['processor_name'])

array(['Core i7', 'Core i5', 'Core i3', 'Ryzen 5 Hexa Core',
       'Ryzen 3 Dual Core', 'Ryzen 3 Quad Core', 'Ryzen 7 Octa Core',
       'Ryzen 5 Quad Core', 'Core i9', 'Ryzen 9 Octa Core'], dtype=object)

In [218]:
# Peek at the first five rows of the processor-filtered frame.
data.head(n=5)

Unnamed: 0,brand,processor_brand,processor_name,ram,ram_type,ssd,clock,gpu,gpu_memory,price
0,SAMSUNG,Intel,Core i7,16 GB,LPDDR4X,512 GB,Max Turbo Frequency up to 5.0 GHz,Intel Integrated Iris Xe,0,"₹86,990"
1,SAMSUNG,Intel,Core i5,8 GB,LPDDR4X,512 GB,Max Turbo Frequency up to 4.6 GHz,Intel Integrated Iris Xe,0,"₹71,990"
2,HP,Intel,Core i3,8 GB,DDR4,512 GB,"Base Frequency 1.70 Ghz, Max Turbo Boost Frequ...",Intel Integrated UHD,0,"₹35,990"
3,HP,Intel,Core i3,8 GB,DDR4,512 GB,0.9 GHz upto max turbo frequency at 4.4 Ghz,Intel Integrated UHD,0,"₹37,990"
4,HP,AMD,Ryzen 5 Hexa Core,16 GB,DDR4,512 GB,Up to 4.0 GHz max boost clock,AMD Radeon Radeon Graphics,0,"₹41,990"


In [219]:
# (gpu name, row count) pairs, ordered by gpu name; the groupby runs once.
gpus = list(data.groupby('gpu').size().items())

In [220]:
# Names of GPUs that occur in at least five rows.
gpu = [name for name, count in gpus if count >= 5]

In [221]:
# Display the GPU names that passed the >= 5 rows threshold.
gpu

['AMD Radeon',
 'AMD Radeon AMD',
 'AMD Radeon Radeon',
 'AMD Radeon Radeon Graphics',
 'Intel Integrated Integrated',
 'Intel Integrated Iris Xe',
 'Intel Integrated Iris Xe Graphics',
 'Intel Integrated UHD',
 'Intel Integrated UHD Graphics',
 'NA',
 'NVIDIA GeForce GTX 1650',
 'NVIDIA GeForce MX350',
 'NVIDIA GeForce MX550',
 'NVIDIA GeForce RTX 2050',
 'NVIDIA GeForce RTX 3050',
 'NVIDIA GeForce RTX 3050 Ti',
 'NVIDIA GeForce RTX 3060',
 'NVIDIA GeForce RTX 3070',
 'NVIDIA GeForce RTX 3070 Ti',
 'NVIDIA GeForce RTX 4050',
 'NVIDIA GeForce RTX 4060',
 'NVIDIA GeForce RTX 4070']

In [222]:
# GPU names to keep: the frequent-GPU list displayed above without the 'NA'
# placeholder entry.
mask = [
    'AMD Radeon',
    'AMD Radeon AMD',
    'AMD Radeon Radeon',
    'AMD Radeon Radeon Graphics',
    'Intel Integrated Integrated',
    'Intel Integrated Iris Xe',
    'Intel Integrated Iris Xe Graphics',
    'Intel Integrated UHD',
    'Intel Integrated UHD Graphics',
    'NVIDIA GeForce GTX 1650',
    'NVIDIA GeForce MX350',
    'NVIDIA GeForce MX550',
    'NVIDIA GeForce RTX 2050',
    'NVIDIA GeForce RTX 3050',
    'NVIDIA GeForce RTX 3050 Ti',
    'NVIDIA GeForce RTX 3060',
    'NVIDIA GeForce RTX 3070',
    'NVIDIA GeForce RTX 3070 Ti',
    'NVIDIA GeForce RTX 4050',
    'NVIDIA GeForce RTX 4060',
    'NVIDIA GeForce RTX 4070',
]

# `g` is a one-element tuple holding the array of row positions whose gpu is
# in `mask`; the next cell reads g[0].
keep_gpu = data['gpu'].isin(mask)
g = np.nonzero(keep_gpu.to_numpy())

# Number of rows that will be kept.
len(g[0])

910

In [223]:
# Keep only the rows at the positions in g[0] and renumber them 0..n-1.
data = data.iloc[g[0]].reset_index(drop=True)

In [224]:
# Confirm which GPU names remain after the selection.
pd.unique(data['gpu'])

array(['Intel Integrated Iris Xe', 'Intel Integrated UHD',
       'AMD Radeon Radeon Graphics', 'AMD Radeon AMD',
       'Intel Integrated Integrated', 'NVIDIA GeForce RTX 3050',
       'NVIDIA GeForce RTX 2050', 'NVIDIA GeForce GTX 1650',
       'Intel Integrated UHD Graphics', 'NVIDIA GeForce RTX 4060',
       'AMD Radeon Radeon', 'NVIDIA GeForce RTX 4050',
       'Intel Integrated Iris Xe Graphics', 'NVIDIA GeForce RTX 3060',
       'NVIDIA GeForce RTX 4070', 'NVIDIA GeForce RTX 3050 Ti',
       'NVIDIA GeForce RTX 3070', 'NVIDIA GeForce RTX 3070 Ti',
       'NVIDIA GeForce MX550', 'NVIDIA GeForce MX350', 'AMD Radeon'],
      dtype=object)

In [225]:
# Show five randomly chosen rows (unseeded, so the rows differ on each run).
data.sample(n=5)

Unnamed: 0,brand,processor_brand,processor_name,ram,ram_type,ssd,clock,gpu,gpu_memory,price
684,ASUS,AMD,Ryzen 3 Quad Core,8 GB,LPDDR5,512 GB,2.4 GHz upto max turbo frequency at 4.1 Ghz,AMD Radeon AMD,0,"₹33,990"
656,ASUS,Intel,Core i5,16 GB,DDR4,512 GB,4.5,Intel Integrated Iris Xe Graphics,0,"₹67,990"
73,HP,AMD,Ryzen 5 Hexa Core,16 GB,DDR4,512 GB,4.3,AMD Radeon Radeon Graphics,0,"₹59,100"
483,Lenovo,AMD,Ryzen 7 Octa Core,16 GB,DDR5,1 TB,"Base Frequency 3.6 GHz, Max Turbo Boost at 5.1...",NVIDIA GeForce RTX 4070,8 GB,"₹1,62,990"
688,Lenovo,AMD,Ryzen 5 Hexa Core,16 GB,LPDDR4X,512 GB,"Base Frequency 2.0 GHz, Max Turbo Boost at 4.5...",AMD Radeon AMD,0,"₹64,990"


In [226]:
# Row count, dtypes and non-null counts of the filtered frame before parsing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 910 entries, 0 to 909
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   brand            910 non-null    object
 1   processor_brand  910 non-null    object
 2   processor_name   910 non-null    object
 3   ram              910 non-null    object
 4   ram_type         910 non-null    object
 5   ssd              910 non-null    object
 6   clock            910 non-null    object
 7   gpu              910 non-null    object
 8   gpu_memory       910 non-null    object
 9   price            910 non-null    object
dtypes: object(10)
memory usage: 71.2+ KB


In [227]:
# Turn each price string into an int: skip the first character (the currency
# symbol in the rows shown above) and remove the thousands separators.
price = [int(raw[1:].replace(',', '')) for raw in data['price']]

len(price)

910

In [228]:
# GPU memory as an int: values equal to 0 stay 0; anything else is parsed as
# the number that precedes 'GB'.
gm = [
    0 if mem == 0 else int(mem.split('GB')[0].strip())
    for mem in data['gpu_memory']
]
len(gm)

910

In [229]:
# A number immediately followed by the unit, in any letter case ('GHz', 'Ghz').
_GHZ_VALUE = re.compile(r'(\d+(?:\.\d+)?)\s*ghz', re.IGNORECASE)
# Fallback for entries that carry only a bare number such as '4.5'.
_BARE_NUMBER = re.compile(r'\d+(?:\.\d+)?')


def _parse_clock_ghz(text):
    """Return the clock speed in GHz described by one `clock` entry.

    The last number tagged with a GHz unit is used, regardless of how the unit
    is capitalised, so an entry that lists two speeds always resolves to the
    second one. Entries without any unit fall back to the last bare number
    they contain.

    Raises:
        ValueError: if the entry contains no number at all.
    """
    text = str(text)
    matches = _GHZ_VALUE.findall(text) or _BARE_NUMBER.findall(text)
    if not matches:
        raise ValueError(f'no clock speed found in {text!r}')
    return float(matches[-1])


# Previously the parsing was case-sensitive and relied on a bare `except:`,
# so entries mixing 'GHz' and 'Ghz' produced the first listed speed while
# all-'GHz' entries produced the last; every entry is now handled the same way.
clock = [_parse_clock_ghz(c) for c in data['clock']]

len(clock)

910

In [230]:
# SSD capacity expressed in TB: 'GB' values are divided by 1000 (giving a
# float), 'TB' values are taken as integers. 'GB' is checked first, and
# entries containing neither unit are left out of the list.
ssd = [
    int(size.split('GB')[0].strip()) / 1000
    if 'GB' in size
    else int(size.split('TB')[0].strip())
    for size in data['ssd']
    if 'GB' in size or 'TB' in size
]

len(ssd)

910

In [231]:
# RAM size as an int: the number that precedes 'GB' in each entry.
ram = [int(size.split('GB')[0].strip()) for size in data['ram']]
len(ram)

910

In [232]:
# Build the cleaned frame: a copy of `data` in which the five text columns
# are replaced by the numeric lists parsed in the preceding cells.
final = data.assign(
    ram=ram,
    gpu_memory=gm,
    ssd=ssd,
    clock=clock,
    price=price,
)

In [233]:
# Show five randomly chosen rows of the cleaned frame (unseeded).
final.sample(n=5)

Unnamed: 0,brand,processor_brand,processor_name,ram,ram_type,ssd,clock,gpu,gpu_memory,price
39,HP,Intel,Core i3,8,DDR4,0.512,1.7,Intel Integrated UHD,0,54070
339,ASUS,Intel,Core i5,16,DDR5,1.0,4.6,NVIDIA GeForce RTX 3050,6,94990
214,MSI,Intel,Core i7,8,DDR4,0.512,3.5,Intel Integrated Iris Xe,0,60990
661,MSI,AMD,Ryzen 5 Hexa Core,8,DDR4,0.512,2.0,AMD Radeon AMD,0,35990
821,MSI,Intel,Core i7,16,DDR4,0.512,4.6,NVIDIA GeForce RTX 2050,4,61990


In [234]:
# Write the cleaned frame to laptops.json; no `orient` is passed, so pandas'
# default JSON layout is used.
final.to_json('laptops.json')