In [362]:
import numpy as np
import pandas as pd
import re

# Render every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Load the raw records from raw_data.json into the working DataFrame `data`,
# which the following cells transform in place.
data = pd.read_json('raw_data.json')

def print_all(col_name):
    """Print every value of ``data[col_name]`` on a single line.

    Each value is followed by the separator ``"  |  "`` (including the last
    one); no trailing newline is written. Reads the notebook-level ``data``.
    """
    for value in data[col_name]:
        print(value, end="  |  ")


In [363]:
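# (disabled) Drop sparse columns. Note: thresh=250 keeps only the columns that have
# at least 250 non-null values, which is not the same as "more than 100 nulls" —
# confirm the intended cutoff before re-enabling.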
# data.dropna(thresh=250, axis=1, inplace=True)

In [364]:
# Columns removed from `data` before any feature extraction takes place.
drop_col = [
    'Link', 'Name', 'Brand', 'Dimensions', 'Material', 'Style',
    '3.5mm Jack', 'SMS', 'MMS', 'Loudspeaker', 'MicroSD Slot',
    'Notification Light', 'Made in', 'Sar Value', 'Cover Display',
    'Cover Camera', 'Talk-time', 'Flashlight', 'Phonebook', 'Call Records',
    'Internet', 'Java', 'Games', 'Special Features', 'Special Feature',
    'OS', 'SIM', 'WLAN', 'GPS', 'Radio', 'Weight', 'Size', 'Features',
]
data.drop(columns=drop_col, inplace=True)

# Show the column labels that remain.
data.columns

Index(['Official ✭', 'First Release', 'Colors', 'Network', 'Bluetooth', 'USB',
       'OTG', 'USB Type-C', 'NFC', 'Water Resistance', 'Resolution',
       'Technology', 'Protection', 'Video Recording', 'Type and Capacity',
       'Fast Charging', 'Reverse Charge', 'Operating System', 'Chipset', 'RAM',
       'Processor', 'GPU', 'ROM', 'Fingerprint', 'Face Unlock', 'Sensors',
       'More Features', 'Manufactured by', 'Wireless Charging',
       'Reverse Charging', 'More', 'Back', 'Front', 'Stand-by', 'Audio Format',
       'Video Format', 'CPU', 'Video Playback', 'External Slot', 'Color',
       'Infrared', 'Others', 'Variant', 'Version', 'Other Features',
       'First Release:'],
      dtype='object')

### Preprocessing (v0)

In [365]:
# Preprocess the 'Official ✭' column: take the leading price token of each cell
# and bucket it into an ordinal PriceRange code.

# Inclusive upper bounds of buckets 0..3; any price above the last bound is 4.
#   0 = low, 1 = lower mid, 2 = mid, 3 = higher mid, 4 = high
_PRICE_BOUNDS = (10000, 25000, 40000, 60000)


def _price_to_range(raw):
    """Convert one raw 'Official ✭' cell into its price-range code.

    Parameters
    ----------
    raw : object
        Cell value. When it is a string, its first whitespace-separated token
        is read as the price after removing '৳' and ',' characters.

    Returns
    -------
    float
        Bucket code 0.0-4.0, or NaN when the cell is not a string or is blank.

    Raises
    ------
    ValueError
        If the first token is not an integer once '৳' and ',' are removed.
    """
    if not isinstance(raw, str):
        # Missing value: there is no price to classify.
        return np.nan
    words = raw.split()
    if not words:
        # Blank cell. The row-by-row version silently reused the price of the
        # previous row here (or failed on the very first row).
        return np.nan
    price = int(words[0].replace('৳', '').replace(',', ''))
    # The number of bounds strictly exceeded is exactly the bucket index.
    return float(sum(price > bound for bound in _PRICE_BOUNDS))


# insert the new column at the front (float dtype so NaN can mark unknown prices)
data.insert(0, 'PriceRange', data['Official ✭'].map(_price_to_range).astype(float))

# drop the old column
data.drop('Official ✭', axis=1, inplace=True)


In [366]:
# First Release -> ReleaseYear: the first run of four digits found in the text.
# NOTE(review): `data` also has a separate 'First Release:' column (with a colon)
# that this cell does not read; rows populated only there get the default year —
# confirm whether that is intended.

_DEFAULT_RELEASE_YEAR = 2022              # used when no year can be extracted
_YEAR_PATTERN = re.compile(r'\d\d\d\d')   # compiled once instead of once per row


def _release_year(raw):
    """Return the release year parsed from one 'First Release' cell.

    Parameters
    ----------
    raw : object
        Cell value; only strings are searched.

    Returns
    -------
    int
        The first four-digit number in ``raw``, or ``_DEFAULT_RELEASE_YEAR``
        when ``raw`` is not a string or contains no four-digit run.
    """
    if isinstance(raw, str):
        found = _YEAR_PATTERN.search(raw)
        # search() returns None when the text holds no four-digit run; guard it
        # so such rows fall back to the default instead of raising AttributeError.
        if found:
            return int(found.group())
    return _DEFAULT_RELEASE_YEAR


# insert the new column
data.insert(1, 'ReleaseYear', data['First Release'].map(_release_year).astype(float))

# drop the old column
data.drop('First Release', axis=1, inplace=True)

In [367]:
# AvailableColors: how many comma-separated entries the 'Colors' text lists.
# A cell that is not a string (missing) counts as a single colour.
color_counts = [
    len(cell.split(',')) if isinstance(cell, str) else 1
    for cell in data['Colors']
]

# Stored as floats, in the third column position.
data.insert(2, 'AvailableColors', np.array(color_counts, dtype=float))

# The raw text column is no longer needed.
data.drop(columns='Colors', inplace=True)


In [368]:
# Ram_GB: the largest stand-alone integer token in the 'RAM' text.
# Falls back to 3 when the cell is missing or has no all-digit token.
ram_sizes = []
for cell in data['RAM']:
    if isinstance(cell, str):
        # Only whitespace-separated tokens made purely of digits are considered.
        numeric_tokens = (int(token) for token in cell.split() if token.isdigit())
        ram_sizes.append(max(numeric_tokens, default=3))
    else:
        ram_sizes.append(3)

# Stored as floats, in the fourth column position.
data.insert(3, 'Ram_GB', np.array(ram_sizes, dtype=float))

# The raw text column is no longer needed.
data.drop(columns='RAM', inplace=True)


In [369]:
# NetworkSupport: number of "<digit>G" occurrences (e.g. 2G, 3G, 4G, 5G) in the
# 'Network' text. A missing (non-string) cell counts as 1; a string without any
# such tag counts as 0.
generation_tag = re.compile(r'\dG')

network_counts = data['Network'].map(
    lambda cell: len(generation_tag.findall(cell)) if isinstance(cell, str) else 1
)

# Stored as floats, in the fifth column position.
data.insert(4, 'NetworkSupport', network_counts.astype(float))

# The raw text column is no longer needed.
data.drop(columns='Network', inplace=True)
    


In [370]:
# Bluetooth and USB: pull a "<digit>.<digit>" version number out of the raw text.

# The dot is escaped so that only a literal '.' is accepted between the digits.
# With a bare '.', texts such as "480" or "5,1" also matched, which produced a
# bogus version or made float() raise.
_VERSION_PATTERN = re.compile(r'\d\.\d')


def _extract_version(raw, default):
    """Return the first ``<digit>.<digit>`` number in ``raw`` as a float.

    Parameters
    ----------
    raw : object
        Cell value; only strings are searched.
    default : float
        Value returned when ``raw`` is not a string or holds no version number.

    Returns
    -------
    float
        The parsed version, or ``default``.
    """
    if isinstance(raw, str):
        # Kept in a local named `found`, so the builtin `all` is not rebound
        # at notebook scope.
        found = _VERSION_PATTERN.search(raw)
        if found:
            return float(found.group())
    return default


# Bluetooth (default version 4.0)
data.insert(
    5,
    'BluetoothVersion',
    data['Bluetooth'].map(lambda cell: _extract_version(cell, 4.0)).astype(float),
)

# drop the old column
data.drop('Bluetooth', axis=1, inplace=True)

# USB (default version 2.0); inserted only after 'Bluetooth' has been dropped,
# so the column positions match the original cell order.
data.insert(
    8,
    'USB_Version',
    data['USB'].map(lambda cell: _extract_version(cell, 2.0)).astype(float),
)

# drop the old column
data.drop('USB', axis=1, inplace=True)

In [371]:
# Turn the tick/cross feature columns into integer 0/1 flags.
# Each column is converted in one vectorised comparison instead of a row loop,
# and the result is stored with an integer dtype rather than as Python ints in
# an object column.

# OTG: 1 only when the cell is exactly '✅'; anything else (including missing) is 0.
data['OTG'] = (data['OTG'] == '✅').astype(int)

# Protection: 0 only when the cell is exactly '✖'; any other text, or a missing
# value, counts as protected (1).
data['Protection'] = (data['Protection'] != '✖').astype(int)

# Rename 'USB Type-C' to 'TypeC', then encode it like OTG.
data.rename(columns={'USB Type-C': 'TypeC'}, inplace=True)
data['TypeC'] = (data['TypeC'] == '✅').astype(int)

# NFC: same rule as OTG.
data['NFC'] = (data['NFC'] == '✅').astype(int)

# 'Technology' holds the display technology text; rename it to 'Display'.
data.rename(columns={'Technology': 'Display'}, inplace=True)




In [372]:
# Water Resistance -> WR_version: the first two consecutive digits in the text,
# stored as a float; 0 when the cell is missing or has no two-digit run.
two_digit_run = re.compile(r'\d\d')   # compiled once, outside the loop

wr_values = []
for cell in data['Water Resistance']:
    # The match is kept in `hit`, not `all`, so the builtin all() stays usable
    # in later cells.
    hit = two_digit_run.search(cell) if isinstance(cell, str) else None
    wr_values.append(float(hit.group()) if hit else 0.0)

data.insert(6, 'WR_version', np.array(wr_values, dtype=float))

# drop the old column
data.drop('Water Resistance', axis=1, inplace=True)

In [373]:
# Camera resolution in megapixels: the first integer that appears in the
# 'Resolution' text, stored as a float; 0 when the cell is missing or has no digits.
resolution_digits = re.compile(r'\d+')   # compiled once, outside the loop

camera_mp = []
for cell in data['Resolution']:
    # The match is kept in `found`, not `all`, so the builtin all() stays usable
    # in later cells.
    found = resolution_digits.search(cell) if isinstance(cell, str) else None
    camera_mp.append(float(found.group()) if found else 0.0)

data.insert(2, 'Camera', np.array(camera_mp, dtype=float))

# drop the old column
data.drop('Resolution', axis=1, inplace=True)

In [374]:
# Fast charging power in watts: the first integer that appears in the
# 'Fast Charging' text, stored as a float; 0 when the cell is missing or has no digits.
wattage_digits = re.compile(r'\d+')   # compiled once, outside the loop

charging_watts = []
for cell in data['Fast Charging']:
    # The match is kept in `match`, not `all`, so the builtin all() stays usable
    # in later cells.
    match = wattage_digits.search(cell) if isinstance(cell, str) else None
    charging_watts.append(float(match.group()) if match else 0.0)

data.insert(7, 'FastCharging', np.array(charging_watts, dtype=float))

# drop the old column
data.drop('Fast Charging', axis=1, inplace=True)

In [375]:
# NumOfSensors: how many comma-separated entries the 'Sensors' text lists.
# A cell that is not a string (missing) counts as 1.
sensor_counts = data['Sensors'].map(
    lambda cell: len(cell.split(',')) if isinstance(cell, str) else 1
)

# Stored as floats, in the eighth column position.
data.insert(7, 'NumOfSensors', sensor_counts.astype(float))

# The raw text column is no longer needed.
data.drop(columns='Sensors', inplace=True)

### Preprocessed data

In [376]:
# Display the frame after the steps above (the rendered output elides the middle rows).
data

Unnamed: 0,PriceRange,ReleaseYear,Camera,AvailableColors,Ram_GB,NetworkSupport,BluetoothVersion,NumOfSensors,FastCharging,WR_version,OTG,USB_Version,TypeC,NFC,Display,Protection,Video Recording,Type and Capacity,Reverse Charge,Operating System,Chipset,Processor,GPU,ROM,Fingerprint,Face Unlock,More Features,Manufactured by,Wireless Charging,Reverse Charging,More,Back,Front,Stand-by,Audio Format,Video Format,CPU,Video Playback,External Slot,Color,Infrared,Others,Variant,Version,Other Features,First Release:
0,4.0,2022.0,32.0,4.0,8.0,4.0,5.0,5.0,25.0,68.0,1,3.2,1,1,Dynamic AMOLED 2X Touchscreen,1,"4K (2160p), gyro-EIS",Lithium-polymer 4500 mAh (non-removable),✅ 4.5W Reverse Wireless Charging,Android 12 (One UI 4),Exynos 2100 (5 nm),"Octa core, up to 2.9 GHz",Mali-G78 MP14,128 GB,✅ In-display (optical),✅,"– Samsung Pay (Visa, MasterCard certified)\n– ...",Samsung,,,,,,,,,,,,,,,,,,
1,1.0,2021.0,5.0,2.0,4.0,3.0,5.0,3.0,15.0,0.0,1,2.0,1,0,PLS IPS Touchscreen,0,,Lithium-polymer 5000 mAh (non-removable),,Android 11 (One UI 3.1 Core),MediaTek Helio P35 (12nm),"Octa core, up to 2.35 GHz",PowerVR GE8320,64 GB (eMMC 5.1),✅ Side-mounted,✅,,Samsung,,,,,,,,,,,,,,,,,,
2,3.0,2021.0,32.0,4.0,8.0,4.0,5.0,5.0,25.0,67.0,1,2.0,1,0,Super AMOLED Touchscreen,1,Ultra HD 4K (2160p),Lithium-polymer 4500 mAh (non-removable),,Android 11 (One UI 3.1),Qualcomm Snapdragon 778G 5G (6 nm),"Octa core, up to 2.4 GHz",Adreno 642L,128 GB,✅ In-display (optical),✅,,Samsung,,,,,,,,,,,,,,,,,,
3,1.0,2021.0,13.0,2.0,6.0,3.0,5.0,5.0,15.0,0.0,1,2.0,1,0,Super AMOLED Touchscreen,0,Full HD (1080p),Lithium-polymer 6000 mAh (non-removable),,Android 11 (One UI Core 3.1),Mediatek Helio G80 (12 nm),"Octa core, up to 2.0 GHz",Mali-G52 MC2,128 GB (eMMC 5.1),✅ Side-mounted,✅,,Samsung,,,,,,,,,,,,,,,,,,
4,4.0,2021.0,4.0,3.0,12.0,4.0,5.2,6.0,25.0,30.0,1,3.2,1,1,Foldable Dynamic AMOLED 2X Touchscreen,1,"Ultra HD 4K (2160p), gyro-EIS",Lithium-polymer 4400 mAh (non-removable),,Android 11 (One UI 3.1.1),Qualcomm Snapdragon 888 5G (5 nm),"Octa core, up to 2.84 GHz",Adreno 660,256 / 512 GB (UFS 3.1),✅ Side-mounted,✅,,Samsung,✅ 11W Fast wireless charging,✅ 4.5W Reverse wireless charging,"– Bixby\n– Samsung DeX\n– Samsung Pay (Visa, M...",,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
489,1.0,2018.0,8.0,3.0,3.0,3.0,4.2,4.0,0.0,0.0,1,2.0,0,0,IPS LCD Touchscreen,1,HD (720p),Lithium-ion 2990 mAh (non-removable),,"Android Oreo v8.0, upgradable to Android Pie v9.0",Mediatek MT6750 (28 nm),"Octa-core, up to 1.5 GHz",Mali-T860 MP2,16 / 32 GB (eMMC 5.1),✖,✅,,Nokia,,,,,,,,,,,,,,,,,,
490,2.0,2017.0,13.0,4.0,6.0,3.0,5.0,5.0,18.0,54.0,1,3.1,1,1,IPS LCD Touchscreen,1,Ultra HD (2160p),Lithium-ion 3090 mAh (non-removable),,"Android Nougat v7.1.1, upgradable to Android P...",Qualcomm Snapdragon 835 (10 nm),"Octa-core, up to 2.5 GHz",Adreno 540,64 / 128 GB (UFS 2.1),✅ On the front,✅,,Nokia,,,,,,,,,,,,,,,,,,
491,0.0,2017.0,8.0,4.0,2.0,3.0,4.0,4.0,0.0,0.0,1,2.0,0,0,IPS LCD Touchscreen,1,HD (720p),Lithium-ion 2630 mAh (non-removable),,"Android Nougat v7.0, upgradable to Android Pie...",Mediatek MT6737 (28 nm),"Quad-core, 1.4 GHz",Mali-T720 MP1,16 GB (eMMC 5.0),✖,✖,,Nokia,,,,,,,,,,,,,,,,,,
492,0.0,2017.0,5.0,3.0,1.0,3.0,4.1,3.0,0.0,0.0,1,2.0,0,0,LTPS IPS LCD Touchscreen,1,HD (720p),Lithium-ion 4100 mAh (non-removable),,"Android Nougat v7.1.1, upgradable to Android O...",Qualcomm Snapdragon 212 (28 nm),"Quad-core, 1.3 GHz",Adreno 304,8 GB (eMMC 4.5),✖,✖,,Nokia,,,,,,,,,,,,,,,,,,
