In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as seb 
from collections import Counter
from zipfile import ZipFile
import re

In [62]:
# downloading datafile
df = pd.read_csv('laptop_data.csv')
index_column = df.columns.values[0]
df.drop(labels=[index_column] , axis = 1 , inplace=True)
df.head()

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [63]:
# charaterizing each column with the correct value

# the company column is good as it is
# type columns is also good as it is 
# inches is also in good cond

# can be split into type of hd (full , ultra, +) / is_touchscreen / is 4k / has_ips / resolution(aXb)

def separate_resolution(data):
    match = re.search(r'(Quad HD\+)|(Full)|(Ultra)', data)
    HD_status = next((val for val in match.groups() if val), None) if match else None

    is_4k = bool(re.search(r'4k', data, re.IGNORECASE))
    has_ips = bool(re.search(r'IPS', data, re.IGNORECASE))
    is_touchscreen = bool(re.search(r'Touchscreen', data, re.IGNORECASE))

    res_match = re.search(r'(\d{3,5})x(\d{3,5})', data)
    res_x, res_y = res_match.groups() if res_match else (None, None)

    return pd.Series([HD_status, is_4k, has_ips, is_touchscreen, res_x, res_y])

separate_resolution('Full 4k HD / Touchscreen 1920x1080')

0     Full
1     True
2    False
3     True
4     1920
5     1080
dtype: object

In [64]:
# the cpu status can be modelled with 
# if possible generation of processor(if possible) / company / clock speed

def seperate_cpu(data): 
    # extracting the generation of the processor is absolute mad man
    company = data.split(' ')[0]
    data = data.replace('GHz','')  
    clock_speed = data.split(' ')[-1]
    return pd.Series([company , clock_speed])

def serperate_ram(data): 
    return pd.Series(data.replace('GB',''))

seperate_cpu('Intel Core i5 7200U 2GHz')

0    Intel
1        2
dtype: object

In [65]:
# this can defenetly split into a lot of classes
def process_memory(memory):
    if(memory):
        value = memory.split(' ')[0]
        type = memory.split(' ')[1]
        return [value , type] 
    else: 
        return [0 , 'useless'] 

def seperate_memory(data): 
    data = data.replace('.0','')
    data = data.replace('TB' ,'000')    
    data = data.replace('GB' , '')
    data = data.replace('h S' , 'hS')
    data = data.split('+')
    primary_memory = data[0].strip()
    secondary_memory = None
    if(len(data) == 2): 
        secondary_memory = data[1].strip()
    return pd.Series(process_memory(primary_memory)+(process_memory(secondary_memory)))
seperate_memory('512GB SSD +  1.0TB Hybrid') 


0       512
1       SSD
2      1000
3    Hybrid
dtype: object

In [66]:
df['Gpu'] = df['Gpu'].apply(lambda s : s.split()[0])
Counter(df['Gpu'])
# this is enough for the Gpu side since finding the generation would be very hard

Counter({'Intel': 722, 'Nvidia': 400, 'AMD': 180, 'ARM': 1})

In [67]:
df['Weight'] = df['Weight'].apply(lambda a : a.replace('kg',''))
df['Weight']
# weights are turned into actual numbers

0       1.37
1       1.34
2       1.86
3       1.83
4       1.37
        ... 
1298     1.8
1299     1.3
1300     1.5
1301    2.19
1302     2.2
Name: Weight, Length: 1303, dtype: object

In [68]:
# converting the whole df into a completed one 
resolution = ['HD_type' , 'is_4k', 'has_ips' ,'is_touchscreen', 'resolution1' , 'resolution2']
Memory_ = ['primary_mem_val','primary_mem_type','secondary_mem_val','secondary_mem_type']
ram = ['ram_val']
cpu = ['type','clock_speed']

df[resolution] = df['ScreenResolution'].apply(separate_resolution)
df[Memory_] = df['Memory'].apply(seperate_memory)
df[ram] = df['Ram'].apply(serperate_ram)
df[cpu] = df['Cpu'].apply(seperate_cpu)

df.drop(['ScreenResolution','Memory','Ram','Cpu'] , axis = 1 , inplace=True)



In [70]:
Counter(df['is_4k'])
keys = df.columns.values
for val in keys: 
    print(Counter(df[val]))

Counter({'Dell': 297, 'Lenovo': 297, 'HP': 274, 'Asus': 158, 'Acer': 103, 'MSI': 54, 'Toshiba': 48, 'Apple': 21, 'Samsung': 9, 'Razer': 7, 'Mediacom': 7, 'Microsoft': 6, 'Xiaomi': 4, 'Vero': 4, 'Chuwi': 3, 'Google': 3, 'Fujitsu': 3, 'LG': 3, 'Huawei': 2})
Counter({'Notebook': 727, 'Gaming': 205, 'Ultrabook': 196, '2 in 1 Convertible': 121, 'Workstation': 29, 'Netbook': 25})
Counter({15.6: 665, 14.0: 197, 13.3: 164, 17.3: 164, 12.5: 39, 11.6: 33, 12.0: 6, 13.5: 6, 13.9: 6, 12.3: 5, 15.4: 4, 10.1: 4, 15.0: 4, 13.0: 2, 18.4: 1, 17.0: 1, 14.1: 1, 11.3: 1})
Counter({'Intel': 722, 'Nvidia': 400, 'AMD': 180, 'ARM': 1})
Counter({'Windows 10': 1072, 'No OS': 66, 'Linux': 62, 'Windows 7': 45, 'Chrome OS': 27, 'macOS': 13, 'Mac OS X': 8, 'Windows 10 S': 8, 'Android': 2})
Counter({'2.2': 121, '2.1': 58, '2.4': 44, '2.3': 41, '2.5': 38, '2': 35, '2.8': 28, '1.2': 26, '1.86': 25, '2.04': 24, '1.4': 24, '2.18': 23, '1.9': 21, '1.5': 21, '1.6': 20, '1.7': 18, '1.95': 18, '1.3': 17, '1.8': 17, '1.1': 1