In [1]:
import pandas as pd

# Pré-processamento dos Dados

### Carregar Base de dados

In [2]:
laptop_price = pd.read_csv("../datasets/raw/laptop_price.csv", encoding="ISO-8859-1")
laptop_price.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


### Verificar tipo das colunas e dropar coluna laptop_id

In [3]:
laptop_price = laptop_price.rename(columns={'ScreenResolution': 'screen_resolution'})
laptop_price.columns = laptop_price.columns.str.lower()
laptop_price = laptop_price.drop(columns='laptop_id')
laptop_price.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   company            1303 non-null   object 
 1   product            1303 non-null   object 
 2   typename           1303 non-null   object 
 3   inches             1303 non-null   float64
 4   screen_resolution  1303 non-null   object 
 5   cpu                1303 non-null   object 
 6   ram                1303 non-null   object 
 7   memory             1303 non-null   object 
 8   gpu                1303 non-null   object 
 9   opsys              1303 non-null   object 
 10  weight             1303 non-null   object 
 11  price_euros        1303 non-null   float64
dtypes: float64(2), object(10)
memory usage: 904.4 KB


### Verificar quantidade de valores únicos

In [4]:
print("Quantidade de Valores únicos por coluna")
for column in laptop_price.columns:
    print(f"{column}: {len(laptop_price[column].unique())}")

Quantidade de Valores únicos por coluna
company: 19
product: 618
typename: 6
inches: 18
screen_resolution: 40
cpu: 118
ram: 9
memory: 39
gpu: 110
opsys: 9
weight: 179
price_euros: 791


### Verificar valores únicos na coluna memory

In [5]:
laptop_price.memory.unique()

array(['128GB SSD', '128GB Flash Storage', '256GB SSD', '512GB SSD',
       '500GB HDD', '256GB Flash Storage', '1TB HDD',
       '32GB Flash Storage', '128GB SSD +  1TB HDD',
       '256GB SSD +  256GB SSD', '64GB Flash Storage',
       '256GB SSD +  1TB HDD', '256GB SSD +  2TB HDD', '32GB SSD',
       '2TB HDD', '64GB SSD', '1.0TB Hybrid', '512GB SSD +  1TB HDD',
       '1TB SSD', '256GB SSD +  500GB HDD', '128GB SSD +  2TB HDD',
       '512GB SSD +  512GB SSD', '16GB SSD', '16GB Flash Storage',
       '512GB SSD +  256GB SSD', '512GB SSD +  2TB HDD',
       '64GB Flash Storage +  1TB HDD', '180GB SSD', '1TB HDD +  1TB HDD',
       '32GB HDD', '1TB SSD +  1TB HDD', '512GB Flash Storage',
       '128GB HDD', '240GB SSD', '8GB SSD', '508GB Hybrid', '1.0TB HDD',
       '512GB SSD +  1.0TB Hybrid', '256GB SSD +  1.0TB Hybrid'],
      dtype=object)

### Criar colunas ssd, hd

In [6]:
# Pegar os tipos de armazenamento de cada linha
tipo_armazenamento = []

for row in laptop_price['memory']:
    row = row.lower()
        
    if 'hybrid' in row and 'ssd' in row:
        tipo_armazenamento.append("Hybrid + SSD")
    elif 'hybrid' in row and 'HDD' in row:
        tipo_armazenamento.append("Hybrid + HDD")
    elif 'hdd' in row and 'ssd' in row:
        tipo_armazenamento.append("HDD + SSD")
    elif 'flash storage' in row and 'hdd' in row:
        tipo_armazenamento.append('flash storage + HDD')
    elif 'flash storage' in row and 'ssd' in row:
        tipo_armazenamento.append('flash storage + SSD')
    elif 'hdd' in row:
        tipo_armazenamento.append("HDD")
    elif 'flash storage' in row:
        tipo_armazenamento.append('Flash Storage')
    elif 'ssd' in row:
        tipo_armazenamento.append('SSD')
    elif 'hybrid' in row:
        tipo_armazenamento.append('Hybrid')
    
laptop_price['storage_type'] = tipo_armazenamento

### Filtrar coluna memory and ram

In [7]:
def filter_memory(row):     
    disposable_words = ['+', "ssd", "hdd", "hybrid", "flash", "storage"] 
    
    row_processed = [word for word in row.lower().split() if word not in disposable_words]
    
    for i, word in enumerate(row_processed):
        if "gb" in word.lower():
            row_processed[i] = row_processed[i].replace("gb", "")
        elif "tb" in word.lower():
            if "1.0" in word.lower():
                row_processed[i] = row_processed[i].replace("1.0tb", "1000")
            else:
                row_processed[i] = row_processed[i].replace("tb", "000")
            
    row_processed = list(map(int, row_processed))
    row_processed = sum(row_processed)
    
    return row_processed
            
filter_memory(laptop_price['memory'][5])

laptop_price['memory'] = laptop_price.memory.apply(filter_memory)
laptop_price['ram'] = laptop_price.ram.apply(filter_memory)

### Filtrar coluna screen_resolution (Em dúvida)

In [8]:
def filter_resolution(row):
    row_processed = row.split()
    
    if len(row_processed) > 1:
        row_processed = row_processed[len(row_processed) - 1]
    else:
        row_processed = row_processed[0]

    return row_processed

#laptop_price['screen_resolution'] = laptop_price['screen_resolution'].apply(filter_resolution)

### Filtrar Coluna Weight e Renomear Colunas

In [9]:
laptop_price['weight'] = laptop_price['weight'].apply(lambda row: row.lower().replace("kg", ""))
laptop_price = laptop_price.rename(columns={'ram': 'ram(gb)', 'memory': 'memory_total(gb)', 'weight': 'weight(kg)'})
laptop_price.head()

Unnamed: 0,company,product,typename,inches,screen_resolution,cpu,ram(gb),memory_total(gb),gpu,opsys,weight(kg),price_euros,storage_type
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128,Intel Iris Plus Graphics 640,macOS,1.37,1339.69,SSD
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128,Intel HD Graphics 6000,macOS,1.34,898.94,Flash Storage
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256,Intel HD Graphics 620,No OS,1.86,575.0,SSD
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512,AMD Radeon Pro 455,macOS,1.83,2537.45,SSD
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256,Intel Iris Plus Graphics 650,macOS,1.37,1803.6,SSD


### Criar colunas cpu e gpu manufacturer

In [10]:
gpu_manufacturer = []
cpu_manufacturer = []

for cpu, gpu in zip(laptop_price.cpu, laptop_price.gpu):
    cpu, gpu = cpu.split()[0], gpu.split()[0]
    cpu_manufacturer.append(cpu), gpu_manufacturer.append(gpu)
    
laptop_price['cpu_manufacturer'] = cpu_manufacturer
laptop_price['gpu_manufacturer'] = gpu_manufacturer

### ver valores de máximo e minimo

In [11]:
laptop_price.describe()

Unnamed: 0,inches,ram(gb),memory_total(gb),price_euros
count,1303.0,1303.0,1303.0,1303.0
mean,15.017191,8.382195,610.904068,1123.686992
std,1.426304,5.084665,467.509021,699.009043
min,10.1,2.0,8.0,174.0
25%,14.0,4.0,256.0,599.0
50%,15.6,8.0,500.0,977.0
75%,15.6,8.0,1000.0,1487.88
max,18.4,64.0,2512.0,6099.0


### Fazer downcasting das colunas

In [12]:
to_float16 = ['weight(kg)', 'price_euros']
to_int16 = ['ram(gb)', 'memory_total(gb)', 'inches']
to_category = ['company', 'typename', 'screen_resolution', 'cpu', 'gpu', 'opsys', 'product', 'storage_type',
               'gpu_manufacturer', 'cpu_manufacturer']

def downcasting(df, columns, type):
    for column in columns:
        df[column] = df[column].astype(type)

downcasting(laptop_price, to_float16, 'float16')
downcasting(laptop_price, to_int16, 'int16')
downcasting(laptop_price, to_category, 'category')

### Ver Redução de memoria

In [13]:
laptop_price.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   company            1303 non-null   category
 1   product            1303 non-null   category
 2   typename           1303 non-null   category
 3   inches             1303 non-null   int16   
 4   screen_resolution  1303 non-null   category
 5   cpu                1303 non-null   category
 6   ram(gb)            1303 non-null   int16   
 7   memory_total(gb)   1303 non-null   int16   
 8   gpu                1303 non-null   category
 9   opsys              1303 non-null   category
 10  weight(kg)         1303 non-null   float16 
 11  price_euros        1303 non-null   float16 
 12  storage_type       1303 non-null   category
 13  cpu_manufacturer   1303 non-null   category
 14  gpu_manufacturer   1303 non-null   category
dtypes: category(10), float16(2), int16(3)
memory usage: 123

### Salvar para Pickle

In [14]:
laptop_price.to_pickle("../datasets/processed/laptop_price.pkl")