In [1]:
import os
import pandas as pd
from pandas import DataFrame

In [2]:
# Define the folder containing the CSV files
FOLDER_PATH = '../Crawl'
FILE_PATH = '../Data/Data.csv'
CSV_FILE = '../Data/processed_data.csv'

In [3]:
#  # List to hold dataframes
# dfs = []
# 
# # Iterate over all files in the folder
# for filename in os.listdir(FOLDER_PATH):
#     if filename.endswith('.csv'):
#         file_path = os.path.join(FOLDER_PATH, filename)
#         # Read the CSV file into a dataframe
#         df = pd.read_csv(file_path)
#         # Append the dataframe to the list
#         dfs.append(df)
# 
# Concatenate all dataframes
# combined_df = pd.concat(dfs, ignore_index=True)
combined_df = pd.read_csv(FILE_PATH)

# Drop unnecessary columns 
# combined_df = combined_df.drop(columns=['page', 'name', 'link', 'image'])

In [4]:
def print_unique_values_with_counts(df: DataFrame, column: str, prefix: str):
    print(prefix)
    print(f"Column: {column}")
    print("Unique Values and Counts:")
    print(df[column].value_counts())
    print(f"Total Count: {df[column].count()}")
    print("-" * 100)

In [5]:
from mapping_brand import brand_mapping

def process_brand(data: DataFrame, column='brand') -> DataFrame:
    data[column] = data[column].replace(brand_mapping)
    data = data[~data[column].isin(['Prestige', 'kstation', 'gaming', 'Workstation', 'Samsung', 'iPad'])]
    
    return data

In [6]:
from mapping_cpu import cpu_mapping

def process_cpu(data: DataFrame, column='cpu') -> DataFrame:
    data[column] = data[column].str.replace("CPU ", "", regex=False)
    data[column] = data[column].replace(cpu_mapping)
    return data

In [7]:
from mapping_cpu_brand import cpu_brand_mapping

def process_cpu_brand(data: DataFrame, column='cpu_brand') -> DataFrame:
    data[column] = data[column].replace(cpu_brand_mapping)
    data = data[~data[column].isin(['CPU', 'Chip'])]
    
    return data

In [8]:
def process_ram_capacity(data: DataFrame, column='ram_capacity') -> DataFrame:
    data[column] = data[column].str.extract(r"(\d+)").astype(float)
    return data

In [9]:
from mapping_ram_brand import ram_brand_mapping

def process_ram_brand(data: DataFrame, column='ram_brand') -> DataFrame:
    data[column] = data[column].str.strip().replace(ram_brand_mapping)
    data = data[~data[column].isin(["3200", "5600", "4800", "5200", "7467", "6400", "7500", "1TB", "Soldered"])]
    return data

In [10]:
def process_hard_drive_capacity(data: DataFrame, column='hard_drive_capacity') -> DataFrame:
    data[column] = data[column].str.extract(r"(\d+)")[0].astype(float).apply(lambda x: f"{int(x)}TB" if x in [1, 2, 4] else ("1TB" if x == 1024 else (f"{int(x)}GB" if x in [128, 256, 512] else None)))
    return data

In [11]:
def process_hard_drive_type(data: DataFrame, column='hard_drive_type') -> DataFrame:
    data = data[data[column].isin(['SSD', 'HDD'])]
    return data

In [12]:
from mapping_card import card_mapping

def process_card(data: DataFrame, column='card') -> DataFrame:
    data[column] = data[column].str.replace("Card rời ", "", regex=False)
    data[column] = data[column].str.replace("Nvidia", "NVIDIA", regex=False)
    data[column] = data[column].str.replace("NVidia", "NVIDIA", regex=False)
    data[column] = data[column].str.replace("Geforce", "GeForce", regex=False)
    data[column] = data[column].str.replace(
        r'^(GeForce|Quadro)',
        'NVIDIA GeForce',
        regex=True
    )
    data[column] = data[column].replace(card_mapping)
    return data

In [13]:
from mapping_card_brand import card_brand_mapping

def process_card_brand(data: DataFrame, column='card_brand') -> DataFrame:
    data[column] = data[column].str.replace("Nvidia", "NVIDIA", regex=False)
    data[column] = data[column].str.replace("NVidia", "NVIDIA", regex=False) 
    data[column] = data[column].replace(card_brand_mapping)
    return data 

In [14]:
#Process card
print_unique_values_with_counts(combined_df, column='card', prefix='BEFORE')
combined_df = process_card(combined_df)
print_unique_values_with_counts(combined_df, column='card', prefix='AFTER')

BEFORE
Column: card
Unique Values and Counts:
card
Intel Iris Xe Graphics                         441
Intel Arc Graphics                             264
Intel UHD Graphics                             217
Intel Graphics                                 186
NVIDIA GeForce RTX 4060                         89
                                              ... 
Intel Iris Xe Graphics | Intel UHD Graphics      1
Intel UHD Graphics | NVIDIA Quadro P520          1
NVIDIA GeForce RTX 4060 6GB GDDR6                1
Apple GPU 16 Core                                1
NVIDIA GeForce RTX3080 16GB                      1
Name: count, Length: 192, dtype: int64
Total Count: 2203
----------------------------------------------------------------------------------------------------
AFTER
Column: card
Unique Values and Counts:
card
Intel Iris Xe Graphics                         442
Intel Arc Graphics                             264
Intel UHD Graphics                             217
Intel Graphics             

In [15]:
# Process brand
print_unique_values_with_counts(combined_df, column='brand', prefix='BEFORE')
combined_df = process_brand(combined_df)
print_unique_values_with_counts(combined_df, column='brand', prefix='AFTER')

BEFORE
Column: brand
Unique Values and Counts:
brand
Lenovo       696
Asus         356
HP           336
Dell         326
Acer         230
MSI          168
Macbook       38
LG            35
Gaming        17
Microsoft     10
Name: count, dtype: int64
Total Count: 2212
----------------------------------------------------------------------------------------------------
AFTER
Column: brand
Unique Values and Counts:
brand
Lenovo       696
Asus         356
HP           336
Dell         326
Acer         230
MSI          168
Macbook       38
LG            35
Gaming        17
Microsoft     10
Name: count, dtype: int64
Total Count: 2212
----------------------------------------------------------------------------------------------------


In [16]:
# Process cpu
print_unique_values_with_counts(combined_df, column='cpu', prefix='BEFORE')
combined_df = process_cpu(combined_df)
print_unique_values_with_counts(combined_df, column='cpu', prefix='AFTER')

BEFORE
Column: cpu
Unique Values and Counts:
cpu
Intel Core Ultra 7 155H    174
Intel Core i71355U          86
Intel Core Ultra 5 125H     74
Intel Core i51335U          68
Intel Core Ultra 5 125U     40
                          ... 
AMD Ryzen 5 6600H            1
Apple M4                     1
Apple M4 (10-core)           1
Intel Core i5-1334U          1
AMD Ryzen 5 4650U            1
Name: count, Length: 355, dtype: int64
Total Count: 2206
----------------------------------------------------------------------------------------------------
AFTER
Column: cpu
Unique Values and Counts:
cpu
Intel Core Ultra 7 155H     174
Intel Core i71355U           86
Intel Core Ultra 5 125H      74
Intel Core i51335U           68
Intel Core Ultra 5 125U      40
                           ... 
Apple M4                      1
Apple M4 (10-core)            1
Intel Core i5-1334U           1
Intel Core  Ultra 5 125H      1
AMD Ryzen 5 4650U             1
Name: count, Length: 352, dtype: int64
Total Count: 

In [17]:
# Process cpu_brand
print_unique_values_with_counts(combined_df, column='cpu_brand', prefix='BEFORE')
combined_df = process_cpu_brand(combined_df)
print_unique_values_with_counts(combined_df, column='cpu_brand', prefix='AFTER')

BEFORE
Column: cpu_brand
Unique Values and Counts:
cpu_brand
Intel       1862
AMD          286
Apple         36
Qualcomm      16
Ryzen         13
Name: count, dtype: int64
Total Count: 2213
----------------------------------------------------------------------------------------------------
AFTER
Column: cpu_brand
Unique Values and Counts:
cpu_brand
Intel       1862
AMD          299
Apple         36
Qualcomm      16
Name: count, dtype: int64
Total Count: 2213
----------------------------------------------------------------------------------------------------


In [18]:
# # Process ram_capacity
# print_unique_values_with_counts(combined_df, column='ram_capacity', prefix='BEFORE')
# combined_df = process_ram_capacity(combined_df)
# print_unique_values_with_counts(combined_df, column='ram_capacity', prefix='AFTER')

In [19]:
# Process ram_brand
print_unique_values_with_counts(combined_df, column='ram_brand', prefix='BEFORE')
combined_df = process_ram_brand(combined_df)
print_unique_values_with_counts(combined_df, column='ram_brand', prefix='AFTER')

BEFORE
Column: ram_brand
Unique Values and Counts:
ram_brand
DDR5       1143
DDR4        601
LPDDR5      163
LPDDR5X     123
LPDDR4X      36
LPDDR4       12
LPDDR3       11
LDDR4         1
Name: count, dtype: int64
Total Count: 2090
----------------------------------------------------------------------------------------------------
AFTER
Column: ram_brand
Unique Values and Counts:
ram_brand
DDR5       1143
DDR4        601
LPDDR5      163
LPDDR5X     123
LPDDR4X      36
LPDDR4       12
LPDDR3       11
LDDR4         1
Name: count, dtype: int64
Total Count: 2090
----------------------------------------------------------------------------------------------------


In [20]:
# Process hard_drive_capacity
print_unique_values_with_counts(combined_df, column='hard_drive_capacity', prefix='BEFORE')
combined_df = process_hard_drive_capacity(combined_df)
print_unique_values_with_counts(combined_df, column='hard_drive_capacity', prefix='AFTER')

BEFORE
Column: hard_drive_capacity
Unique Values and Counts:
hard_drive_capacity
512GB     1130
1TB        559
512 GB     201
256GB       98
1 TB        71
2TB         50
256 GB      11
4TB          5
2 TB         4
128GB        4
Name: count, dtype: int64
Total Count: 2133
----------------------------------------------------------------------------------------------------
AFTER
Column: hard_drive_capacity
Unique Values and Counts:
hard_drive_capacity
512GB    1331
1TB       630
256GB     109
2TB        54
4TB         5
128GB       4
Name: count, dtype: int64
Total Count: 2133
----------------------------------------------------------------------------------------------------


In [21]:
# Process hard_drive_type
print_unique_values_with_counts(combined_df, column='hard_drive_type', prefix='BEFORE')
combined_df = process_hard_drive_type(combined_df)
print_unique_values_with_counts(combined_df, column='hard_drive_type', prefix='AFTER')

BEFORE
Column: hard_drive_type
Unique Values and Counts:
hard_drive_type
SSD    2179
512      12
1TB       5
9Z2       3
256       1
2TB       1
Name: count, dtype: int64
Total Count: 2201
----------------------------------------------------------------------------------------------------
AFTER
Column: hard_drive_type
Unique Values and Counts:
hard_drive_type
SSD    2179
Name: count, dtype: int64
Total Count: 2179
----------------------------------------------------------------------------------------------------


In [22]:
# Process card 
print_unique_values_with_counts(combined_df, column='card', prefix='BEFORE')
combined_df = process_card(combined_df)
print_unique_values_with_counts(combined_df, column='card', prefix='AFTER')

BEFORE
Column: card
Unique Values and Counts:
card
Intel Iris Xe Graphics                     439
Intel Arc Graphics                         259
Intel UHD Graphics                         215
Intel Graphics                             184
NVIDIA GeForce RTX 4060                     88
                                          ... 
NVIDIA GeForce RTX 3050Ti                    1
AMD Radeon RX7600S 8GB GDDR6                 1
Intel UHD Graphics | NVIDIA Quadro P520      1
NVIDIA GeForce RTX 4060 6GB GDDR6            1
NVIDIA GeForce RTX3080 16GB                  1
Name: count, Length: 186, dtype: int64
Total Count: 2169
----------------------------------------------------------------------------------------------------
AFTER
Column: card
Unique Values and Counts:
card
Intel Iris Xe Graphics                     439
Intel Arc Graphics                         259
Intel UHD Graphics                         215
Intel Graphics                             184
NVIDIA GeForce RTX 4060             

In [23]:
# Process card_brand
print_unique_values_with_counts(combined_df, column='card_brand', prefix='BEFORE')
combined_df = process_card_brand(combined_df)
print_unique_values_with_counts(combined_df, column='card_brand', prefix='AFTER')

BEFORE
Column: card_brand
Unique Values and Counts:
card_brand
Intel         1101
Nvidia         456
NVIDIA         177
AMD            118
Apple           79
RTX             73
GeForce         49
Card            36
Iris            14
Quadro          13
Geforce          8
Qualcomm         7
UHD              7
GPU              5
Integrated       4
Qualcom          3
Radeon           3
Arc              2
30-core          2
SD               2
NVidia           2
Không            1
Adreno           1
10-core          1
RX               1
8-Core           1
14-core          1
18-core          1
eForce           1
Name: count, dtype: int64
Total Count: 2169
----------------------------------------------------------------------------------------------------
AFTER
Column: card_brand
Unique Values and Counts:
card_brand
Intel       1124
NVIDIA       779
AMD          122
Apple         79
Qualcomm      11
Name: count, dtype: int64
Total Count: 2115
--------------------------------------------------

In [24]:
# Save the combined dataframe to a new CSV file
combined_df.to_csv(CSV_FILE, index=False)