In [3]:
import os
import re
import pandas as pd
from pandas import DataFrame

In [4]:
# Define the folder containing the CSV files
FOLDER_PATH = '../Dai'
CSV_FILE = '../dai_combined.csv'

In [5]:
 # List to hold dataframes
dfs = []

# Iterate over all files in the folder
for filename in os.listdir(FOLDER_PATH):
    if filename.endswith('.csv'):
        file_path = os.path.join(FOLDER_PATH, filename)
        # Read the CSV file into a dataframe
        df = pd.read_csv(file_path)
        # Append the dataframe to the list
        dfs.append(df)

# Concatenate all dataframes
combined_df = pd.concat(dfs, ignore_index=True)

# Drop unnecessary columns 
combined_df = combined_df.drop(columns=['page', 'name', 'link'])

In [6]:
combined_df.head()

Unnamed: 0,brand,price,old,new,cpu,cpu_brand,ram_capacity,ram_brand,hard_drive_type,hard_drive_capacity,card,card_brand,screen_size,screen_type
0,Laptop Asus,26990000.0,0,1,Intel Core Ultra 5 125H,Intel,16.0,DDR5,SSD,512GB,Intel Arc Graphics,Intel,14.0,OLED
1,Lenovo,16390000.0,0,1,AMD Ryzen 77730U,AMD,16.0,DDR4,SSD,512GB,AMD Radeon Graphics,AMD,15.6,HD
2,Laptop ASUS,28990000.0,0,1,Intel Core Ultra 5 125H,Intel,16.0,DDR5,SSD,512GB,Intel Arc Graphics,Intel,14.0,OLED
3,Laptop LG,36290000.0,0,1,Intel Core Ultra 5 125H,Intel,16.0,DDR5,SSD,512GB,Intel Arc graphics,Intel,14.0,IPS
4,Laptop Dell,18990000.0,0,1,Intel Core 5 processor 120U,Intel,8.0,DDR5,SSD,512GB,Intel Graphics,Intel,14.0,HD


In [7]:
def print_unique_values_with_counts(df: DataFrame, column: str, prefix: str):
    print(prefix)
    print(f"Column: {column}")
    print("Unique Values and Counts:")
    print(df[column].value_counts())
    print(f"Total Count: {df[column].count()}")
    print("-" * 100)

In [8]:
from mapping_brand import brand_mapping

def process_brand(data: DataFrame, column='brand') -> DataFrame:
    data[column] = data[column].str.replace('Laptop ', '', regex=False)
    data = data[~data[column].isin(['Máy', 'Creator', 'Stealth', 'Gaming', 'Laptop', 'Samsung','Aspire'])]
    data[column] = data[column].replace(brand_mapping) 

    return data

In [9]:
from mapping_cpu import cpu_mapping

def process_cpu(data: DataFrame, column='cpu') -> DataFrame:
    data['cpu'] = data['cpu'].astype(str)
    data['cpu'] = data['cpu'].replace(cpu_mapping)
    data['cpu'] = data['cpu'].apply(lambda x: re.sub(r'[\xa0]', ' ', x) if isinstance(x, str) else x)
    data['cpu'] = data['cpu'].apply(lambda x: x.strip())
    return data

In [10]:
from mapping_card import card_mapping

def process_card(data: DataFrame, column='card') -> DataFrame:
    data['card'] = data['card'].replace(card_mapping)
    data['card'] = data['card'].astype(str)
    data['card'] = data['card'].apply(lambda x: re.sub(r'(Card rời\-\s?|Card tích hợp\-\s?)', '', x) if isinstance(x, str) else x)
    data['card'] = data['card'].apply(lambda x: re.sub(r'[\xa0]', ' ', x) if isinstance(x, str) else x)
    data['card'] = data['card'].apply(lambda x: x.strip())
    data = data[~data['card'].isin(['Một cp su ty chọn. Dễ dng kết nối với mn hnh 4K hoặc 1080P ngoi bằng HDMI hoặc VGA mạng RJ45 c dy hai thiết bị USBA v nguồn điện USBC  tất cả thng qua một cp USBC duy nhất ẩn bn trong thiết kế tối giản',
     'Intel UHD Graphics (Iris Xe Graphics chỉ hoạt động với RAM kênh đôi)''])]
    return data

SyntaxError: unterminated string literal (detected at line 10) (2235943121.py, line 10)

In [None]:
from mapping_screen_type import screen_type_mapping

def process_screen_type(data: DataFrame, column='screen_type') -> DataFrame:
    data['screen_type'] = data['screen_type'].replace(screen_type_mapping)
    return data

In [None]:
#Process screen type
print_unique_values_with_counts(combined_df, column='screen_type', prefix='BEFORE')
combined_df = process_screen_type(combined_df)
print_unique_values_with_counts(combined_df, column='screen_type', prefix='AFTER')


BEFORE
Column: screen_type
Unique Values and Counts:
screen_type
HD         509
IPS        287
Full HD    169
OLED        54
8K          51
2K          44
WUXGA       40
2.8K        35
Retina      22
QHD          9
TFT          6
3.2K         2
4K/UHD       2
WQHD         1
2.2K         1
2.5K         1
Name: count, dtype: int64
Total Count: 1233
----------------------------------------------------------------------------------------------------
AFTER
Column: screen_type
Unique Values and Counts:
screen_type
HD         509
IPS        287
Full HD    169
OLED        54
8K          51
2K          44
WUXGA       40
2.8K        35
Retina      22
QHD          9
TFT          6
3.2K         2
4K/UHD       2
WQHD         1
2.2K         1
2.5K         1
Name: count, dtype: int64
Total Count: 1233
----------------------------------------------------------------------------------------------------


In [None]:
print(combined_df['screen_type'].unique())

['OLED' 'HD' 'IPS' '8K' '2K' nan 'TFT' 'WQHD' 'Full HD' 'Retina' 'WUXGA'
 '2.8K' '3.2K' 'QHD' '2.2K' '2.5K' '4K/UHD']


In [None]:
#Process card
print_unique_values_with_counts(combined_df, column='card', prefix='BEFORE')
combined_df = process_card(combined_df)
print_unique_values_with_counts(combined_df, column='card', prefix='AFTER')
# print(combined_df['card'].unique())

BEFORE
Column: card
Unique Values and Counts:
card
Intel Iris Xe Graphics                      185
Intel UHD Graphics                          123
Intel Arc Graphics                          105
Intel Graphics                              102
NVIDIA GeForce RTX 4060 8GB GDDR6            76
                                           ... 
Card tích hợp- 18 nhân GPU                    1
Card rời- NVIDIA GeForce RTX A500, 4 GB       1
Card rời- NVIDIA GeForce RTX 4090, 16 GB      1
AMD Radeon RX7600S 8GB GDDR6                  1
NVIDIA GeForce RTX 3050Ti 4GB GDDR6           1
Name: count, Length: 103, dtype: int64
Total Count: 1312
----------------------------------------------------------------------------------------------------
AFTER
Column: card
Unique Values and Counts:
card
Intel Iris Xe Graphics                272
Intel Arc Graphics                    163
Intel UHD Graphics                    148
Intel Graphics                        142
NVIDIA GeForce RTX 4060                76
   

In [None]:
print(combined_df['card'].unique())

['Intel Arc Graphics' 'AMD Radeon Graphics' 'Intel Graphics'
 'AMD Radeon 890M Graphics' 'Intel Iris Xe Graphics'
 'NVIDIA GeForce RTX 3050' 'NVIDIA GeForce RTX 4050' 'Intel UHD Graphics'
 'Intel Arc A-Series 140V' 'Intel Arc A-Series 130V' 'NVIDIA RTX 500 Ada'
 'AMD Radeon 760M Graphics' 'AMD 780M Graphics' 'AMD Radeon 880M Graphics'
 'NVIDIA RTX A500' 'Qualcomm Adreno GPU' 'Intel Arc 140V Graphics'
 'NVIDIA RTX A1000' 'NVIDIA RTX 1000 Ada Generation 6GB GDDR6'
 'AMD Radeon graphics' 'NVIDIA GeForce MX570' 'NVIDIA GeForce RTX 4060'
 'NVIDIA GeForce RTX 4070' 'Intel HD Graphics' 'AMD Radeon 780M Graphics'
 'AMD Radeon 780M' 'AMD Radeon 660M Graphics' 'AMD Radeon 610M Graphics'
 'NVIDIA GeForce MX570A' 'NVIDIA GeForce RTX 2050' 'NVIDIA GeForce MX550'
 'NVIDIA GeForce RTX 3050 4GB DDR6' 'Intel Iris Xe graphics' '7 nhân GPU'
 '10 nhân GPU' '8 nhân GPU' 'NVIDIA GeForce RTX 3050, 6 GB'
 'NVIDIA GeForce RTX 2050, 4 GB' 'NVIDIA GeForce RTX 4050, 6 GB'
 'NVIDIA GeForce RTX 4060, 8 GB' 'NVIDIA 

In [None]:
#Process cpu
print_unique_values_with_counts(combined_df, column='cpu', prefix='BEFORE')
combined_df = process_cpu(combined_df)
print_unique_values_with_counts(combined_df, column='cpu', prefix='AFTER')
print(combined_df['cpu'].unique())


BEFORE
Column: cpu
Unique Values and Counts:
cpu
Intel Core Ultra 7 155H     115
Intel Core i71355U           86
Intel Core i51335U           67
Intel Core Ultra 5 125H      51
Intel Core i713620H          35
                           ... 
Intel Core i71365U            1
Intel Core i51230U            1
Intel Core  i713700H          1
Intel Core i51235U            1
AMD Ryzen Z1 Extreme          1
Name: count, Length: 188, dtype: int64
Total Count: 1289
----------------------------------------------------------------------------------------------------
AFTER
Column: cpu
Unique Values and Counts:
cpu
Intel Core Ultra 7 155H            115
Intel Core i71355U                  86
Intel Core i51335U                  68
Intel Core Ultra 5 125H             51
Intel Core i713620H                 35
                                  ... 
Intel Core i5 Tiger Lake 1155G7      1
Intel Core i7 Alder Lake 1260P       1
Intel Core i3 Tiger Lake 1115G4      1
Intel Core i7 Alder Lake 12650H      1
AMD

In [None]:
# Process brand
print_unique_values_with_counts(combined_df, column='brand', prefix='BEFORE')
combined_df = process_brand(combined_df)
print_unique_values_with_counts(combined_df, column='brand', prefix='AFTER')

BEFORE
Column: brand
Unique Values and Counts:
brand
Laptop Lenovo      348
Laptop HP          187
Laptop Dell        167
Laptop Asus        146
Laptop Acer        139
Laptop MSI          77
HP                  65
Laptop ASUS         41
Laptop MacBook      22
Lenovo              21
Dell                19
Laptop LG           13
Acer                12
Laptop Gigabyte      9
MSI                  5
Gigabyte             5
LG                   3
Laptop Samsung       2
Máy                  2
Laptop Creator       2
Laptop               1
Laptop Stealth       1
Laptop Gaming        1
Laptop Aspire        1
Name: count, dtype: int64
Total Count: 1289
----------------------------------------------------------------------------------------------------
AFTER
Column: brand
Unique Values and Counts:
brand
Lenovo     369
HP         252
Asus       187
Dell       186
Acer       151
MSI         96
Macbook     22
LG          16
Name: count, dtype: int64
Total Count: 1279
----------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column] = data[column].replace(brand_mapping)


In [None]:
combined_df.to_csv("../Data/data_2.csv", index=False)