In [19]:
import pandas as pd

# 1.LOAD DATA

In [20]:
files = {
    "bao_chau": "../data/raw/bao_chau_data.csv",
    "duy_thai": "../data/raw/duy_thai_data.csv",
    "minh_huy": "../data/raw/minh_huy_data.csv",
    "quoc_trung": "../data/raw/quoc_trung_data.csv"
}

def read_csv(path):
    return pd.read_csv(path, sep = None, engine='python')

dataframes = {name: read_csv(path) for name, path in files.items()}

for name, df in dataframes.items():
    print(name)
    print(df.info())

bao_chau
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1799 entries, 0 to 1798
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Ôªøname        1799 non-null   object
 1   description  1796 non-null   object
 2   brand        1799 non-null   object
 3   category     1799 non-null   object
dtypes: object(4)
memory usage: 56.3+ KB
None
duy_thai
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1746 entries, 0 to 1745
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Ôªøproduct_name  1746 non-null   object
 1   describe       1746 non-null   object
 2   brand          1746 non-null   object
 3   category       1746 non-null   object
dtypes: object(4)
memory usage: 54.7+ KB
None
minh_huy
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1947 entries, 0 to 1946
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------ 

In [21]:
def normalize_product_df(df):
    # 1. Delete BOM characters
    df.columns = (
        df.columns
        .str.replace("\ufeff", "", regex=False)  
        .str.replace("Ôªø", "", regex=False)        
        .str.strip()
        .str.lower()
    )
    # 2. Rename columns to standard names
    col_map = {}
    for col in df.columns:
        if col in ["name", "product_name", "product name"]:
            col_map[col] = "product_name"
        elif col in ["describe", "description"]:
            col_map[col] = "description"
    df = df.rename(columns=col_map)
    # 3. Drop unnecessary columns
    if "id" in df.columns:
        df = df.drop(columns=["id"])
    # 4. Ensure all required columns are present
    for col in ["product_name", "description", "brand", "category"]:
        if col not in df.columns:
            df[col] = None

    return df[["product_name", "description", "brand", "category"]]

In [22]:
EXPECTED_COLS = ["product_name", "description", "brand", "category"]

dfs_clean = {}

for name, df in dataframes.items():
    clean_df = normalize_product_df(df)
    dfs_clean[name] = clean_df

    print(f"\n=== {name} (after normalize) ===")
    clean_df.info()


=== bao_chau (after normalize) ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1799 entries, 0 to 1798
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  1799 non-null   object
 1   description   1796 non-null   object
 2   brand         1799 non-null   object
 3   category      1799 non-null   object
dtypes: object(4)
memory usage: 56.3+ KB

=== duy_thai (after normalize) ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1746 entries, 0 to 1745
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  1746 non-null   object
 1   description   1746 non-null   object
 2   brand         1746 non-null   object
 3   category      1746 non-null   object
dtypes: object(4)
memory usage: 54.7+ KB

=== minh_huy (after normalize) ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1947 entries, 0 to 1946
Data columns (total 

In [23]:
df = pd.concat(dfs_clean.values(), ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7289 entries, 0 to 7288
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  7289 non-null   object
 1   description   7286 non-null   object
 2   brand         7285 non-null   object
 3   category      7289 non-null   object
dtypes: object(4)
memory usage: 227.9+ KB


# 2. X·ª¨ L√ù MISSING VALUES

In [24]:
df.isnull().sum()

product_name    0
description     3
brand           4
category        0
dtype: int64

In [25]:
num_null_brand = df["brand"].isna().sum()
ratio = num_null_brand / len(df) * 100
print(f"T·ª∑ l·ªá thi·∫øu brand: {ratio:.2f}%")

T·ª∑ l·ªá thi·∫øu brand: 0.05%


H∆∞·ªõng x·ª≠ l√Ω:
- `description`: Thay th·∫ø gi√° tr·ªã thi·∫øu b·∫±ng n·ªôi dung t·ª´ c·ªôt `product_name`. Do s·ªë l∆∞·ª£ng thi·∫øu √≠t v√† t√™n s·∫£n ph·∫©m th∆∞·ªùng ch·ª©a c√°c t·ª´ kh√≥a quan tr·ªçng nh·∫•t
- `brand`: ƒêi·ªÅn gi√° tr·ªã `"Unknown"` ƒë·ªÉ t·∫°o th√†nh m·ªôt nh√≥m ri√™ng bi·ªát

In [26]:
df['description'] = df['description'].fillna(df['product_name'])
df['brand'] = df['brand'].fillna('Unknown')

In [27]:
#KI·ªÇM TRA L·∫†I
print("S·ªë l∆∞·ª£ng missing values sau khi x·ª≠ l√Ω:")
print(df.isnull().sum())

print(f"\nK√≠ch th∆∞·ªõc d·ªØ li·ªáu hi·ªán t·∫°i: {df.shape}")

S·ªë l∆∞·ª£ng missing values sau khi x·ª≠ l√Ω:
product_name    0
description     0
brand           0
category        0
dtype: int64

K√≠ch th∆∞·ªõc d·ªØ li·ªáu hi·ªán t·∫°i: (7289, 4)


# 3. X·ª¨ L√ù DUPLICATES

In [28]:
dup_mask = df.duplicated(subset=["product_name", "brand", "category"], keep=False)

dup_by_category = (
    df[dup_mask]
    .groupby("category")
    .size()
    .reset_index(name="num_duplicates")
    .sort_values("num_duplicates", ascending=False)
)

print(f"T·ªïng s·ªë d√≤ng tr√πng: {dup_mask.sum()}")
print(f"T·ªâ l·ªá d√≤ng tr√πng: {dup_mask.sum() / len(df) * 100:.2f}%")
dup_by_category

T·ªïng s·ªë d√≤ng tr√πng: 22
T·ªâ l·ªá d√≤ng tr√πng: 0.30%


Unnamed: 0,category,num_duplicates
1,Nh√† c·ª≠a & ƒê·ªùi s·ªëng,16
2,Th·ªÉ thao & Du l·ªãch,4
0,M·ªπ ph·∫©m & L√†m ƒë·∫πp,2


H∆∞·ªõng gi·∫£i quy·∫øt:
- Do t·ª∑ l·ªá tr√πng l·∫∑p nh·ªè, n√™n l·ª±a ch·ªçn lo·∫°i b·ªè c√°c d√≤ng tr√πng l·∫∑p ƒë·ªÉ d·ªØ li·ªáu s·∫°ch h∆°n

In [29]:
df_cleaned = df.drop_duplicates(
    subset=['product_name', 'brand', 'category'],
    keep='first'
).copy()

print(f"T·ªïng s·ªë d√≤ng sau khi lo·∫°i duplicates: {len(df_cleaned)}")
print(f"ƒê√£ lo·∫°i b·ªè: {len(df) - len(df_cleaned)} d√≤ng")

T·ªïng s·ªë d√≤ng sau khi lo·∫°i duplicates: 7278
ƒê√£ lo·∫°i b·ªè: 11 d√≤ng


# 4. CHU·∫®N H√ìA TEXT TI·∫æNG VI·ªÜT

In [30]:
import re
from bs4 import BeautifulSoup

In [31]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    # 1. X·ª≠ l√Ω HTML tags
    try:
        soup = BeautifulSoup(text, "html.parser")
        for data in soup(['style', 'script', 'head', 'title', 'meta', '[document]']):
            data.decompose()
        text = soup.get_text(" ")
    except:
        pass    
    # 2. Chuy·ªÉn th√†nh ch·ªØ th∆∞·ªùng
    text = text.lower()
    # 3. Lo·∫°i b·ªè URL v√† email
    text = re.sub(r"http\S+|www\S+|\S+@\S+", " ", text)
    # 4. Lo·∫°i b·ªè c√°c k√Ω t·ª± xu·ªëng d√≤ng, tab ƒë·∫∑c bi·ªát (\n, \t, \r, \v)
    text = re.sub(r"[\n\t\r\v]+", " ", text)
    # 5. Chu·∫©n h√≥a c√°c t·ª´ n·ªëi d√†i
    text = re.sub(r'(\w)\1{2,}', r'\1', text)
    # 6. Lo·∫°i b·ªè k√Ω t·ª± ƒë·∫∑c bi·ªát (gi·ªØ ti·∫øng Vi·ªát + s·ªë)
    text = re.sub(
        r"[^0-9a-z√†√°·∫°·∫£√£√¢·∫ß·∫•·∫≠·∫©·∫´ƒÉ·∫±·∫Ø·∫∑·∫≥·∫µ√®√©·∫π·∫ª·∫Ω√™·ªÅ·∫ø·ªá·ªÉ·ªÖ√¨√≠·ªã·ªâƒ©√≤√≥·ªç·ªè√µ√¥·ªì·ªë·ªô·ªï·ªó∆°·ªù·ªõ·ª£·ªü·ª°√π√∫·ª•·ªß≈©∆∞·ª´·ª©·ª±·ª≠·ªØ·ª≥√Ω·ªµ·ª∑·ªπƒë\s]",
        " ",
        text
    )
    # 7. Lo·∫°i b·ªè s·ªë ƒëi·ªán tho·∫°i, m√£ s·ªë...
    text = re.sub(r'\d{10,}', '', text)
    # 8. Lo·∫°i b·ªè kho·∫£ng tr·∫Øng th·ª´a
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [32]:
for col in ["product_name", "description"]:
    df_cleaned.loc[:, col] = df_cleaned[col].apply(clean_text)
df_cleaned.head()

Unnamed: 0,product_name,description,brand,category
0,ƒë·∫ßm ti·ªÉu th∆∞ ph·ªëi c·ªï sang ch·∫£nh thi·∫øt k·∫ø ƒëi l√†...,th√¥ng tin ng∆∞·ªùi m·∫∑c size s s ·∫£ n ph ·∫© m v√≤ng e...,OEM,Th·ªùi trang n·ªØ
1,ƒë·∫ßm maxi c·ªï vu√¥ng th·∫Øt n∆° ph·ªëi vi·ªÅn sang tr·ªçng,th√¥ng tin ng∆∞·ªùi m·∫∑c size s s ·∫£ n ph ·∫© m v√≤ng e...,OEM,Th·ªùi trang n·ªØ
2,ƒë·∫ßm linen d√°ng su√¥ng c·ªè v c√≥ t√∫i 2 b√™n s∆∞·ªùn k√®...,shop chuy√™n cung c·∫•p qu·∫ßn √¢u √°o s∆° mi √°o ki·ªÉu ...,ARCTIC HUNTER,Th·ªùi trang n·ªØ
3,ƒë·∫ßm su√¥ng n·ªØ thi·∫øt k·∫ø r√∫t eo ti·ªán l·ª£i c√≥ tui 2...,shop chuy√™n cung c·∫•p qu·∫ßn √¢u √°o s∆° mi √°o ki·ªÉu ...,ARCTIC HUNTER,Th·ªùi trang n·ªØ
4,ƒë·∫ßm s∆° mi su√¥ng d√°ng x√≤e thi·∫øt k·∫ø tay s·∫Øn ph·ªëi...,n·ªôi dung s·∫£n ph·∫©m thanh l·ªãch h∆°n v·ªõi s·ª£i v·∫£i t...,ARCTIC HUNTER,Th·ªùi trang n·ªØ


Nh·∫≠n x√©t:
- C·ªôt `Description` c√≤n nhi·ªÅu d√≤ng b·∫Øt ƒë·∫ßu b·∫±ng c√°c c·ª•m t·ª´ kh√¥ng c√≥ √Ω nghƒ©a, l·∫∑p l·∫°i nhi·ªÅu l·∫ßn: "th√¥ng tin s·∫£n ph·∫©m", "gi·ªõi thi·ªáu s·∫£n ph·∫©m", "chi ti·∫øt s·∫£n ph·∫©m"...
- L·ª±a ch·ªçn lo·∫°i b·ªè ƒë·ªÉ th√¥ng tin gi√° tr·ªã h∆°n

In [33]:
def remove_boilerplate(text):
    if not isinstance(text, str) or len(text) == 0:
        return ""
    patterns = [
        r"^m√¥ t·∫£ s·∫£n ph·∫©m",
        r"^th√¥ng tin s·∫£n ph·∫©m", 
        r"^chi ti·∫øt s·∫£n ph·∫©m",
        r"^th√¥ng s·ªë k·ªπ thu·∫≠t",
        r"^gi·ªõi thi·ªáu s·∫£n ph·∫©m"
    ]
    combined_pattern = "|".join(patterns)
    text = re.sub(combined_pattern, "", text).strip()
    return text

# √Åp d·ª•ng CH·ªà cho c·ªôt description
df_cleaned['description'] = df_cleaned['description'].apply(remove_boilerplate)

In [34]:
#KI·ªÇM TRA L·∫†I SAU X·ª¨ L√ù C√ì CHU·ªñI R·ªñNG KH√îNG
def check_empty_strings(df):
    for col in df.columns:
        if df[col].dtype == "object":
            num_empty = (df[col].str.strip() == "").sum()
            print(f"S·ªë chu·ªói r·ªóng trong c·ªôt '{col}': {num_empty}")
check_empty_strings(df_cleaned)

S·ªë chu·ªói r·ªóng trong c·ªôt 'product_name': 0
S·ªë chu·ªói r·ªóng trong c·ªôt 'description': 0
S·ªë chu·ªói r·ªóng trong c·ªôt 'brand': 0
S·ªë chu·ªói r·ªóng trong c·ªôt 'category': 0


# 5. L·ªåC C√ÅC S·∫¢N PH·∫®M KH√îNG ƒê·ª¶ ƒêI·ªÄU KI·ªÜN
Nh·∫≠n x√©t:    
- M·ªôt s·ªë s·∫£n ph·∫©m c√≥ t√™n qu√° ng·∫Øn ho·∫∑c m√¥ t·∫£ qu√° √≠t t·ª´, hay t√™n ch·ªâ l√† s·ªë, kh√¥ng ƒë·ªß th√¥ng tin.  
- H∆∞·ªõng x·ª≠ l√Ω: lo·∫°i b·ªè c√°c s·∫£n ph·∫©m kh√¥ng ƒë·∫°t ti√™u chu·∫©n t·ªëi thi·ªÉu (t√™n ‚â•2 t·ª´, m√¥ t·∫£ ‚â•5 t·ª´, t√™n kh√¥ng ph·∫£i s·ªë) ƒë·ªÉ ƒë·∫£m b·∫£o d·ªØ li·ªáu s·∫°ch v√† ch·∫•t l∆∞·ª£ng h∆°n.


In [35]:
df_cleaned['name_word_count'] = df_cleaned['product_name'].str.split().str.len()
df_cleaned['desc_word_count'] = df_cleaned['description'].str.split().str.len()

df_cleaned = df_cleaned[
    (df_cleaned['name_word_count'] >= 2) &  # T√™n √≠t nh·∫•t 2 t·ª´
    (df_cleaned['desc_word_count'] >= 5)  &   # M√¥ t·∫£ √≠t nh·∫•t 5 t·ª´
    (df_cleaned['product_name'].str.isnumeric() == False)  # T√™n kh√¥ng ph·∫£i l√† s·ªë
].copy()
print(f"S·ªë d√≤ng sau khi lo·∫°i b·ªè s·∫£n ph·∫©m kh√¥ng ƒë·ªß ti√™u chu·∫©n: {len(df_cleaned)}")
print(f"S·ªë d√≤ng ƒë√£ lo·∫°i b·ªè: {len(df) - len(df_cleaned)}")

S·ªë d√≤ng sau khi lo·∫°i b·ªè s·∫£n ph·∫©m kh√¥ng ƒë·ªß ti√™u chu·∫©n: 7274
S·ªë d√≤ng ƒë√£ lo·∫°i b·ªè: 15


# ƒê·ªïi t√™n c·ªôt 'name' th√†nh 'product_name' ƒë·ªÉ kh·ªõp v·ªõi model_training.ipynb
df_cleaned = df_cleaned.rename(columns={'name': 'product_name'})

# Ch·ªçn c√°c c·ªôt c·∫ßn l∆∞u
columns_to_save = ['product_name', 'description', 'brand', 'category', 'text', 'source']
columns_to_save = [col for col in columns_to_save if col in df_cleaned.columns]

# L∆∞u v√†o folder processed
output_path = '../data/processed/data_cleaned.csv'
df_cleaned[columns_to_save].to_csv(output_path, index=False, encoding='utf-8')

print(f"\n‚úÖ ƒê√£ l∆∞u d·ªØ li·ªáu v√†o: {output_path}")
print(f"üìä S·ªë rows: {len(df_cleaned)}")
print(f"üìä S·ªë columns: {len(columns_to_save)}")

In [36]:
columns_to_save = ['product_name', 'description', 'brand', 'category']
df_cleaned[columns_to_save].to_csv('data_cleaned.csv', index=False, encoding='utf-8')