Ki·ªÉm tra t·ªïng th·ªÉ csv (Minh truc meeyland)

In [66]:
import pandas as pd
import numpy as np
import re
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [67]:
def inspect_csv(df):
    print("===== CSV OVERVIEW =====")
    print(f"Rows    : {df.shape[0]}")
    print(f"Columns : {df.shape[1]}")
    print("\n--- Data types ---")
    print(df.dtypes)
    print("\n--- Memory usage ---")
    print(round(df.memory_usage(deep=True).sum() / 1024**2, 2), "MB")


In [68]:
df_raw = pd.read_csv("../data/raw/meeyland_hcm_total.csv")
inspect_csv(df_raw)


===== CSV OVERVIEW =====
Rows    : 7521
Columns : 22

--- Data types ---
id                    int64
Page                  int64
Title                object
Price_Raw            object
Price_Billion       float64
Price_per_m2         object
Area_m2             float64
District             object
Address              object
Bedrooms            float64
Toilets             float64
Post_Time            object
Link                 object
Description          object
M·∫∑t ti·ªÅn             object
Chi·ªÅu s√¢u            object
M·ª©c ƒë·ªô giao d·ªãch     object
∆Øu ƒëi·ªÉm BƒêS          object
H∆∞·ªõng ban c√¥ng       object
Ti·ªán √≠ch             object
N·ªôi th·∫•t             object
S·ªë ban c√¥ng         float64
dtype: object

--- Memory usage ---
18.07 MB


H√†m ki·ªÉm tra & TH√îNG B√ÅO s·ªë √¥ thi·∫øu d·ªØ li·ªáu theo t·ª´ng c·ªôt

In [69]:
def report_missing_values(df):
    print("===== MISSING VALUE REPORT =====")
    missing = df.isna().sum()
    percent = (missing / len(df)) * 100
    report = pd.DataFrame({
        "Missing_Count": missing,
        "Percent_Missing (%)": percent.round(2)
    }).sort_values(by="Missing_Count", ascending=False)
    print(report)
    return report


In [70]:
report_missing_values(df_raw)

===== MISSING VALUE REPORT =====
                  Missing_Count  Percent_Missing (%)
H∆∞·ªõng ban c√¥ng             7200                95.73
S·ªë ban c√¥ng                7166                95.28
N·ªôi th·∫•t                   7155                95.13
Ti·ªán √≠ch                   6828                90.79
District                   6055                80.51
Chi·ªÅu s√¢u                  4563                60.67
Toilets                    4166                55.39
Bedrooms                   3686                49.01
M·ª©c ƒë·ªô giao d·ªãch           3524                46.86
∆Øu ƒëi·ªÉm BƒêS                3514                46.72
M·∫∑t ti·ªÅn                   2998                39.86
Price_Billion               808                10.74
Price_Raw                   808                10.74
Price_per_m2                774                10.29
Post_Time                   559                 7.43
Description                 559                 7.43
Area_m2                     559  

Unnamed: 0,Missing_Count,Percent_Missing (%)
H∆∞·ªõng ban c√¥ng,7200,95.73
S·ªë ban c√¥ng,7166,95.28
N·ªôi th·∫•t,7155,95.13
Ti·ªán √≠ch,6828,90.79
District,6055,80.51
Chi·ªÅu s√¢u,4563,60.67
Toilets,4166,55.39
Bedrooms,3686,49.01
M·ª©c ƒë·ªô giao d·ªãch,3524,46.86
∆Øu ƒëi·ªÉm BƒêS,3514,46.72


In [71]:
mask = df_raw['Post_Time'].isna() & df_raw['Description'].isna() & df_raw['Area_m2'].isna()

mask.sum(), len(df_raw)


(np.int64(559), 7521)

H√†m chuy·ªÉn ƒë·ªïi Price

In [72]:
def parse_price_raw(price_raw):
    if pd.isna(price_raw):
        return np.nan
    text = str(price_raw).lower().strip()

    match = re.search(r'([\d\.,]+)\s*(t·ª∑|ty)', text)
    if match:
        return float(match.group(1).replace(',', '.'))

    match = re.search(r'([\d\.,]+)\s*(tri·ªáu|trieu)', text)
    if match:
        return float(match.group(1).replace(',', '.')) / 1000

    return np.nan


In [73]:
def clean_price(df):
    print("===== PRICE CLEANING =====")

    mask = df['Price_Billion'].isna() & df['Price_Raw'].notna()
    df.loc[mask, 'Price_Billion'] = df.loc[mask, 'Price_Raw'].apply(parse_price_raw)

    mask = df['Price_Raw'].isna() & df['Price_Billion'].notna()
    df.loc[mask, 'Price_Raw'] = df.loc[mask, 'Price_Billion'] \
        .apply(lambda x: f"{x:.2f}".replace('.', ',') + " t·ª∑")

    before = len(df)
    df = df.dropna(subset=['Price_Raw', 'Price_Billion'], how='all')
    print(f"Dropped rows (missing both prices): {before - len(df)}")

    return df


H√†m extract di·ªán t√≠ch t·ª´ text

In [74]:
def extract_area_from_text(text):
    if pd.isna(text):
        return np.nan
    text = str(text).lower()
    match = re.search(r'(\d{2,4})\s*(m2|m¬≤)', text)
    if match:
        return float(match.group(1))
    return np.nan


In [75]:
def clean_area(df):
    print("===== AREA CLEANING =====")

    mask = df['Area_m2'].isna() & df['Title'].notna()
    df.loc[mask, 'Area_m2'] = df.loc[mask, 'Title'].apply(extract_area_from_text)

    mask = df['Area_m2'].isna() & df['Description'].notna()
    df.loc[mask, 'Area_m2'] = df.loc[mask, 'Description'].apply(extract_area_from_text)

    before = len(df)
    df = df.dropna(subset=['Area_m2'])
    print(f"Dropped rows (missing Area): {before - len(df)}")

    df = df[(df['Area_m2'] > 10) & (df['Area_m2'] < 1000)]
    return df


Ki·ªÉm tra t√¨nh tr·∫°ng thi·∫øu Bedrooms & Toilets

H√†m extract Bedrooms t·ª´ text

In [76]:
def extract_bedrooms_from_text(text):
    if pd.isna(text):
        return np.nan
    text = str(text).lower()

    patterns = [
        r'(\d+)\s*(ph√≤ng ng·ªß|phong ngu)',
        r'(\d+)\s*pn\b',
        r'pn\s*[:\-]?\s*(\d+)',
        r'(\d+)\s*(bedroom|br)\b'
    ]

    for p in patterns:
        m = re.search(p, text)
        if m:
            return float(m.group(1))
    return np.nan


In [77]:
def clean_bedrooms(df):
    print("===== BEDROOM CLEANING =====")
    df['Bedrooms'] = pd.to_numeric(df['Bedrooms'], errors='coerce')

    mask = df['Bedrooms'].isna() & df['Title'].notna()
    df.loc[mask, 'Bedrooms'] = df.loc[mask, 'Title'].apply(extract_bedrooms_from_text)

    mask = df['Bedrooms'].isna() & df['Description'].notna()
    df.loc[mask, 'Bedrooms'] = df.loc[mask, 'Description'].apply(extract_bedrooms_from_text)

    return df


H√†m extract Toilets t·ª´ text

In [78]:
def extract_toilets_from_text(text):
    if pd.isna(text):
        return np.nan
    text = str(text).lower()

    patterns = [
        r'(\d+)\s*(wc|toilet|toilets)',
        r'(\d+)\s*(nh√† v·ªá sinh|nha ve sinh)',
        r'(\d+)\s*(ph√≤ng v·ªá sinh|phong ve sinh)',
        r'wc\s*[:\-]?\s*(\d+)',
        r'vs\s*[:\-]?\s*(\d+)'
    ]

    for p in patterns:
        m = re.search(p, text)
        if m:
            return float(m.group(1))
    return np.nan


In [79]:
def clean_toilets(df):
    print("===== TOILETS CLEANING =====")
    df['Toilets'] = pd.to_numeric(df['Toilets'], errors='coerce')

    mask = df['Toilets'].isna() & df['Title'].notna()
    df.loc[mask, 'Toilets'] = df.loc[mask, 'Title'].apply(extract_toilets_from_text)

    mask = df['Toilets'].isna() & df['Description'].notna()
    df.loc[mask, 'Toilets'] = df.loc[mask, 'Description'].apply(extract_toilets_from_text)

    return df


In [80]:
def check_bedroom_toilet_missing(df):
    b = df['Bedrooms'].isna()
    t = df['Toilets'].isna()

    print("===== BEDROOM / TOILET STATUS =====")
    print("Missing BOTH     :", (b & t).sum())
    print("Missing ONLY ONE :", (b ^ t).sum())
    print("Complete BOTH    :", (~b & ~t).sum())


KNN bedroom v√† toilet

In [81]:
def parse_price_per_m2(x):
    if pd.isna(x):
        return np.nan

    x = str(x).lower()

    num = re.findall(r'[\d,.]+', x)
    if not num:
        return np.nan

    value = num[0].replace(',', '.')
    try:
        value = float(value)
    except:
        return np.nan

    # Quy ƒë·ªïi v·ªÅ tri·ªáu / m2
    if 't·ª∑' in x:
        return value * 1000
    elif 'tri·ªáu' in x:
        return value
    else:
        return np.nan

In [82]:
knn_features = [
    'Area_m2',
    'Price_Billion',
    'Bedrooms',
    'Toilets'
]


knn_df = df[knn_features].copy()
knn_df = knn_df.apply(pd.to_numeric, errors='coerce')

In [83]:
knn_df = df[knn_features].copy()
knn_df = knn_df.apply(pd.to_numeric, errors='coerce')


In [84]:
knn_df = knn_df.dropna(
    how='all',
    subset=['Area_m2', 'Price_Billion']
)


In [85]:
scaler = StandardScaler()
knn_scaled = scaler.fit_transform(knn_df)


In [86]:
imputer = KNNImputer(
    n_neighbors=5,
    weights='distance'
)

knn_imputed = imputer.fit_transform(knn_scaled)
knn_imputed = scaler.inverse_transform(knn_imputed)


In [87]:
knn_imputed_df = pd.DataFrame(
    knn_imputed,
    columns=knn_features,
    index=knn_df.index
)


In [88]:
df.loc[df['Bedrooms'].isna(), 'Bedrooms'] = (
    knn_imputed_df.loc[df['Bedrooms'].isna(), 'Bedrooms']
    .round()
    .clip(lower=0, upper=10)
)

df.loc[df['Toilets'].isna(), 'Toilets'] = (
    knn_imputed_df.loc[df['Toilets'].isna(), 'Toilets']
    .round()
    .clip(lower=0, upper=10)
)

In [89]:
df[['Bedrooms', 'Toilets']].isna().sum()


Bedrooms    0
Toilets     0
dtype: int64

Ki·ªÉm tra datatype

Drop row thi·∫øu Bedrooms HO·∫∂C Toilets

In [90]:
def drop_missing_bed_toilet(df):
    print("===== DROP MISSING BEDROOM / TOILET =====")
    before = len(df)
    df = df[df['Bedrooms'].notna() & df['Toilets'].notna()].copy()
    print(f"Total BEFORE : {before}")
    print(f"Total AFTER  : {len(df)}")
    print(f"Dropped      : {before - len(df)}")
    return df


In [91]:


df = clean_price(df)
df = clean_area(df)
df = clean_bedrooms(df)
df = clean_toilets(df)

report_missing_values(df)
check_bedroom_toilet_missing(df)

df_ready = drop_missing_bed_toilet(df)


===== PRICE CLEANING =====
Dropped rows (missing both prices): 0
===== AREA CLEANING =====
Dropped rows (missing Area): 0
===== BEDROOM CLEANING =====
===== TOILETS CLEANING =====
===== MISSING VALUE REPORT =====
                  Missing_Count  Percent_Missing (%)
H∆∞·ªõng ban c√¥ng             6253                95.16
S·ªë ban c√¥ng                6219                94.64
N·ªôi th·∫•t                   6207                94.46
Ti·ªán √≠ch                   5889                89.62
District                   5255                79.97
Chi·ªÅu s√¢u                  3728                56.73
∆Øu ƒëi·ªÉm BƒêS                2764                42.06
M·ª©c ƒë·ªô giao d·ªãch           2708                41.21
M·∫∑t ti·ªÅn                   2285                34.77
Price_per_m2                117                 1.78
id                            0                 0.00
Price_Raw                     0                 0.00
Title                         0                 0.00
Page        

  df.loc[mask, 'Bedrooms'] = df.loc[mask, 'Title'].apply(extract_bedrooms_from_text)
  df.loc[mask, 'Toilets'] = df.loc[mask, 'Title'].apply(extract_toilets_from_text)


DATASET CU·ªêI

In [92]:
print("FINAL ROWS:", len(df_ready))
df_ready.head()


FINAL ROWS: 6571


Unnamed: 0,id,Page,Title,Price_Raw,Price_Billion,Price_per_m2,Area_m2,District,Address,Bedrooms,...,Link,Description,M·∫∑t ti·ªÅn,Chi·ªÅu s√¢u,M·ª©c ƒë·ªô giao d·ªãch,∆Øu ƒëi·ªÉm BƒêS,H∆∞·ªõng ban c√¥ng,Ti·ªán √≠ch,N·ªôi th·∫•t,S·ªë ban c√¥ng
0,306038613,1,"Si√™u ph·∫©m nh√† m·∫∑t ti·ªÅn ƒë∆∞·ªùng S·ªë 36 T√™n L·ª≠a, s·ªü...","7,8 t·ª∑",7.8,162.5,48.0,,"Q. B√¨nh T√¢n, Tp. H·ªì Ch√≠ Minh",4.0,...,https://meeyland.com/ban-nha-mat-pho-binh-tan-...,"Si√™u ph·∫©m nh√† m·∫∑t ti·ªÅn ƒë∆∞·ªùng S·ªë 36 T√™n L·ª≠a, s·ªü...",4m,12m,C√†ng s·ªõm c√†ng t·ªët,Kinh doanh ƒë∆∞·ª£c,,,,
1,306041109,1,T·ªça l·∫°c t·∫°i v·ªã tr√≠ ƒë·∫Øc ƒë·ªãa tr√™n ƒë∆∞·ªùng C√°ch M·∫°n...,5 t·ª∑,5.0,148.81,33.6,,"Q. 3, Tp. H·ªì Ch√≠ Minh",3.0,...,https://meeyland.com/ban-nha-rieng-quan-3-ho-c...,T·ªça l·∫°c t·∫°i v·ªã tr√≠ ƒë·∫Øc ƒë·ªãa tr√™n ƒë∆∞·ªùng C√°ch M·∫°n...,"4,2m",8m,C√†ng s·ªõm c√†ng t·ªët,Giao th√¥ng thu·∫≠n l·ª£i,,,,
2,306041245,1,Nh√† 2 t·∫ßng t·ªça l·∫°c t·∫°i v·ªã tr√≠ ƒë·∫Øc ƒë·ªãa ngay c·∫°n...,"3,88 t·ª∑",3.88,71.85,54.0,,"ƒê. ƒê·∫•t M·ªõi, Q. B√¨nh T√¢n, Tp. H·ªì Ch√≠ Minh",2.0,...,https://meeyland.com/ban-nha-rieng-binh-tan-ho...,T·ªça l·∫°c t·∫°i v·ªã tr√≠ ƒë·∫Øc ƒë·ªãa ngay c·∫°nh Ch·ª£ L√™ VƒÉ...,4m,"13,5m",C√†ng s·ªõm c√†ng t·ªët,"Kinh doanh ƒë∆∞·ª£c, Giao th√¥ng thu·∫≠n l·ª£i",,,,
3,104652329,1,"BaÃÅn cƒÉn h·ªô d·ªãch v·ª•, haÃ£ 1 tyÃâ g√¢ÃÄn Phan VƒÉn T...",21 t·ª∑,21.0,175.0,120.0,,"ƒê. Phan VƒÉn Tr·ªã, P. 12, Q. B√¨nh Th·∫°nh, Tp. H·ªì ...",22.0,...,https://meeyland.com/ban-nha-mat-pho-binh-than...,"BaÃÅn cƒÉn h·ªô d·ªãch v·ª•, haÃ£ 1 tyÃâ g√¢ÃÄn Phan VƒÉn T...","4,7m","25,53m",C√†ng s·ªõm c√†ng t·ªët,"ƒêang c√≥ l·ª£i nhu·∫≠n cao, D√¢n tr√≠ cao, Giao th√¥ng...",,,,
4,102855901,1,"B√°n nh√† h·∫ªm ba g√°c, ƒë∆∞·ªùng 14, B√¨nh Tr∆∞ng T√¢y, ...","5,6 t·ª∑",5.6,64.37,87.0,,Thu g·ªçn,2.0,...,https://meeyland.com/ban-nha-rieng-thu-duc-ho-...,"B√°n nh√† h·∫ªm ba g√°c, ƒë∆∞·ªùng 14, B√¨nh Tr∆∞ng T√¢y, ...",4m,"21,75m",C√†ng s·ªõm c√†ng t·ªët,"Cho thu√™ gi√° cao, Quy ho·∫°ch ·ªïn ƒë·ªãnh, Giao th√¥n...",T√¢y,,,


Cleaning Address

In [93]:
def extract_district_final(address):
    if pd.isna(address):
        return np.nan

    # 1. Ti·ªÅn x·ª≠ l√Ω: X√≥a "xem th√™m", ƒë∆∞a v·ªÅ ch·ªØ th∆∞·ªùng, n√©n nhi·ªÅu kho·∫£ng tr·∫Øng th√†nh 1
    text = str(address).lower()
    text = text.replace("xem th√™m", "")
    text = re.sub(r'\s+', ' ', text).strip()

    # 2. X·ª≠ l√Ω ƒë·∫∑c bi·ªát cho Th·ªß ƒê·ª©c
    if 'th·ªß ƒë·ª©c' in text:
        return 'Th√†nh ph·ªë Th·ªß ƒê·ª©c'

    # 3. D√πng Regex ƒë·ªÉ b·∫Øt Qu·∫≠n/Q + S·ªë (Ch·∫•p nh·∫≠n c√≥ ho·∫∑c kh√¥ng c√≥ kho·∫£ng tr·∫Øng ·ªü gi·ªØa)
    # \s* ƒë·∫°i di·ªán cho vi·ªác c√≥ 0 ho·∫∑c nhi·ªÅu kho·∫£ng tr·∫Øng
    # ([0-9]{1,2}) b·∫Øt 1 ho·∫∑c 2 ch·ªØ s·ªë
    match_num = re.search(r'(?:qu·∫≠n|q)\s*([0-9]{1,2})', text)
    if match_num:
        num = int(match_num.group(1)) # √âp ki·ªÉu int ƒë·ªÉ "01" th√†nh "1"
        return f"Qu·∫≠n {num}"

    # 4. N·∫øu kh√¥ng ph·∫£i qu·∫≠n s·ªë, ki·ªÉm tra danh s√°ch Qu·∫≠n t√™n ch·ªØ (Whitelist)
    # C√°ch n√†y an to√†n nh·∫•t ƒë·ªÉ tr√°nh "Huy·ªán H√™m", "Huy·ªán Th·∫°nh"
    districts_vietnamese = {
        'b√¨nh th·∫°nh': 'Qu·∫≠n B√¨nh Th·∫°nh',
        'g√≤ v·∫•p': 'Qu·∫≠n G√≤ V·∫•p',
        't√¢n b√¨nh': 'Qu·∫≠n T√¢n B√¨nh',
        't√¢n ph√∫': 'Qu·∫≠n T√¢n Ph√∫',
        'b√¨nh t√¢n': 'Qu·∫≠n B√¨nh T√¢n',
        'ph√∫ nhu·∫≠n': 'Qu·∫≠n Ph√∫ Nhu·∫≠n',
        'b√¨nh ch√°nh': 'Huy·ªán B√¨nh Ch√°nh',
        'h√≥c m√¥n': 'Huy·ªán H√≥c M√¥n',
        'c·ªß chi': 'Huy·ªán C·ªß Chi',
        'nh√† b√®': 'Huy·ªán Nh√† B√®',
        'c·∫ßn gi·ªù': 'Huy·ªán C·∫ßn Gi·ªù'
    }

    for key, value in districts_vietnamese.items():
        if key in text:
            return value

    return np.nan

In [94]:
# C·∫≠p nh·∫≠t l·∫°i nh·ªØng d√≤ng ƒëang b·ªã sai ho·∫∑c thi·∫øu
df_ready['District'] = df_ready['Address'].apply(extract_district_final)

# Ki·ªÉm tra l·∫°i 10 m·∫´u ng·∫´u nhi√™n
print(df_ready[['District', 'Address']].sample(10))

# Xem danh s√°ch c√°c Qu·∫≠n ƒë√£ ƒë∆∞·ª£c gom nh√≥m
print(df_ready['District'].unique())

               District                                            Address
4077  Th√†nh ph·ªë Th·ªß ƒê·ª©c  ƒê. 385, P. TƒÉng Nh∆°n Ph√∫ A, Tp. Th·ªß ƒê·ª©c, Tp. H...
79         Huy·ªán C·ªß Chi                         H. C·ªß Chi, Tp. H·ªì Ch√≠ Minh
1829                NaN                                            Thu g·ªçn
4634  Th√†nh ph·ªë Th·ªß ƒê·ª©c  ƒê. L√Ω T·∫ø Xuy√™n, P. Linh ƒê√¥ng, Tp. Th·ªß ƒê·ª©c, Tp....
693         Qu·∫≠n G√≤ V·∫•p         ƒê. S·ªë 1, P. 16, Q. G√≤ V·∫•p, Tp. H·ªì Ch√≠ Minh
383       Qu·∫≠n B√¨nh T√¢n  ƒê. 6, P. B√¨nh H∆∞ng Ho√† B, Q. B√¨nh T√¢n, Tp. H·ªì ...
597                 NaN  ƒê. L√™ VƒÉn L∆∞∆°ng, P. T√¢n H∆∞ng, Q. 7, Tp. H·ªì Ch√≠...
968                 NaN                             Q. 12, Tp. H·ªì Ch√≠ Minh
7391      Qu·∫≠n T√¢n B√¨nh                P. 15, Q. T√¢n B√¨nh, Tp. H·ªì Ch√≠ Minh
5548        Qu·∫≠n G√≤ V·∫•p  ƒê. Ph·∫°m VƒÉn Chi√™u, P. 8, Q. G√≤ V·∫•p, Tp. H·ªì Ch√≠...
['Qu·∫≠n B√¨nh T√¢n' nan 'Qu·∫≠n B√¨nh Th·∫°nh' 'Qu·∫≠n T√¢n Ph

In [95]:

def extract_district_from_address(address):
    if pd.isna(address):
        return np.nan

    # 1. TI·ªÄN X·ª¨ L√ù: ƒê∆∞a v·ªÅ ch·ªØ th∆∞·ªùng, x√≥a "xem th√™m", n√©n kho·∫£ng tr·∫Øng
    text = str(address).lower()
    text = text.replace("xem th√™m", "")
    # X√≥a c√°c d·∫•u ch·∫•m sau c√°c t·ª´ vi·∫øt t·∫Øt ƒë·ªÉ d·ªÖ x·ª≠ l√Ω (q. -> q, h. -> h)
    text = re.sub(r'(?<=\b[qht])\.', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # 2. ∆ØU TI√äN CAO NH·∫§T: Th√†nh ph·ªë Th·ªß ƒê·ª©c
    # (Bao g·ªìm c√°c ki·ªÉu: tp th·ªß ƒë·ª©c, th√†nh ph·ªë th·ªß ƒë·ª©c, q th·ªß ƒë·ª©c...)
    if any(kw in text for kw in ['th·ªß ƒë·ª©c', 'thu duc']):
        return 'Th√†nh ph·ªë Th·ªß ƒê·ª©c'

    # 3. X·ª¨ L√ù QU·∫¨N S·ªê (Q1, Q 1, Q 01, Qu·∫≠n 1, Qu·∫≠n 01, Q.1...)
    # Gi·∫£i th√≠ch Regex: (?:qu·∫≠n|q) t√¨m 'qu·∫≠n' ho·∫∑c 'q'. \s* cho ph√©p nhi·ªÅu kho·∫£ng tr·∫Øng.
    # ([0-9]{1,2}) b·∫Øt l·∫•y 1 ho·∫∑c 2 ch·ªØ s·ªë.
    match_num = re.search(r'\b(?:qu·∫≠n|q)\s*([0-9]{1,2})\b', text)
    if match_num:
        num = int(match_num.group(1)) # √âp ki·ªÉu int ƒë·ªÉ bi·∫øn 01, 02 th√†nh 1, 2
        return f"Qu·∫≠n {num}"

    # 4. X·ª¨ L√ù QU·∫¨N/HUY·ªÜN T√äN CH·ªÆ (B√¨nh Th·∫°nh, G√≤ V·∫•p, Nh√† B√®...)
    # D√πng Whitelist ƒë·ªÉ tr√°nh b·∫Øt nh·∫ßm "Huy·ªán H√™m" t·ª´ "Xem th√™m"
    dist_map = {
        'b√¨nh th·∫°nh': 'Qu·∫≠n B√¨nh Th·∫°nh',
        'g√≤ v·∫•p': 'Qu·∫≠n G√≤ V·∫•p',
        't√¢n b√¨nh': 'Qu·∫≠n T√¢n B√¨nh',
        't√¢n ph√∫': 'Qu·∫≠n T√¢n Ph√∫',
        'b√¨nh t√¢n': 'Qu·∫≠n B√¨nh T√¢n',
        'ph√∫ nhu·∫≠n': 'Qu·∫≠n Ph√∫ Nhu·∫≠n',
        'b√¨nh ch√°nh': 'Huy·ªán B√¨nh Ch√°nh',
        'h√≥c m√¥n': 'Huy·ªán H√≥c M√¥n',
        'c·ªß chi': 'Huy·ªán C·ªß Chi',
        'nh√† b√®': 'Huy·ªán Nh√† B√®',
        'c·∫ßn gi·ªù': 'Huy·ªán C·∫ßn Gi·ªù',
        'qu·∫≠n 12': 'Qu·∫≠n 12' # ƒê·ªÅ ph√≤ng tr∆∞·ªùng h·ª£p qu·∫≠n s·ªë vi·∫øt ch·ªØ
    }

    for key, value in dist_map.items():
        if key in text:
            return value

    # 5. C√ÅC QU·∫¨N C√íN L·∫†I (Qu·∫≠n 1 ƒë·∫øn Qu·∫≠n 11 - n·∫øu vi·∫øt ki·ªÉu ch·ªØ)
    for i in range(1, 12):
        if f'qu·∫≠n {i}' in text or f'q {i}' in text:
            return f"Qu·∫≠n {i}"

    return np.nan

def clean_project_district(df):
    print("===== B·∫ÆT ƒê·∫¶U CHUY·ªÇN ƒê·ªîI ADDRESS -> DISTRICT =====")

    # T·∫°o mask: Nh·ªØng d√≤ng District ƒëang thi·∫øu NH∆ØNG Address c√≥ d·ªØ li·ªáu
    mask = df['District'].isna() & df['Address'].notna()
    print(f"S·ªë d√≤ng c·∫ßn x·ª≠ l√Ω: {mask.sum()}")

    # Th·ª±c hi·ªán chuy·ªÉn ƒë·ªïi
    extracted_data = df.loc[mask, 'Address'].apply(extract_district_from_address)

    # ƒêi·ªÅn v√†o c·ªôt District
    df.loc[mask, 'District'] = extracted_data

    print("K·∫øt qu·∫£ sau khi x·ª≠ l√Ω:")
    print(df['District'].value_counts())
    print(f"S·ªë d√≤ng v·∫´n c√≤n thi·∫øu District: {df['District'].isna().sum()}")

    return df

# --- TH·ª∞C THI ---
# Gi·∫£ s·ª≠ df_ready l√† dataframe c·ªßa b·∫°n
df_ready = clean_project_district(df_ready)

# Ki·ªÉm tra c√°c m·∫´u d·ªØ li·ªáu th·ª±c t·∫ø b·∫°n ƒë√£ ƒë∆∞a ra
test_indices = [1190, 877, 168, 1350, 644, 1167]
# L·ªçc nh·ªØng index c√≥ t·ªìn t·∫°i trong df_ready ƒë·ªÉ in ra ki·ªÉm tra
existing_indices = [i for i in test_indices if i in df_ready.index]
print("\nKI·ªÇM TRA M·∫™U SAU KHI FIX:")
print(df_ready.loc[existing_indices, ['District', 'Address']])

===== B·∫ÆT ƒê·∫¶U CHUY·ªÇN ƒê·ªîI ADDRESS -> DISTRICT =====
S·ªë d√≤ng c·∫ßn x·ª≠ l√Ω: 2141
K·∫øt qu·∫£ sau khi x·ª≠ l√Ω:
District
Qu·∫≠n B√¨nh Th·∫°nh      1118
Th√†nh ph·ªë Th·ªß ƒê·ª©c     890
Qu·∫≠n G√≤ V·∫•p           586
Qu·∫≠n T√¢n B√¨nh         378
Qu·∫≠n Ph√∫ Nhu·∫≠n        371
Qu·∫≠n B√¨nh T√¢n         348
Qu·∫≠n T√¢n Ph√∫          322
Qu·∫≠n 12               270
Qu·∫≠n 7                257
Qu·∫≠n 10               212
Qu·∫≠n 1                211
Qu·∫≠n 3                177
Huy·ªán H√≥c M√¥n         156
Qu·∫≠n 8                147
Huy·ªán B√¨nh Ch√°nh      111
Qu·∫≠n 5                 97
Qu·∫≠n 6                 91
Huy·ªán Nh√† B√®           76
Qu·∫≠n 11                70
Huy·ªán C·ªß Chi           62
Qu·∫≠n 4                 61
Huy·ªán C·∫ßn Gi·ªù          11
Name: count, dtype: int64
S·ªë d√≤ng v·∫´n c√≤n thi·∫øu District: 549

KI·ªÇM TRA M·∫™U SAU KHI FIX:
           District                                          Address
1190            NaN                              

In [96]:
# 1. L·∫•y danh s√°ch c√°c d√≤ng ƒë√£ ƒë∆∞·ª£c ch·ªânh s·ª≠a (District t·ª´ NaN th√†nh c√≥ gi√° tr·ªã)
# Gi·∫£ s·ª≠ mask_processed l√† nh·ªØng d√≤ng m√† tr∆∞·ªõc ƒë√≥ District b·ªã thi·∫øu
df_fixed = df_ready[df_ready['Address'].notna() & df_ready['District'].notna()].copy()

# 2. L·∫•y danh s√°ch c√°c d√≤ng v·∫´n ch∆∞a th·ªÉ thay ƒë·ªïi (V·∫´n l√† NaN)
df_remain_nan = df_ready[df_ready['District'].isna()].copy()

print("--- TH·ªêNG K√ä K·∫æT QU·∫¢ ---")
print(f"S·ªë d√≤ng ƒë√£ s·ª≠a ƒë√∫ng: {len(df_fixed)}")
print(f"S·ªë d√≤ng v·∫´n c√≤n tr·ªëng: {len(df_remain_nan)}")

# 3. Hi·ªÉn th·ªã b·∫£ng so s√°nh c√°c d√≤ng ƒë√£ s·ª≠a (Top 10 d√≤ng ƒë·∫ßu)
print("\n===== B·∫¢NG 1: C√ÅC D√íNG ƒê√É CH·ªàNH S·ª¨A ƒê√öNG (M·∫™U) =====")
if not df_fixed.empty:
    print(df_fixed[['Address', 'District']].head(10).to_markdown())
else:
    print("Kh√¥ng c√≥ d√≤ng n√†o ƒë∆∞·ª£c ch·ªânh s·ª≠a.")

# 4. Hi·ªÉn th·ªã b·∫£ng c√°c d√≤ng ch∆∞a s·ª≠a ƒë∆∞·ª£c (ƒê·ªÉ ki·ªÉm tra l√Ω do)
print("\n===== B·∫¢NG 2: C√ÅC D√íNG CH∆ØA TH·ªÇ THAY ƒê·ªîI (C·∫¶N KI·ªÇM TRA) =====")
if not df_remain_nan.empty:
    # Ch·ªâ l·∫•y c·ªôt Address ƒë·ªÉ soi l·ªói
    print(df_remain_nan[['Address']].head(10).to_markdown())
else:
    print("Tuy·ªát v·ªùi! Kh√¥ng c√≤n d√≤ng n√†o b·ªã thi·∫øu.")

--- TH·ªêNG K√ä K·∫æT QU·∫¢ ---
S·ªë d√≤ng ƒë√£ s·ª≠a ƒë√∫ng: 6022
S·ªë d√≤ng v·∫´n c√≤n tr·ªëng: 549

===== B·∫¢NG 1: C√ÅC D√íNG ƒê√É CH·ªàNH S·ª¨A ƒê√öNG (M·∫™U) =====
|    | Address                                                   | District          |
|---:|:----------------------------------------------------------|:------------------|
|  0 | Q. B√¨nh T√¢n, Tp. H·ªì Ch√≠ Minh                              | Qu·∫≠n B√¨nh T√¢n     |
|  1 | Q. 3, Tp. H·ªì Ch√≠ Minh                                     | Qu·∫≠n 3            |
|  2 | ƒê. ƒê·∫•t M·ªõi, Q. B√¨nh T√¢n, Tp. H·ªì Ch√≠ Minh                  | Qu·∫≠n B√¨nh T√¢n     |
|  3 | ƒê. Phan VƒÉn Tr·ªã, P. 12, Q. B√¨nh Th·∫°nh, Tp. H·ªì Ch√≠ Minh    | Qu·∫≠n B√¨nh Th·∫°nh   |
|  6 | ƒê. L√™ ƒê√¨nh Th√°m, P. T√¢n Qu√Ω, Q. T√¢n Ph√∫, Tp. H·ªì Ch√≠ Minh  | Qu·∫≠n T√¢n Ph√∫      |
|  7 | ƒê. Nguy·ªÖn H·ªØu C·∫£nh, P. 22, Q. B√¨nh Th·∫°nh, Tp. H·ªì Ch√≠ Minh | Qu·∫≠n B√¨nh Th·∫°nh   |
|  8 | ƒê. Nguy·ªÖn C∆∞ Trinh, P. C√¥ Giang, Q. 1, Tp. H

In [97]:
# 'utf-8-sig' gi√∫p Excel nh·∫≠n di·ªán ƒë√∫ng ti·∫øng Vi·ªát c√≥ d·∫•u
df_fixed.to_csv("../data/cleaned/cleanMey.csv", index=False, encoding='utf-8-sig')

In [107]:
df_fixed.head()

Unnamed: 0,id,Page,Title,Price_Raw,Price_Billion,Price_per_m2,Area_m2,District,Address,Bedrooms,...,Link,Description,M·∫∑t ti·ªÅn,Chi·ªÅu s√¢u,M·ª©c ƒë·ªô giao d·ªãch,∆Øu ƒëi·ªÉm BƒêS,H∆∞·ªõng ban c√¥ng,Ti·ªán √≠ch,N·ªôi th·∫•t,S·ªë ban c√¥ng
0,306038613,1,"Si√™u ph·∫©m nh√† m·∫∑t ti·ªÅn ƒë∆∞·ªùng S·ªë 36 T√™n L·ª≠a, s·ªü...","7,8 t·ª∑",7.8,162.5,48.0,Qu·∫≠n B√¨nh T√¢n,"Q. B√¨nh T√¢n, Tp. H·ªì Ch√≠ Minh",4.0,...,https://meeyland.com/ban-nha-mat-pho-binh-tan-...,"Si√™u ph·∫©m nh√† m·∫∑t ti·ªÅn ƒë∆∞·ªùng S·ªë 36 T√™n L·ª≠a, s·ªü...",4m,12m,C√†ng s·ªõm c√†ng t·ªët,Kinh doanh ƒë∆∞·ª£c,,,,
1,306041109,1,T·ªça l·∫°c t·∫°i v·ªã tr√≠ ƒë·∫Øc ƒë·ªãa tr√™n ƒë∆∞·ªùng C√°ch M·∫°n...,5 t·ª∑,5.0,148.809524,33.6,Qu·∫≠n 3,"Q. 3, Tp. H·ªì Ch√≠ Minh",3.0,...,https://meeyland.com/ban-nha-rieng-quan-3-ho-c...,T·ªça l·∫°c t·∫°i v·ªã tr√≠ ƒë·∫Øc ƒë·ªãa tr√™n ƒë∆∞·ªùng C√°ch M·∫°n...,"4,2m",8m,C√†ng s·ªõm c√†ng t·ªët,Giao th√¥ng thu·∫≠n l·ª£i,,,,
2,306041245,1,Nh√† 2 t·∫ßng t·ªça l·∫°c t·∫°i v·ªã tr√≠ ƒë·∫Øc ƒë·ªãa ngay c·∫°n...,"3,88 t·ª∑",3.88,71.851852,54.0,Qu·∫≠n B√¨nh T√¢n,"ƒê. ƒê·∫•t M·ªõi, Q. B√¨nh T√¢n, Tp. H·ªì Ch√≠ Minh",2.0,...,https://meeyland.com/ban-nha-rieng-binh-tan-ho...,T·ªça l·∫°c t·∫°i v·ªã tr√≠ ƒë·∫Øc ƒë·ªãa ngay c·∫°nh Ch·ª£ L√™ VƒÉ...,4m,"13,5m",C√†ng s·ªõm c√†ng t·ªët,"Kinh doanh ƒë∆∞·ª£c, Giao th√¥ng thu·∫≠n l·ª£i",,,,
3,104652329,1,"BaÃÅn cƒÉn h·ªô d·ªãch v·ª•, haÃ£ 1 tyÃâ g√¢ÃÄn Phan VƒÉn T...",21 t·ª∑,21.0,175.0,120.0,Qu·∫≠n B√¨nh Th·∫°nh,"ƒê. Phan VƒÉn Tr·ªã, P. 12, Q. B√¨nh Th·∫°nh, Tp. H·ªì ...",22.0,...,https://meeyland.com/ban-nha-mat-pho-binh-than...,"BaÃÅn cƒÉn h·ªô d·ªãch v·ª•, haÃ£ 1 tyÃâ g√¢ÃÄn Phan VƒÉn T...","4,7m","25,53m",C√†ng s·ªõm c√†ng t·ªët,"ƒêang c√≥ l·ª£i nhu·∫≠n cao, D√¢n tr√≠ cao, Giao th√¥ng...",,,,
6,104652322,1,"BaÃÅn nhaÃÄ m·∫∑t ti·ªÅn kinh doanh L√™ ƒêiÃÄnh ThaÃÅm, ...","9,3 t·ª∑",9.3,140.909091,66.0,Qu·∫≠n T√¢n Ph√∫,"ƒê. L√™ ƒê√¨nh Th√°m, P. T√¢n Qu√Ω, Q. T√¢n Ph√∫, Tp. H...",3.0,...,https://meeyland.com/ban-nha-mat-pho-tan-phu-h...,"BaÃÅn nhaÃÄ m·∫∑t ti·ªÅn kinh doanh L√™ ƒêiÃÄnh ThaÃÅm, ...","4,2m","15,71m",C√†ng s·ªõm c√†ng t·ªët,"Kinh doanh ƒë∆∞·ª£c, Cho thu√™ gi√° cao, D√¢n tr√≠ cao",,,,


In [108]:
inspect_csv(df_fixed)

===== CSV OVERVIEW =====
Rows    : 6022
Columns : 22

--- Data types ---
id                    int64
Page                  int64
Title                object
Price_Raw            object
Price_Billion       float64
Price_per_m2        float64
Area_m2             float64
District             object
Address              object
Bedrooms             object
Toilets              object
Post_Time            object
Link                 object
Description          object
M·∫∑t ti·ªÅn             object
Chi·ªÅu s√¢u            object
M·ª©c ƒë·ªô giao d·ªãch     object
∆Øu ƒëi·ªÉm BƒêS          object
H∆∞·ªõng ban c√¥ng       object
Ti·ªán √≠ch             object
N·ªôi th·∫•t             object
S·ªë ban c√¥ng         float64
dtype: object

--- Memory usage ---
14.91 MB


In [109]:
report_missing_values(df_fixed)

===== MISSING VALUE REPORT =====
                  Missing_Count  Percent_Missing (%)
S·ªë ban c√¥ng                6009                99.78
N·ªôi th·∫•t                   5997                99.58
H∆∞·ªõng ban c√¥ng             5861                97.33
Ti·ªán √≠ch                   5816                96.58
Chi·ªÅu s√¢u                  3720                61.77
∆Øu ƒëi·ªÉm BƒêS                2762                45.87
M·ª©c ƒë·ªô giao d·ªãch           2692                44.70
M·∫∑t ti·ªÅn                   2281                37.88
Price_per_m2                  0                 0.00
Price_Billion                 0                 0.00
Page                          0                 0.00
id                            0                 0.00
Price_Raw                     0                 0.00
Title                         0                 0.00
Description                   0                 0.00
Link                          0                 0.00
Post_Time                     0  

Unnamed: 0,Missing_Count,Percent_Missing (%)
S·ªë ban c√¥ng,6009,99.78
N·ªôi th·∫•t,5997,99.58
H∆∞·ªõng ban c√¥ng,5861,97.33
Ti·ªán √≠ch,5816,96.58
Chi·ªÅu s√¢u,3720,61.77
∆Øu ƒëi·ªÉm BƒêS,2762,45.87
M·ª©c ƒë·ªô giao d·ªãch,2692,44.7
M·∫∑t ti·ªÅn,2281,37.88
Price_per_m2,0,0.0
Price_Billion,0,0.0


In [110]:
# ==============================
# RE-CALCULATE PRICE PER M2
# ==============================

# ƒë·∫£m b·∫£o numeric
df_fixed['Price_Billion'] = pd.to_numeric(
    df_fixed['Price_Billion'], errors='coerce'
)

df_fixed['Area_m2'] = pd.to_numeric(
    df_fixed['Area_m2'], errors='coerce'
)

# lo·∫°i di·ªán t√≠ch kh√¥ng h·ª£p l·ªá
df_fixed.loc[df_fixed['Area_m2'] <= 0, 'Area_m2'] = np.nan

# t√≠nh gi√° / m2 (tri·ªáu / m2)
df_fixed['Price_per_m2'] = (
    df_fixed['Price_Billion'] * 1000
) / df_fixed['Area_m2']

# clip theo th·ªã tr∆∞·ªùng VN
df_fixed['Price_per_m2'] = df_fixed['Price_per_m2'].clip(
    lower=5,    # min h·ª£p l√Ω
    upper=500   # max h·ª£p l√Ω
)

# üîπ L√ÄM TR√íN 1 CH·ªÆ S·ªê TH·∫¨P PH√ÇN
df_fixed['Price_per_m2'] = df_fixed['Price_per_m2'].round(1)
print("DONE: Price_per_m2 recalculated")

DONE: Price_per_m2 recalculated


In [111]:
df_fixed[['Price_Billion', 'Area_m2', 'Price_per_m2']].describe()


Unnamed: 0,Price_Billion,Area_m2,Price_per_m2
count,6022.0,6022.0,6022.0
mean,15.023591,101.242012,144.494703
std,27.679838,103.944183,93.092337
min,0.3,10.005,5.0
25%,5.39,52.0,83.3
50%,8.5,72.0,126.2
75%,14.5,105.0,183.3
max,700.0,999.0,500.0


In [112]:
df_fixed[
    (df_fixed['Price_Billion'] > 0) &
    (df_fixed['Area_m2'] > 0) &
    (df_fixed['Price_per_m2'].isna())
].shape


(0, 22)

In [113]:
df_fixed.to_csv("../data/cleaned/df_fixed_final.csv",encoding="utf-8-sig", index=False)


In [114]:
report_missing_values(df_fixed)

===== MISSING VALUE REPORT =====
                  Missing_Count  Percent_Missing (%)
S·ªë ban c√¥ng                6009                99.78
N·ªôi th·∫•t                   5997                99.58
H∆∞·ªõng ban c√¥ng             5861                97.33
Ti·ªán √≠ch                   5816                96.58
Chi·ªÅu s√¢u                  3720                61.77
∆Øu ƒëi·ªÉm BƒêS                2762                45.87
M·ª©c ƒë·ªô giao d·ªãch           2692                44.70
M·∫∑t ti·ªÅn                   2281                37.88
Price_per_m2                  0                 0.00
Price_Billion                 0                 0.00
Page                          0                 0.00
id                            0                 0.00
Price_Raw                     0                 0.00
Title                         0                 0.00
Description                   0                 0.00
Link                          0                 0.00
Post_Time                     0  

Unnamed: 0,Missing_Count,Percent_Missing (%)
S·ªë ban c√¥ng,6009,99.78
N·ªôi th·∫•t,5997,99.58
H∆∞·ªõng ban c√¥ng,5861,97.33
Ti·ªán √≠ch,5816,96.58
Chi·ªÅu s√¢u,3720,61.77
∆Øu ƒëi·ªÉm BƒêS,2762,45.87
M·ª©c ƒë·ªô giao d·ªãch,2692,44.7
M·∫∑t ti·ªÅn,2281,37.88
Price_per_m2,0,0.0
Price_Billion,0,0.0


In [115]:
df_fixed['Post_Time'].head(10)
df_fixed['Post_Time'].dtype


dtype('O')

In [116]:
# ==============================
# POST TIME CLEANING
# ==============================

# strip + convert v·ªÅ string
df_fixed['Post_Time'] = df_fixed['Post_Time'].astype(str).str.strip()

# parse datetime (t·ª± nh·∫≠n d·∫°ng format)
df_fixed['Post_Time'] = pd.to_datetime(
    df_fixed['Post_Time'],
    errors='coerce',
    dayfirst=True   # QUAN TR·ªåNG cho d·ªØ li·ªáu VN
)

print("DONE: Post_Time standardized")

DONE: Post_Time standardized


In [117]:
missing_time = df_fixed['Post_Time'].isna().sum()
total = len(df_fixed)

print(f"Post_Time missing: {missing_time} / {total} ({missing_time/total*100:.2f}%)")


Post_Time missing: 0 / 6022 (0.00%)


In [118]:
df_fixed['Post_Year']  = df_fixed['Post_Time'].dt.year
df_fixed['Post_Month'] = df_fixed['Post_Time'].dt.month
df_fixed['Post_Day']   = df_fixed['Post_Time'].dt.day
df_fixed['Post_Weekday'] = df_fixed['Post_Time'].dt.weekday


In [119]:
df_fixed.to_csv(
    "../data/cleaned/df_fixed_final.csv",
    index=False,
    encoding="utf-8-sig"
)
