In [17]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error

In [2]:
df_gemini = pd.read_csv('../filled_data/df_gemini_2_5_flash_v2.csv')
df_llama = pd.read_csv('../filled_data/df_llama3_3_v2.csv')
df_manual = pd.read_csv('../filled_data/df_manual_filled.csv')

In [3]:
df_gemini['cpu_model'] = df_gemini['cpu_model'].replace('apple m1', 'm1')
df_gemini['cpu_model'] = df_gemini['cpu_model'].replace('apple m2', 'm2')
df_gemini['cpu_model'] = df_gemini['cpu_model'].replace('apple m3', 'm3')
df_gemini['cpu_model'] = df_gemini['cpu_model'].replace('apple m4', 'm4')
df_llama['cpu_model'] = df_llama['cpu_model'].replace('apple m1', 'm1')
df_llama['cpu_model'] = df_llama['cpu_model'].replace('apple m2', 'm2')
df_llama['cpu_model'] = df_llama['cpu_model'].replace('apple m3', 'm3')
df_llama['cpu_model'] = df_llama['cpu_model'].replace('apple m4', 'm4')
df_manual['cpu_model'] = df_manual['cpu_model'].replace('apple m1', 'm1')
df_manual['cpu_model'] = df_manual['cpu_model'].replace('apple m2', 'm2')
df_manual['cpu_model'] = df_manual['cpu_model'].replace('apple m3', 'm3')
df_manual['cpu_model'] = df_manual['cpu_model'].replace('apple m4', 'm4')

In [4]:
df_gemini.isnull().any(axis=1).sum()


np.int64(912)

In [5]:
df_gemini.loc[df_gemini['vga_type'] == 'card tích hợp', 'vga_vram_gb'] = 0


In [6]:
df_gemini.isnull().any(axis=1).sum()


np.int64(280)

In [57]:
df_gemini.to_csv('../../output/df_gemini.csv', index=False)

In [7]:
df_llama.isnull().any(axis=1).sum()

np.int64(320)

In [8]:
df_manual.isnull().any(axis=1).sum()

np.int64(0)

Tỉ lệ fill của 2 llms sau 2 lần chạy
- Gemini: (1225 - 280)/1225 * 100. round(2) = 77.14%
- Llama: (1225-320)/ 1225 * 100.round(2) = 73.87%

# Take 20% of filled data to check for accuracy

In [9]:
# Drop every row that have at least one value is NaN
df_llama_clean = df_llama.dropna()
df_gemini_clean = df_gemini.dropna()

# Drop every row that contains "hãng không công bố" hoặc "hãng khác"
def is_valid (row):
    return ~row.astype(str).str.contains("hãng không công bố|hãng khác", case=False).any()

df_gemini_clean = df_gemini_clean[df_gemini_clean.apply(is_valid, axis=1)]
df_llama_clean = df_llama_clean[df_llama_clean.apply(is_valid, axis=1)]

# Fetch the similar index in both files
common_index = df_gemini_clean.index.intersection(df_llama_clean.index)
print(common_index.shape)

# Select 245 (20%) row in common index
selected_index = common_index[:245]
df_gemini_selected = df_gemini_clean.loc[selected_index]
df_llama_selected = df_llama_clean.loc[selected_index]

(837,)


In [10]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', None)


In [21]:
# Categorize columns and set up threshold
outsiders = ['product_id', 'name', 'manufacturer']
categorical_cols = ['ram_type', 'cpu_model', 'vga_type', 'laptop_camera']
numerical_cols = ['root_price_vnd', 'discounted_price_vnd', 'battery_capacity_wh',
                'cpu_threads', 'cpu_base_clock_ghz', 'laptop_height_mm',
                'laptop_width_mm', 'cpu_boost_clock_ghz', 'cpu_cores',
                'ram_speed', 'refresh_rate_hz', 'ram_slots',
                'laptop_depth_mm', 'vga_vram_gb']
price_cols =  ['root_price_vnd', 'discounted_price_vnd']
PRICE_TOLERANCE = 5000000

In [27]:
# Prepare DataFrame
df_manual = df_manual.set_index('product_id', drop=False)
df_gemini_selected = df_gemini.set_index('product_id', drop=False)
df_llama_selected = df_llama.set_index('product_id', drop=False)

In [49]:
def evaluate_fill_performance(df_llm, df_true, cat_cols, num_cols, price_cols, price_tolerance, llm_name):
    result = {}
    print(f"Evaluation for filled data by {llm_name}")
    # Align DataFrame
    df_true_aligned, df_llm_alligned = df_true.align(df_llm, join='inner', axis=0)
    if len(df_true_aligned) == 0:
        print(f"No matching rows between df_ true and df_llm {llm_name}")
        return result
    if len(df_true_aligned) < len(df_true):
        print(f"Only {len(df_true_aligned)}/{len(df_true)} are aligned")
        
    
    # Evaluate categorical features
    cat_accuracies = {}
    for col in cat_cols:
        if col in df_llm_alligned.columns and col in df_true_aligned.columns:
            true_values = df_true_aligned[col].astype(str).fillna("MISSING_MANUAL")
            llm_values = df_llm_alligned[col].astype(str).fillna(f"MISSING_{llm_name.upper()}")
            acc =  accuracy_score(true_values, llm_values)
            cat_accuracies[col] = acc
            print(f"Accuracy for {col}: {acc:.4f}")
        else:
            print(f"Col is not in both 2 DataFrame")
        print("-" *50)
    result[f"{llm_name}_categorical_accuracy"] = cat_accuracies
    if cat_accuracies:
        avg_cat_acc =  np.mean(list(cat_accuracies.values()))
        result[f"{llm_name}_avg_categorical_accuracy"] =  avg_cat_acc
        print(f"Average accuracy for categorical columns is: {avg_cat_acc:.4f}")
    
    # Evaluate numerical cols
    num_metrics = {}
    for col in num_cols:    
        if col in df_llm_alligned.columns and col in df_true_aligned.columns:
            print("Raw true values:", df_true_aligned[col].head(10).tolist())
            print("Raw LLM values:", df_llm_alligned[col].head(10).tolist())
            true_values_num = pd.to_numeric(df_true_aligned[col], errors='coerce')
            llm_values_num = pd.to_numeric(df_llm_alligned[col], errors='coerce')
            print("Numeric true values:", true_values_num.head(10).tolist())
            print("Numeric LLM values:", llm_values_num.head(10).tolist())
            valid_mask = ~true_values_num.isnull() & ~llm_values_num.isnull()
            true_valid = true_values_num[valid_mask]
            llm_valid = llm_values_num[valid_mask]
            
            if len(true_valid) > 0:
                mae = mean_absolute_error(true_valid, llm_valid)
                rmse = np.sqrt(mean_squared_error(true_valid, llm_valid))
                current_col_metrics = {'MAE': mae, 
                                       'RMSE': rmse,
                                       'N_valid': len(llm_valid)}
                print(f"Column name: {col}")
                print(f"MAE: {mae:.6f}")
                print(f"RMSE: {rmse:.6f}")
                print(f"Number of valid value: {len(llm_valid)}")
                if col in price_cols:
                    acc_with_tol = np.mean(np.abs(true_valid - llm_valid <= PRICE_TOLERANCE))
                    current_col_metrics['Accuracy with tolerance'] = acc_with_tol
                    current_col_metrics['Tolerance value'] = PRICE_TOLERANCE
                    print(f"Accuracy with tolerance {PRICE_TOLERANCE} VND: {acc_with_tol:.4f}")
                    num_metrics[col] = current_col_metrics
            else:
                print(f"{col} doesn't have valid value to evaluate ")
                num_metrics[col] = {'MAE': np.nan, 
                                    'RMSE': np.nan,
                                    'N_valid': 0}
            print("-" *50)
        else:
            print(f"{col} is not in both aligned DataFrame")
    result[f"{llm_name}_numerical_metrics"] = num_metrics
    return result
            

In [50]:
result_gemini = evaluate_fill_performance(df_gemini_selected, df_manual, categorical_cols, numerical_cols, price_cols, PRICE_TOLERANCE, "Gemini")


Evaluation for filled data by Gemini
Accuracy for ram_type: 0.9714
--------------------------------------------------
Accuracy for cpu_model: 0.9388
--------------------------------------------------
Accuracy for vga_type: 1.0000
--------------------------------------------------
Accuracy for laptop_camera: 1.0000
--------------------------------------------------
Average accuracy for categorical columns is: 0.9776
Raw true values: [13090000, 12290000, 19990000, 15490000, 18590000, 15490000, 25290000, 19790000, 15490000, 15990000]
Raw LLM values: [14990000.0, 12290000.0, 19990000.0, 17990000.0, 15990000.0, 15490000.0, 25290000.0, 19790000.0, 15490000.0, 15990000.0]
Numeric true values: [13090000, 12290000, 19990000, 15490000, 18590000, 15490000, 25290000, 19790000, 15490000, 15990000]
Numeric LLM values: [14990000.0, 12290000.0, 19990000.0, 17990000.0, 15990000.0, 15490000.0, 25290000.0, 19790000.0, 15490000.0, 15990000.0]
Column name: root_price_vnd
MAE: 927665.306122
RMSE: 3317975.34

In [40]:
result_llama = evaluate_fill_performance(df_llama_selected, df_manual, categorical_cols, numerical_cols, price_cols, PRICE_TOLERANCE, "Llama")

Evaluation for filled data by Llama
Accuracy for ram_type: 0.9918
Accuracy for cpu_model: 1.0000
Accuracy for vga_type: 1.0000
Accuracy for laptop_camera: 1.0000
Average accuracy for categorical columns is: 0.9980
Column name: root_price_vnd
MAE: 1033024.49
RMSE: 3239172.96
Number of valid value: 245
Accuracy with tolerance 5000000 VND: 0.9592
Column name: discounted_price_vnd
MAE: 688751.02
RMSE: 2757493.80
Number of valid value: 245
Accuracy with tolerance 5000000 VND: 0.9551
Column name: battery_capacity_wh
MAE: 0.17
RMSE: 1.92
Number of valid value: 245
Column name: cpu_threads
MAE: 0.18
RMSE: 0.87
Number of valid value: 245
Column name: cpu_base_clock_ghz
MAE: 0.08
RMSE: 0.32
Number of valid value: 245
Column name: laptop_height_mm
MAE: 0.00
RMSE: 0.00
Number of valid value: 245
Column name: laptop_width_mm
MAE: 0.00
RMSE: 0.00
Number of valid value: 245
Column name: cpu_boost_clock_ghz
MAE: 0.00
RMSE: 0.00
Number of valid value: 245
Column name: cpu_cores
MAE: 0.00
RMSE: 0.00
Num

In [51]:
print("Average accuracy for each columns: ")
gemini_avg_cat_acc = result_gemini.get("Gemini_avg_categorical_accuracy", np.nan)
llama_avg_cat_acc = result_llama.get("Llama_avg_categorical_accuracy", np.nan)
print(f"Gemini: {gemini_avg_cat_acc:.4f}" if not np.isnan(gemini_avg_cat_acc) else "Gemini: N/A")
print(f"Llama: {llama_avg_cat_acc:.4f}" if not np.isnan(llama_avg_cat_acc) else "Llama: N/A")
print("="*50)
print("Accuracy in details of categorical columns")
for col in categorical_cols:
    acc_g = result_gemini.get('Gemini_categorical_accuracy',{}).get(col, np.nan)
    acc_l = result_llama.get('Llama_categorical_accuracy',{}).get(col, np.nan)
    print(f"{col}")
    print(f"Gemini: {acc_g:.4f}" if not np.isnan(acc_g) else "Gemini: N/A")
    print(f"Llama: {acc_l:.4f}" if not np.isnan(acc_l) else "Llama: N/A")
    print("-" *50)


print("="*50)
print("Accuracy in details of numerical columns")
for col in numerical_cols:
    metrics_g = result_gemini.get('Gemini_numerical_metrics', {}).get(col, {})
    metrics_l = result_llama.get("Llama_numerical_metrics", {}).get(col, {})

    print(f"{col}")
    # --- Gemini ---
    mae_g = metrics_g.get("MAE", np.nan)
    rmse_g = metrics_g.get("RMSE", np.nan)
    acc_tol_g = metrics_g.get('Accuracy with tolerance', np.nan)
    tol_val_g = metrics_g.get('Tolerance value', PRICE_TOLERANCE)
    
    # --- LLaMA ---
    mae_l = metrics_l.get('MAE', np.nan)
    rmse_l = metrics_l.get('RMSE', np.nan)
    acc_tol_l = metrics_l.get('Accuracy with tolerance', np.nan)
    tol_val_l = metrics_l.get('Tolerance value', PRICE_TOLERANCE)
    
    # --- Gemini ---
    print(f"Gemini - MAE: {mae_g:,.2f}, RMSE: {rmse_g:,.2f}" if not (np.isnan(mae_g) or np.isnan(rmse_g)) else "Gemini - MAE: N/A, RMSE: N/A", end="")    
    if col in price_cols:
        print(f", Acc w/ Tol ({tol_val_g:,.0f}): {acc_tol_g:.4f}" if not np.isnan(acc_tol_g) else f", Acc w/ Tol ({tol_val_g:,.0f}): N/A")
    else:
        print() # Xuống dòng nếu không phải cột giá

    # --- LLaMA ---
    print(f"Llama  - MAE: {mae_l:,.2f}, RMSE: {rmse_l:,.2f}" if not (np.isnan(mae_l) or np.isnan(rmse_l)) else "Llama  - MAE: N/A, RMSE: N/A", end="")
    if col in price_cols:
        print(f", Acc w/ Tol ({tol_val_l:,.0f}): {acc_tol_l:.4f}" if not np.isnan(acc_tol_l) else f", Acc w/ Tol ({tol_val_l:,.0f}): N/A")
    else:
        print()
    print("-" *50)


    
    


Average accuracy for each columns: 
Gemini: 0.9776
Llama: 0.9980
Accuracy in details of categorical columns
ram_type
Gemini: 0.9714
Llama: 0.9918
--------------------------------------------------
cpu_model
Gemini: 0.9388
Llama: 1.0000
--------------------------------------------------
vga_type
Gemini: 1.0000
Llama: 1.0000
--------------------------------------------------
laptop_camera
Gemini: 1.0000
Llama: 1.0000
--------------------------------------------------
Accuracy in details of numerical columns
root_price_vnd
Gemini - MAE: 927,665.31, RMSE: 3,317,975.34, Acc w/ Tol (5,000,000): 0.9837
Llama  - MAE: 1,033,024.49, RMSE: 3,239,172.96, Acc w/ Tol (5,000,000): 0.9592
--------------------------------------------------
discounted_price_vnd
Gemini - MAE: 545,563.27, RMSE: 2,283,305.56, Acc w/ Tol (5,000,000): 0.9837
Llama  - MAE: 688,751.02, RMSE: 2,757,493.80, Acc w/ Tol (5,000,000): 0.9551
--------------------------------------------------
battery_capacity_wh
Gemini - MAE: N/A, RM