In [1]:
# Standard library
import glob
import os
import warnings

# Third-party
import pandas as pd
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularPredictor

# Suppress all warnings from here on so cell outputs stay short.
# NOTE(review): this also hides deprecation and convergence warnings.
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def concat_csv_with_prefix(folder_path: str, prefix: str) -> pd.DataFrame:
    """
    Read every CSV file in *folder_path* whose name starts with *prefix*
    and return all of their rows as a single DataFrame.

    Files are read in sorted path order and stacked row-wise; the result gets
    a fresh 0..n-1 index. Both *folder_path* and *prefix* are matched
    literally: glob metacharacters (``*``, ``?``, ``[``) inside them are
    escaped, so a prefix such as ``"rev[1]"`` only matches files whose names
    really begin with those characters.

    Parameters
    ----------
    folder_path : str
        Path to the directory containing the CSV files.
    prefix : str
        Prefix that the target CSV filenames must start with.

    Returns
    -------
    pd.DataFrame
        Concatenated DataFrame containing all rows from the matched files.

    Raises
    ------
    FileNotFoundError
        If no file in *folder_path* matches ``<prefix>*.csv``.
    """
    # Escape user-supplied parts so only the trailing "*" acts as a wildcard,
    # e.g. "data/review*.csv".
    pattern = os.path.join(glob.escape(folder_path), f"{glob.escape(prefix)}*.csv")
    # Sorting makes the row order independent of the filesystem's listing order.
    csv_files = sorted(glob.glob(pattern))

    if not csv_files:
        raise FileNotFoundError(f"No CSV files found with prefix '{prefix}' in {folder_path}")

    # Read each file into its own DataFrame, then stack them in one concat call.
    frames = [pd.read_csv(csv_path) for csv_path in csv_files]
    return pd.concat(frames, ignore_index=True)


In [3]:
# Load every "review*.csv" file under data/ into one frame and preview it.
df = concat_csv_with_prefix(folder_path="data", prefix="review")
df.head()

Unnamed: 0,product_name,product_category,review_size,type,content,review,stars
0,Samsung W-3000,WASHING_MACHINE,Long,Positive,['Functionality'],I recently upgraded to the Samsung W-3000 and ...,5
1,Panasonic W-7000,WASHING_MACHINE,Small,Negative,['Delivery' 'Delivery' 'Functionality'],I was disappointed with the Panasonic W-7000 b...,1
2,Electrolux L3-400,REFRIGERATOR,Long,Positive,['Price' 'Product appearance'],I was pleasantly surprised by the Electrolux L...,5
3,Panasonic Q2-300,REFRIGERATOR,Small,Positive,['Functionality'],The Panasonic Q2-300 keeps my groceries fresh ...,5
4,Pottery Barn S-2100,SOFA,Long,Negative,['Price' 'Functionality'],I was excited to add the Pottery Barn S-2100 t...,1


In [4]:
# Keep only the review text and the label column used for training.
df = df.loc[:, ['review', 'type']]

In [5]:
# Hold out 20% of the rows for testing. Stratifying on `type` keeps the label
# proportions the same in both parts; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['type']
)

In [6]:
# Fit a predictor for the `type` column on the training split; no presets or
# output path are passed here.
predictor = TabularPredictor(
    label="type",
).fit(train_data=df_train)

No path specified. Models will be saved in: "AutogluonModels/ag-20260131_224624"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.12.3
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun  5 18:30:46 UTC 2025
CPU Count:          12
Pytorch Version:    2.9.1+cu128
CUDA Version:       12.8
GPU Memory:         GPU 0: 15.92/15.92 GB
Total GPU Memory:   Free: 15.92 GB, Allocated: 0.00 GB, Total: 15.92 GB
GPU Count:          1
Memory Avail:       17.61 GB / 23.47 GB (75.0%)
Disk Space Avail:   839.08 GB / 1006.85 GB (83.3%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='extreme'  : New in v1.5: The state-of-the-art for tabular data. Massively better than 'best' on datasets <100000 sa

In [7]:
# Predict on the held-out rows with the label column removed.
df_predicted = predictor.predict(data=df_test.drop(columns="type"))

In [8]:
# Preview the first predicted labels.
df_predicted.head()

1707    Negative
2189    Positive
789     Negative
140     Positive
775     Positive
Name: type, dtype: object

In [9]:
# True labels for the first held-out rows, for comparison with the predictions.
df_test["type"].head()

1707    Negative
2189    Positive
789     Negative
140     Positive
775     Positive
Name: type, dtype: object

In [10]:
# Per-model scores and timings, evaluated on the held-out split.
predictor.leaderboard(data=df_test)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesEntr,0.991111,0.997222,accuracy,0.121642,0.111867,0.747021,0.121642,0.111867,0.747021,1,True,7
1,ExtraTreesGini,0.988889,0.994444,accuracy,0.115358,0.111237,0.955261,0.115358,0.111237,0.955261,1,True,6
2,CatBoost,0.986667,0.997222,accuracy,0.019312,0.016137,1.822124,0.019312,0.016137,1.822124,1,True,5
3,RandomForestGini,0.986667,0.991667,accuracy,0.11862,0.109818,0.800567,0.11862,0.109818,0.800567,1,True,3
4,RandomForestEntr,0.986667,0.994444,accuracy,0.121053,0.110705,0.730436,0.121053,0.110705,0.730436,1,True,4
5,LightGBM,0.982222,0.997222,accuracy,0.006859,0.004838,0.539779,0.006859,0.004838,0.539779,1,True,2
6,WeightedEnsemble_L2,0.982222,0.997222,accuracy,0.008492,0.005161,0.569671,0.001633,0.000323,0.029892,2,True,12
7,LightGBMXT,0.982222,0.994444,accuracy,0.010037,0.004478,0.701447,0.010037,0.004478,0.701447,1,True,1
8,XGBoost,0.971111,0.986111,accuracy,0.013737,0.00412,1.165107,0.013737,0.00412,1.165107,1,True,9
9,LightGBMLarge,0.964444,0.977778,accuracy,0.013343,0.004316,1.131653,0.013343,0.004316,1.131653,1,True,11


In [11]:
# Print the feature metadata held by the fitted predictor.
print(predictor.feature_metadata)

('category', ['text_as_category'])  :    1 | ['review']
('int', ['binned', 'text_special']) :   20 | ['review.char_count', 'review.word_count', 'review.capital_ratio', 'review.lower_ratio', 'review.digit_ratio', ...]
('int', ['text_ngram'])             : 1159 | ['__nlp__.about', '__nlp__.about the', '__nlp__.absolute', '__nlp__.absolute nightmare', '__nlp__.absolutely', ...]


In [12]:
# Feature importance computed on the held-out split.
predictor.feature_importance(data=df_test)

Computing feature importance via permutation shuffling for 1 features using 450 rows with 5 shuffle sets...
	0.52s	= Expected runtime (0.1s per shuffle set)
	0.33s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
review,0.48,0.009027,1.500132e-08,5,0.498586,0.461414


In [13]:
# Evaluation metrics of the predictor on the held-out split.
predictor.evaluate(data=df_test)

{'accuracy': 0.9822222222222222,
 'balanced_accuracy': np.float64(0.9822783309625416),
 'mcc': 0.9644804455240376,
 'roc_auc': np.float64(0.9986170380907222),
 'f1': 0.9820627802690582,
 'precision': 0.9776785714285714,
 'recall': 0.9864864864864865}

# Path of the saved predictor file

In [16]:
# Join the predictor's output directory with its predictor file name.
os.path.join(predictor.path, predictor.predictor_file_name)

'/home/jef/repos/rating-evaluation/AutogluonModels/ag-20260131_224624/predictor.pkl'