In [46]:
import pandas as pd
import glob
import os
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularPredictor, TabularPredictor
import warnings
warnings.filterwarnings('ignore')

In [47]:
def concat_csv_with_prefix(folder_path: str, prefix: str) -> pd.DataFrame:
    """
    Reads all CSV files in *folder_path* that start with *prefix*,
    concatenates them into a single DataFrame, and returns it.

    Parameters
    ----------
    folder_path : str
        Path to the directory containing the CSV files.
    prefix : str
        Prefix that the target CSV filenames must start with.

    Returns
    -------
    pd.DataFrame
        Concatenated DataFrame containing all rows from the matched files.
    """
    # Build glob pattern: e.g. "data/reviews_*.csv"
    pattern = os.path.join(folder_path, f"{prefix}*.csv")
    csv_files = sorted(glob.glob(pattern))

    if not csv_files:
        raise FileNotFoundError(f"No CSV files found with prefix '{prefix}' in {folder_path}")

    # Read each file into a DataFrame
    df_list = [pd.read_csv(f) for f in csv_files]

    # Concatenate all DataFrames, resetting the index
    return pd.concat(df_list, ignore_index=True)


In [48]:
df = concat_csv_with_prefix("data", "review")
df.head()

Unnamed: 0,product_name,product_category,review_size,type,content,review,stars
0,Samsung W-3000,WASHING_MACHINE,Long,Positive,['Functionality'],"I recently upgraded to the Samsung W-3000 and I have to say, its functionality is top‑notch.\nThe load‑sensing technology automatically adjusts water usage, which has saved me a surprising amount on my monthly bills.\nI love how the machine offers a wide range of wash cycles, from quick 15‑minute cleans to deep‑tissue steam options, so I can tackle any fabric type with confidence.\nThe quiet motor is almost whisper‑quiet, even during the spin cycle, which is a huge plus for our open‑plan living space.\nFinally, the intuitive touchscreen interface makes it a breeze to set up custom wash pro...",5
1,Panasonic W-7000,WASHING_MACHINE,Small,Negative,['Delivery' 'Delivery' 'Functionality'],I was disappointed with the Panasonic W-7000 because the delivery was delayed and the machine stopped working after just a week.,1
2,Electrolux L3-400,REFRIGERATOR,Long,Positive,['Price' 'Product appearance'],"I was pleasantly surprised by the Electrolux L3‑400’s sleek design – the stainless steel finish looks like it belongs in a modern kitchen, and the minimalist door handles give it a very premium feel. The price point is surprisingly reasonable for a unit of this size and quality; I felt like I was getting a lot of value for my money. Inside, the spacious compartments and adjustable shelves make it easy to organize everything from groceries to large containers. The quiet operation is a bonus, especially when the fridge is in the living area. Overall, I’m thrilled with both the look and t...",5
3,Panasonic Q2-300,REFRIGERATOR,Small,Positive,['Functionality'],"The Panasonic Q2-300 keeps my groceries fresh and the temperature control is spot‑on, making it a standout in terms of functionality.",5
4,Pottery Barn S-2100,SOFA,Long,Negative,['Price' 'Functionality'],"I was excited to add the Pottery Barn S-2100 to my living room, but it turned out to be a major disappointment.\nThe price tag alone felt like a stretch for what the sofa offers—it's far more expensive than comparable models from other brands.\nFunctionally, the cushions sag quickly, and the back support is almost nonexistent, making it uncomfortable after a short sit.\nThe fabric also shows stains after just a few weeks of use, which is unacceptable for a piece marketed as premium.\nOverall, I regret the purchase and would not recommend this sofa to anyone on a budget or looking for durab...",1


In [49]:
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['type']
)

In [50]:
predictor = TabularPredictor(label="type").fit(df_train)

No path specified. Models will be saved in: "AutogluonModels/ag-20260131_005049"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.12.3
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun  5 18:30:46 UTC 2025
CPU Count:          12
Pytorch Version:    2.9.1+cu128
CUDA Version:       12.8
GPU Memory:         GPU 0: 15.92/15.92 GB
Total GPU Memory:   Free: 15.92 GB, Allocated: 0.00 GB, Total: 15.92 GB
GPU Count:          1
Memory Avail:       17.70 GB / 23.47 GB (75.4%)
Disk Space Avail:   839.33 GB / 1006.85 GB (83.4%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='extreme'  : New in v1.5: The state-of-the-art for tabular data. Massively better than 'best' on datasets <100000 sa

In [51]:
df_predicted = predictor.predict(df_test.drop(columns=["type"]))

In [52]:
df_predicted.head()

1707    Negative
2189    Positive
789     Negative
140     Positive
775     Positive
Name: type, dtype: object

In [53]:
df_test.head()["type"]

1707    Negative
2189    Positive
789     Negative
140     Positive
775     Positive
Name: type, dtype: object

In [54]:
predictor.leaderboard(df_test)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,XGBoost,1.0,1.0,accuracy,0.010309,0.004686,0.640777,0.010309,0.004686,0.640777,1,True,9
1,NeuralNetFastAI,1.0,1.0,accuracy,0.011504,0.006367,0.861364,0.011504,0.006367,0.861364,1,True,8
2,WeightedEnsemble_L2,1.0,1.0,accuracy,0.012513,0.006748,0.886524,0.001009,0.000381,0.02516,2,True,12
3,ExtraTreesGini,1.0,1.0,accuracy,0.112879,0.109317,0.819257,0.112879,0.109317,0.819257,1,True,6
4,RandomForestEntr,1.0,1.0,accuracy,0.116036,0.110259,0.741213,0.116036,0.110259,0.741213,1,True,4
5,ExtraTreesEntr,1.0,1.0,accuracy,0.116723,0.111848,0.742081,0.116723,0.111848,0.742081,1,True,7
6,RandomForestGini,1.0,1.0,accuracy,0.117492,0.111526,0.754551,0.117492,0.111526,0.754551,1,True,3
7,LightGBMLarge,0.997778,1.0,accuracy,0.004012,0.002876,1.077009,0.004012,0.002876,1.077009,1,True,11
8,LightGBMXT,0.997778,1.0,accuracy,0.004064,0.004075,0.702559,0.004064,0.004075,0.702559,1,True,1
9,LightGBM,0.997778,1.0,accuracy,0.004419,0.003048,0.533757,0.004419,0.003048,0.533757,1,True,2


In [55]:
predictor.path

'/home/jef/repos/rating-evaluation/AutogluonModels/ag-20260131_005049'

In [56]:
predictor.plot_ensemble_model()

'/home/jef/repos/rating-evaluation/AutogluonModels/ag-20260131_005049/ensemble_model.png'

In [57]:
predictor.problem_type

'binary'

In [58]:
print(predictor.feature_metadata)

('category', [])                    :    3 | ['product_name', 'product_category', 'review_size']
('category', ['text_as_category'])  :    2 | ['content', 'review']
('int', [])                         :    1 | ['stars']
('int', ['binned', 'text_special']) :   26 | ['content.char_count', 'content.word_count', 'content.capital_ratio', 'content.lower_ratio', 'content.special_ratio', ...]
('int', ['text_ngram'])             : 1206 | ['__nlp__.about', '__nlp__.about the', '__nlp__.absolute', '__nlp__.absolute nightmare', '__nlp__.absolutely', ...]


In [59]:
predictor.transform_features(df_test).head()

Unnamed: 0,stars,product_name,product_category,review_size,content,review,content.char_count,content.word_count,content.capital_ratio,content.lower_ratio,...,__nlp__.worth the,__nlp__.would,__nlp__.would be,__nlp__.would not,__nlp__.wouldn,__nlp__.yet,__nlp__.you,__nlp__.you get,__nlp__.you re,__nlp__._total_
1707,2,64,5,2,10,,5,2,2,6,...,0,0,0,0,0,0,0,0,0,52
2189,4,6,3,1,32,,0,0,9,0,...,0,0,0,0,0,0,0,0,0,69
789,2,91,7,0,21,,1,0,1,4,...,0,0,0,0,0,0,0,0,0,28
140,5,47,0,1,24,,2,1,8,0,...,0,0,0,0,0,0,0,0,0,49
775,5,64,5,1,22,,5,2,7,2,...,0,0,0,0,0,0,0,0,0,68


In [60]:
predictor.feature_importance(df_test)

Computing feature importance via permutation shuffling for 6 features using 450 rows with 5 shuffle sets...
	2.08s	= Expected runtime (0.42s per shuffle set)
	1.32s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
stars,0.494222,0.010135,2.120925e-08,5,0.51509,0.473354
review,0.004,0.000994,0.0004219163,5,0.006046,0.001954
content,0.000889,0.001217,0.0889039,5,0.003395,-0.001617
product_name,0.0,0.0,0.5,5,0.0,0.0
product_category,0.0,0.0,0.5,5,0.0,0.0
review_size,0.0,0.0,0.5,5,0.0,0.0


In [61]:
predictor.model_best

'WeightedEnsemble_L2'

In [62]:
predictor.evaluate(df_test)

{'accuracy': 1.0,
 'balanced_accuracy': np.float64(1.0),
 'mcc': 1.0,
 'roc_auc': np.float64(1.0),
 'f1': 1.0,
 'precision': 1.0,
 'recall': 1.0}