In [None]:
import os

import boto3
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Data locations are driven by environment variables so the notebook can run
# both locally and against S3 without code edits.
DATA_DIR = os.environ.get("DATA_DIR", "/tmp/aai540")
local_path = os.path.join(DATA_DIR, "engineered_data_sample.parquet")

# The S3 location is only defined when BUCKET_NAME is set to a non-empty value;
# otherwise it stays None.
bucket = os.environ.get("BUCKET_NAME")
s3_path = None
if bucket:
    s3_path = f"s3://{bucket}/curated/backblaze_parquet/"

print("Local path:", local_path)
print("S3 path:", s3_path)


In [None]:
# Load the dataset into `df`, preferring the S3 location when one is configured
# and falling back to the local parquet file otherwise. `source` records which
# location was actually used.
df = None
source = None

if s3_path:
    try:
        df = pd.read_parquet(s3_path)
        source = s3_path
    except Exception as e:
        # Intentionally broad: the S3 read is best-effort and any failure should
        # fall through to the local copy. The message is included so the cause
        # can be diagnosed from the cell output.
        print(f"S3 read failed ({type(e).__name__}: {e}). Falling back to local file.")

if df is None:
    # Fail with an actionable message instead of a bare reader error when the
    # local fallback is missing.
    if not os.path.exists(local_path):
        raise FileNotFoundError(
            f"Local dataset not found at {local_path}. "
            "Set DATA_DIR to the directory containing it, or set BUCKET_NAME to read from S3."
        )
    df = pd.read_parquet(local_path)
    source = local_path

print("Loaded from:", source)
df.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588687728923,0,True
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588615855070,1,True
2,5.0,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,1589665266052,2,True
3,1.0,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,1643393630220,0,True
4,5.0,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,1609322563534,0,True


In [None]:
# Quick structural overview of the loaded frame.
frame_shape = df.shape
print("Rows, cols:", frame_shape)

# Only the first 20 column names are listed so wide frames stay readable.
column_preview = list(df.columns)[:20]
print("Columns sample:", column_preview)

# The mean of the label column is reported only when that column is present.
has_failure_col = "failure" in df.columns
if has_failure_col:
    failure_rate = df["failure"].mean()
    print("Failure rate:", failure_rate)


Saved: /tmp/aai540/all_beauty_reviews.parquet


In [None]:
# Name of the label column the classifier is trained to predict.
TARGET = "failure"
target_present = TARGET in df.columns
if not target_present:
    raise ValueError("Expected 'failure' column not found in Backblaze data.")

# Labels, and the feature frame with the label plus two excluded columns removed.
# errors="ignore" lets the drop succeed when "serial_number" / "date" are absent.
y = df[TARGET]
excluded_cols = [TARGET, "serial_number", "date"]
X = df.drop(columns=excluded_cols, errors="ignore")

# Object/category columns are one-hot encoded; every remaining column is
# treated as numeric and handed to the model unchanged.
categorical_cols = X.select_dtypes(include=["object", "category"]).columns
numeric_cols = X.columns.difference(categorical_cols)

# handle_unknown="ignore" keeps transform from failing on categories that were
# not seen during fit.
# NOTE(review): passthrough columns are neither imputed nor scaled here —
# confirm the input has no missing values before fitting a linear model.
one_hot = OneHotEncoder(handle_unknown="ignore")
column_steps = [
    ("cat", one_hot, categorical_cols),
    ("num", "passthrough", numeric_cols),
]
preprocess = ColumnTransformer(transformers=column_steps)




AWS credentials not found. Skipping S3 upload.
Set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY (and AWS_SESSION_TOKEN if needed) and rerun this cell.


In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Logistic regression with class weighting set to "balanced" and a raised
# iteration cap of 1000.
classifier = LogisticRegression(max_iter=1000, class_weight="balanced")

# Preprocessing and the estimator are chained so the column transforms are
# fitted on the training split only and re-applied at predict time.
pipeline_steps = [
    ("preprocess", preprocess),
    ("model", classifier),
]
model = Pipeline(steps=pipeline_steps)

# Kept as the last expression so the fitted pipeline is displayed by the cell.
model.fit(X_train, y_train)


S3 read failed (OSError). Falling back to local file.
Loaded from local: /tmp/aai540/all_beauty_reviews.parquet


In [None]:
# Predict labels for the held-out split with the fitted pipeline; the result
# is consumed by the evaluation cells that follow.
y_pred = model.predict(X_test)


In [None]:
# Text report of per-class metrics for the held-out predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)


In [None]:
# Confusion matrix for the held-out split. In scikit-learn's layout, rows are
# the true labels and columns are the predicted labels.
# `confusion_matrix` is imported in the notebook's import cell alongside the
# other sklearn.metrics names, so no import is needed here.
print(confusion_matrix(y_test, y_pred))


In [None]:
# Peek at the first rows of the label column; the double brackets keep it a
# one-column DataFrame rather than a Series.
target_preview = df[[TARGET]].head()
target_preview


              precision    recall  f1-score   support

           0       0.87      1.00      0.93       174
           1       0.00      0.00      0.00        26

    accuracy                           0.87       200
   macro avg       0.43      0.50      0.47       200
weighted avg       0.76      0.87      0.81       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
