# Cell 1: imports and paths (exported notebook prompt "In [None]:")
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import joblib, json, os


# File locations used by the rest of the script (relative to the working
# directory).
EMB = 'outputs/image_embeddings_clean.csv'          # read as the feature table
LABELS = 'outputs/moderation_labels.csv'            # read as the label table
OUT_JSON = 'outputs/moderation.json'                # per-file predictions are written here
MODEL_OUT = 'models/moderation_model_improved.pkl'  # fitted model bundle is written here


# Make sure the model directory is present before anything is saved into it;
# an already-existing directory is not an error.
os.makedirs(name='models', exist_ok=True)


# Cell 2: load data
# Both tables are joined on their shared 'filename' column. Every column of the
# embeddings table other than 'filename' is used as a feature below
# (presumably all numeric -- confirm against the embeddings export).
emb = pd.read_csv(EMB)
labels = pd.read_csv(LABELS)
# Take only the join key and the target from the labels table, so that any
# additional columns it may carry cannot end up inside the feature matrix X.
df = emb.merge(labels[['filename', 'label']], on='filename', how='left')
# The left join keeps images that have no label row; they are assigned class 0.
# Report how many rows that affects so the default is not applied silently.
n_unlabeled = int(df['label'].isna().sum())
if n_unlabeled:
    print('Warning:', n_unlabeled, 'of', len(df), 'rows have no label; defaulting to 0')
df['label'] = df['label'].fillna(0).astype(int)
# Feature matrix: everything except the key and the target.
X = df.drop(columns=['filename', 'label']).values
y = df['label'].values


# Cell 3: train/test split
# StandardScaler and PCA are used below but are not among the imports visible
# at the top of the script, so bring them into scope here.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Split the raw features BEFORE fitting any preprocessing, so that the held-out
# rows do not influence the scaler statistics or the PCA basis. The split
# arguments (test_size, random_state, stratify) are unchanged.
Xtr_raw, Xte_raw, ytr, yte, fn_tr, fn_te = train_test_split(
    X, y, df['filename'], test_size=0.2, random_state=42, stratify=y
)


# Cell 4: preprocessing (fit on the training rows only)
scaler = StandardScaler()
Xtr_s = scaler.fit_transform(Xtr_raw)
# PCA for stability. The component count is capped at 100, at the number of
# features, and at (training rows - 1).
n_comp = min(100, Xtr_s.shape[1], Xtr_s.shape[0] - 1)
pca = PCA(n_components=n_comp, random_state=42)
Xtr = pca.fit_transform(Xtr_s)
# Held-out rows are only transformed with the already-fitted scaler and PCA.
Xte = pca.transform(scaler.transform(Xte_raw))

# Scaled / projected versions of the full dataset, kept under their original
# names for any later code that still refers to them.
X_s = scaler.transform(X)
X_p = pca.transform(X_s)


# Cell 5: train models
# Two candidate classifiers are fitted on the same training split; both use
# class_weight='balanced'.
# The 'saga' solver shuffles the data internally, so random_state is pinned to
# make the fit reproducible, matching the seed used for the PCA, the split and
# the random forest.
clf = LogisticRegression(
    max_iter=5000,
    class_weight='balanced',
    solver='saga',
    random_state=42,
)
clf.fit(Xtr, ytr)
rfc = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rfc.fit(Xtr, ytr)


# Cell 6: evaluate
# Column 1 of predict_proba is the score for the second entry of the model's
# classes_ (class 1 when the labels are 0/1 -- confirm against the label file).
proba_lr = clf.predict_proba(Xte)[:, 1]
proba_rf = rfc.predict_proba(Xte)[:, 1]

# AUC is only computed when the held-out labels contain more than one distinct
# value; otherwise both scores are recorded as None.
if len(set(yte)) > 1:
    auc_lr = roc_auc_score(yte, proba_lr)
    auc_rf = roc_auc_score(yte, proba_rf)
else:
    auc_lr = auc_rf = None
print('AUC LR:', auc_lr, 'AUC RF:', auc_rf)

# Per-class report for each model, with scores thresholded at 0.5.
for header, scores in (('LR report:\n', proba_lr), ('RF report:\n', proba_rf)):
    hard_preds = (scores >= 0.5).astype(int)
    print(header, classification_report(yte, hard_preds))


# Cell 7: choose and save best model (by AUC)
# A missing AUC (None) is treated as 0 for the comparison; on a tie the
# logistic regression is kept.
score_lr = auc_lr or 0
score_rf = auc_rf or 0
if score_lr >= score_rf:
    chosen = clf
else:
    chosen = rfc

# The fitted preprocessing objects are stored next to the model in one bundle.
bundle = {'model': chosen, 'scaler': scaler, 'pca': pca}
joblib.dump(bundle, MODEL_OUT)
print('Saved model to', MODEL_OUT)


# Cell 8: produce outputs/moderation.json for all files
# Score every row of X (not only the held-out split) with the chosen model,
# applying the same scaler -> PCA chain used for training.
probs = chosen.predict_proba(pca.transform(scaler.transform(X)))[:, 1]
preds = (probs >= 0.5).astype(int)
# One entry per filename: a safe/unsafe verdict at the 0.5 threshold plus the
# raw score. Values are converted to plain Python types so json can serialize
# them. If a filename occurs more than once in df, the last row wins.
out = {
    fname: {
        'moderation': 'unsafe' if int(p) == 1 else 'safe',
        'prob_unsafe': float(pr),
    }
    for fname, p, pr in zip(df['filename'], preds, probs)
}
with open(OUT_JSON, 'w') as f:
    json.dump(out, f, indent=2)
print('Saved', OUT_JSON)


# Cell 9: show up to 8 images that were flagged unsafe (optional)
import matplotlib.pyplot as plt
from PIL import Image

# The first 8 flagged filenames in the iteration order of `out`; no ranking by
# score is applied.
sample_unsafe = [k for k, v in out.items() if v['moderation'] == 'unsafe'][:8]
print('Sample unsafe:', sample_unsafe)
for fn in sample_unsafe:
    # Open each image in a context manager so its file handle is released once
    # the figure has been drawn.
    with Image.open('datasets/images/' + fn) as img:
        plt.figure(figsize=(3, 3))
        plt.imshow(img)
        plt.axis('off')