In [1]:
import joblib

model_path = "models/logreg_pipeline.pkl"  # adjust path if needed
pipeline = joblib.load(model_path)

print("Type:", type(pipeline))
print("Has predict_proba?", hasattr(pipeline, "predict_proba"))
print("Has classes_?", hasattr(pipeline, "classes_"))


Type: <class 'sklearn.pipeline.Pipeline'>
Has predict_proba? True
Has classes_? True


In [2]:
from pprint import pprint

# Load your model
import joblib
pipeline = joblib.load("models/logreg_pipeline.pkl")

# Show pipeline steps
pprint(pipeline.named_steps)


{'clf': LogisticRegression(max_iter=1000, random_state=42),
 'tfidf': TfidfVectorizer(max_features=5000, stop_words='english')}


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# ── 1) Reload & split data ────────────────────────────────────────────────
df = pd.read_csv("data/raw.csv").drop(columns=["Unnamed: 0"])
X = df["text"]
y = df["class"]

le        = LabelEncoder().fit(y)
y_encoded = le.transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2, random_state=42, stratify=y_encoded
)

# ── 2) Load your saved pipeline ─────────────────────────────────────────
pipeline = joblib.load("models/logreg_pipeline.pkl")

# ── 3) Predict & score ───────────────────────────────────────────────────
y_pred  = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]   # “1” is your suicide class

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nROC AUC:", roc_auc_score(y_test, y_proba))


Classification Report:
              precision    recall  f1-score   support

 non-suicide       0.93      0.94      0.93     23208
     suicide       0.94      0.93      0.93     23207

    accuracy                           0.93     46415
   macro avg       0.93      0.93      0.93     46415
weighted avg       0.93      0.93      0.93     46415


Confusion Matrix:
[[21866  1342]
 [ 1722 21485]]

ROC AUC: 0.9809897585252059


In [4]:
!pip install shap




In [5]:
# ─── CELL: Setup all pipeline components ────────────────────────────────────────
import pandas as pd
import numpy as np
from empath import Empath
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin

# 1. Your existing preprocessing imports (adjust path as needed)
from src.preprocessing       import TextPreprocessingPipeline, contains_emoji, flag_urls, flag_mentions
from src.feature_engineering import build_features

# 2. enrich_df helper
def enrich_df(X):
    if isinstance(X, pd.DataFrame):
        texts = X["text"] if "text" in X.columns else X.iloc[:,0]
    elif hasattr(X, "ravel"):
        texts = pd.Series(X.ravel())
    else:
        texts = pd.Series(X)
    df_ = pd.DataFrame({"text": texts})
    df_["has_emoji"] = df_["text"].apply(contains_emoji)
    df_ = flag_urls(df_)
    df_ = flag_mentions(df_)
    df_ = TextPreprocessingPipeline(text_col="text").run(df_)
    df_ = build_features(df_)
    return df_

# 3. EmpathTransformer
class EmpathTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, cats):
        self.categories = cats
        self.lexicon    = Empath()
    def fit(self, X, y=None): return self
    def transform(self, X):
        rows = []
        for txt in pd.Series(X).ravel():
            scores = self.lexicon.analyze(txt, normalize=True)
            rows.append([scores.get(c,0.0) for c in self.categories])
        return np.array(rows)

empath_cats = ["sadness","negative_emotion","loneliness","depression"]

# 4. Sub-pipelines
text_pipe = Pipeline([
    ("select_clean", FunctionTransformer(lambda df: df["clean_text"], validate=False)),
    ("tfidf",        TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5))
])

meta_cols = ["char_count","word_count","sentiment","has_url","has_emoji","has_reddit_mention"]
meta_pipe = Pipeline([
    ("select_meta", FunctionTransformer(lambda df: df[meta_cols], validate=False)),
    ("scale_meta",  StandardScaler())
])

empath_pipe = Pipeline([
    ("select_clean", FunctionTransformer(lambda df: df["clean_text"], validate=False)),
    ("empath",       EmpathTransformer(empath_cats)),
    ("scale_em",     StandardScaler())
])

# 5. Full enriched pipeline
proto_pipe = Pipeline([
    ("enrich", FunctionTransformer(enrich_df, validate=False)),
    ("union",  FeatureUnion([
        ("tfidf",   text_pipe),
        ("meta",    meta_pipe),
        ("empath",  empath_pipe)
    ])),
    ("clf",    LogisticRegression(class_weight="balanced", max_iter=1000))
])

print("✅ proto_pipe defined with steps:", [name for name,_ in proto_pipe.steps])


✅ proto_pipe defined with steps: ['enrich', 'union', 'clf']


In [7]:
import shap
import pandas as pd

# 1. Sample a background set from your train data
bg_df = X_train.sample(100, random_state=42).to_frame()

# 2. Helper to get numeric features for the classifier
def featurize(df):
    # 2a) Enrich (text→clean_text + meta-flags + counts + sentiment)
    enriched = proto_pipe.named_steps["enrich"].transform(df)
    # 2b) Run FeatureUnion (TF-IDF, meta, Empath) → returns array (n_samples × n_features)
    return proto_pipe.named_steps["union"].transform(enriched)

# 3. Build background feature matrix
bg_feats = featurize(bg_df)

# 4. Initialize the SHAP LinearExplainer for your LR step
explainer = shap.LinearExplainer(
    proto_pipe.named_steps["clf"],  # your LogisticRegression
    bg_feats,                       # background distribution
    feature_dependence="independent"
)

# 5. Pick a test instance to explain
inst_df    = X_test.sample(1, random_state=1).to_frame()
inst_feats = featurize(inst_df)

# 6. Compute SHAP values
#    For binary classification, shap_vals has shape (2, n_features).
shap_vals = explainer.shap_values(inst_feats)

# 7. Get feature names from your FeatureUnion
feat_names = proto_pipe.named_steps["union"].get_feature_names_out()

# 8. Visualize with a waterfall plot (class 1 = “suicide”)
shap.initjs()
shap.plots.waterfall(
    shap.Explanation(
        values        = shap_vals[1][0],         # class-1 shap values for sample 0
        base_values   = explainer.expected_value[1],
        data          = inst_feats[0],
        feature_names = feat_names
    )
)


NotFittedError: The TF-IDF vectorizer is not fitted

In [8]:
import joblib
pipeline = joblib.load("models/logreg_pipeline.pkl")
print(pipeline.named_steps.keys())


dict_keys(['tfidf', 'clf'])
