In [1]:
###https://www.opendatabay.com/data/healthcare/9182606a-0bc1-4cb5-8700-73bf7f9fd525

import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    average_precision_score,
)
df = pd.read_csv('../data/Suicide_Detection.csv')

In [2]:
posts = df['text']
labels = df['class'].apply(lambda x: 1 if x == 'suicide' else 0)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(posts)
y = labels

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = LogisticRegression().fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]



In [3]:
threshold = 0.2
y_pred = (y_pred_proba >= threshold)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
avg_precision = average_precision_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall (Sensitivity/True Positive Rate): {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Average Precision Score (PR AUC): {avg_precision:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 0.9127
Precision: 0.8657
Recall (Sensitivity/True Positive Rate): 0.9768
F1-Score: 0.9179
ROC AUC Score: 0.9127
Average Precision Score (PR AUC): 0.8572

Confusion Matrix:
[[24633  4393]
 [  674 28319]]


In [4]:

vectorized = vectorizer.transform(["""I am really sad. help me overcome this sadness"""])
predict_probas = model.predict_proba(vectorized)
bools = (predict_probas >= threshold)
print('with threshold: ',bools)
print('with threshold: ',predict_probas)
if all(bools[0]):
    if predict_probas[0][0] > predict_probas[0][1]:
        print('with threshold: ', [0])
    else: print('with threshold: ',[1])
elif all(bool == False for bool in bools[0]):
    print('Both False')
elif any(bools[0]):
    if predict_probas[0][0] > predict_probas[0][1]:
        print('with threshold: ',[0])
    else: print('with threshold: ',[1])
print('without threshold: ',model.predict(vectorized))


with threshold:  [[ True  True]]
with threshold:  [[0.66096499 0.33903501]]
with threshold:  [0]
without threshold:  [0]
