In [2]:
# LINEAR SVM TRAINING

import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# --- Configuration -----------------------------------------------------------
# Every tunable value lives here. SEED is shared by the train/test split and
# the classifier so that a fresh "Restart & Run All" reproduces the same
# model and the same reported metrics.
SEED = 42
TEST_SIZE = 0.2
MAX_FEATURES = 5000
NGRAM_RANGE = (1, 2)
SVM_C = 1.0

data_file = "suicide_detection_rare_word_removed.csv"
MODEL_PATH = "linear_svm_model.pkl"
VECTORIZER_PATH = "tfidf_vectorizer_svm.pkl"

# --- Load data ---------------------------------------------------------------
df = pd.read_csv(data_file)

print("Dataset loaded successfully")

# pandas parses empty CSV cells as NaN, and a bare astype(str) would turn those
# into the literal string "nan", which the TF-IDF tokenizer then treats as a
# real word. Replace missing text with an empty document before casting.
# NOTE(review): whether this file actually contains empty "text" cells is not
# visible here; the guard is a no-op if it does not.
df["text"] = df["text"].fillna("").astype(str)

X = df["text"]
y = df["class"]

# --- Train/test split --------------------------------------------------------
# Stratify on the label so both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=y,
)

print("Train/Test split completed")

# --- TF-IDF features ---------------------------------------------------------
tfidf = TfidfVectorizer(
    max_features=MAX_FEATURES,
    ngram_range=NGRAM_RANGE,
)

# Fit the vocabulary/IDF weights on the training split only, then reuse them
# for the test split so no test-set statistics leak into the features.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("TF-IDF vectorization completed")

# --- Linear SVM --------------------------------------------------------------
# LinearSVC's dual solver shuffles samples using a pseudo-random generator;
# seeding it makes the fitted coefficients repeatable across runs.
svm_model = LinearSVC(
    C=SVM_C,
    random_state=SEED,
)

svm_model.fit(X_train_tfidf, y_train)

print("Linear SVM training completed")

# --- Evaluation on the held-out split ----------------------------------------
y_pred = svm_model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --- Persist artifacts -------------------------------------------------------
# The classifier and the fitted vectorizer are saved as separate files; both
# are needed together to score new text.
joblib.dump(svm_model, MODEL_PATH)
joblib.dump(tfidf, VECTORIZER_PATH)

print("\nLinear SVM model and vectorizer saved successfully")


Dataset loaded successfully
Train/Test split completed
TF-IDF vectorization completed
Linear SVM training completed

Accuracy: 0.9323287198723753

Classification Report:
              precision    recall  f1-score   support

 non-suicide       0.93      0.94      0.93     23191
     suicide       0.94      0.92      0.93     23195

    accuracy                           0.93     46386
   macro avg       0.93      0.93      0.93     46386
weighted avg       0.93      0.93      0.93     46386


Linear SVM model and vectorizer saved successfully
