In [1]:
import re

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder



In [2]:
# Load the raw dataset; later cells read its `text` and `class` columns.
df = pd.read_csv('Suicide_Detection.csv')

In [3]:
# Encode the string labels in `class` as integer codes.
# The original author's note says this yields "suicide" → 1 and the other
# class → 0; inspect `label_encoder.classes_` to confirm the exact mapping.
label_encoder = LabelEncoder().fit(df['class'])
df['class'] = label_encoder.transform(df['class'])

# Quick look at the first few encoded labels
print(df['class'].head(5))

0    1
1    0
2    0
3    1
4    1
Name: class, dtype: int32


In [4]:
# Step 2️⃣: Convert text data into numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # Limit features to reduce memory usage
X = vectorizer.fit_transform(df["text"])  # Convert text to numeric TF-IDF vectors
y = df['class'].values

In [5]:
# Split Data into Training and Testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Step 4: Make sure both feature matrices are in compressed sparse row (CSR) form.
# NOTE(review): the TF-IDF output is presumably already CSR, in which case this
# is only a re-wrap — confirm before relying on it for memory savings.
X_train, X_test = csr_matrix(X_train), csr_matrix(X_test)

In [7]:
# Step 5: Wrap features and labels in XGBoost DMatrix containers; these are what
# the native `xgb.train` / `model.predict(dtest)` calls in later cells consume.
dtrain = xgb.DMatrix(data=X_train, label=y_train)  # training split
dtest = xgb.DMatrix(data=X_test, label=y_test)     # held-out split

In [8]:
# Hyperparameters for the native XGBoost API (passed to `xgb.train` in the next cell)
params = dict(
    objective='binary:logistic',  # binary classification
    eval_metric='logloss',        # log loss as the evaluation metric
    max_depth=3,
    eta=0.3,
    seed=42,
)


# The same settings through the scikit-learn wrapper; the wrapper calls the
# step size `learning_rate` rather than `eta`.
# NOTE: the following cell rebinds `model` to the result of `xgb.train`, so this
# fitted classifier is not the object that is evaluated or saved later.
model = xgb.XGBClassifier(
    objective=params['objective'],
    eval_metric=params['eval_metric'],
    max_depth=params['max_depth'],
    learning_rate=params['eta'],
    seed=params['seed'],
)

# Fit on the training features and labels
model.fit(X_train, y_train)

In [9]:


# Train with XGBoost's native `train` API.
# This rebinds `model` from the XGBClassifier fitted above to the object
# returned by `xgb.train` (a Booster).
model = xgb.train(params, dtrain, num_boost_round=100)

# Predict on the test set. A Booster's `predict` only accepts a DMatrix, so
# pass `dtest` rather than the raw sparse matrix `X_test` (which is rejected).
# With the binary:logistic objective the result is one probability per row,
# not a hard label.
y_pred = model.predict(dtest)
y_pred

In [10]:

# Score the held-out DMatrix with the trained model; the recorded output shows
# one float per row, which the next cell thresholds at 0.5.
y_pred = model.predict(data=dtest)
y_pred

array([0.94122016, 0.0821275 , 0.99589586, ..., 0.07197011, 0.9928508 ,
       0.17581794], dtype=float32)

**Accuracy Report for XGB**

In [11]:
# numpy and the sklearn metric helpers are already imported in the first cell,
# so they are not re-imported here.

# Convert predicted probabilities to hard 0/1 labels with a 0.5 cut-off
y_pred_labels = np.where(y_pred > 0.5, 1, 0)

# Evaluate against the held-out labels
accuracy = accuracy_score(y_test, y_pred_labels)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred_labels))


Accuracy: 0.91
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.94      0.91     23134
           1       0.93      0.87      0.90     23291

    accuracy                           0.91     46425
   macro avg       0.91      0.91      0.91     46425
weighted avg       0.91      0.91      0.91     46425



**Save the model & vectorizer using joblib**

In [12]:
# Save the trained model.
# NOTE(review): this pickles the in-memory model object via joblib; pickles are
# generally tied to the library versions that wrote them, so consider also
# exporting with the library's own save routine if the artifact must outlive
# this environment.
joblib.dump(model, "suicide_prediction_xgboost.pkl")

# Save the fitted TF-IDF vectorizer so inference can reuse the same vocabulary
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("Model saved successfully!")


Model saved successfully!


### GridSearchCV model selection for better accuracy

**Define the parameters for better tuning**

In [13]:
# Search space for the exhaustive grid search: 3 x 3 x 3 = 27 combinations
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
)

In [14]:
# Base classifier whose hyperparameters the grid search will vary
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', seed=42)

# Exhaustive search over `param_grid` with 3-fold cross-validation, scored by
# accuracy, running one fit at a time (n_jobs=1)
grid_search = GridSearchCV(xgb_clf, param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=1)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", grid_search.best_params_)


Fitting 3 folds for each of 27 candidates, totalling 81 fits


Best parameters found:  {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300}


**Find the best tuned model**

In [15]:
# Pull out the estimator that the grid search selected as best
best_model = grid_search.best_estimator_
print(best_model)

# Predict class labels for the held-out set
y_pred = best_model.predict(X_test)
print("\n", y_pred)


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.2, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=7, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=300, n_jobs=None,
              num_parallel_tree=None, ...)

 [1 0 1 ... 0 1 0]


**Accuracy Report for grid**
**Optional: if this model performs better than the previous one, save it and use it instead.**

In [16]:
# Fraction of held-out rows the grid-search model labels correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.93


In [17]:
# Normalise predictions to hard 0/1 labels. `best_model.predict` already returns
# class labels (see the printed array above), so the 0.5 threshold leaves them
# unchanged; it is kept so this cell mirrors the earlier evaluation cell.
y_pred_labels = np.where(y_pred > 0.5, 1, 0)

# Evaluate model — report on the labels computed above rather than leaving
# `y_pred_labels` unused
print("Classification Report:\n", classification_report(y_test, y_pred_labels))

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93     23134
           1       0.94      0.91      0.93     23291

    accuracy                           0.93     46425
   macro avg       0.93      0.93      0.93     46425
weighted avg       0.93      0.93      0.93     46425



In [18]:
# Save the tuned classifier selected by the grid search
joblib.dump(best_model, "suicide_prediction_xgboostgrid.pkl")

# Save the TF-IDF vectorizer (same object and same path as the earlier save cell)
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("Model saved successfully!")

Model saved successfully!


### RandomizedSearchCV model selection for better accuracy

**Define the parameters for better tuning**

In [19]:
# GPU-oriented search space for a randomized search.
# NOTE: `param_dist` is reassigned in a later cell before any search is run,
# so this definition is not the one the randomized search actually samples from.
param_dist = dict(
    max_depth=np.arange(3, 10, 2),               # tree depths 3, 5, 7, 9
    learning_rate=np.linspace(0.01, 0.3, num=10),  # 10 evenly spaced step sizes
    n_estimators=np.arange(50, 500, 50),         # 50, 100, ..., 450 boosting rounds
    subsample=[0.6, 0.8, 1.0],                   # row sampling ratio
    colsample_bytree=[0.6, 0.8, 1.0],            # feature sampling ratio per tree
    gamma=[0, 0.1, 0.2, 0.3],                    # minimum loss reduction to split
    reg_alpha=[0, 0.01, 0.1, 1.0],               # L1 regularization
    reg_lambda=[0, 0.01, 0.1, 1.0],              # L2 regularization
    tree_method=['gpu_hist'],                    # GPU histogram-based tree construction
    gpu_id=[0],                                  # index of the GPU to use
)


**define and find the model**

In [20]:
# Base estimator for the randomized search, trained on CPU.
# `use_label_encoder` is deliberately not passed: it is a deprecated flag that
# recent XGBoost releases no longer use (they only warn that it is an unused
# parameter), and the labels were already integer-encoded earlier in the notebook.
xgb_clf_rand = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    tree_method='hist'  # Use CPU-based histogram algorithm
)

In [21]:
# Candidate values the randomized search samples from.
# This rebinds `param_dist`, replacing the GPU-oriented definition from the
# earlier cell.
param_dist = dict(
    n_estimators=[100, 200, 300, 400, 500],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_depth=[3, 5, 7, 9],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

# 20 sampled configurations x 3-fold cross-validation, scored by accuracy,
# with fits spread over all CPU cores.
# NOTE(review): the recorded run reports 14 of 60 fits failing (scored as NaN);
# the traceback is truncated, so the cause is not visible here — rerun with
# error_score='raise' to diagnose.
random_search = RandomizedSearchCV(
    xgb_clf_rand,
    param_dist,
    n_iter=20,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
    random_state=42,
)

# Run the search on the training split
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


14 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ADMIN\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ADMIN\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "c:\Users\ADMIN\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 1682, in fit
    self._Booster = train(
  File "c:\Users\ADMIN\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.

In [22]:
# Report the winning configuration from the randomized search
print("Best parameters found: ", random_search.best_params_)

Best parameters found:  {'subsample': 0.8, 'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.2, 'colsample_bytree': 0.8}


**Find the best tuned model**

In [23]:
# Retrieve the best estimator found by RandomizedSearchCV
best_model2 = random_search.best_estimator_
print(best_model2)

# Predict class labels for the held-out set
y_pred = best_model2.predict(X_test)
print("\n", y_pred)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.2, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=500, n_jobs=None,
              num_parallel_tree=None, ...)

 [1 0 1 ... 0 1 0]


**Accuracy Report for random search**
**Optional: if this model performs better than the previous one, save it and use it instead.**

In [24]:
# Fraction of held-out rows the randomized-search model labels correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.93


In [25]:
# Normalise predictions to hard 0/1 labels. `best_model2.predict` already
# returns class labels (see the printed array above), so the 0.5 threshold
# leaves them unchanged; it is kept so this cell mirrors the earlier
# evaluation cell.
y_pred_labels = np.where(y_pred > 0.5, 1, 0)

# Evaluate model — report on the labels computed above rather than leaving
# `y_pred_labels` unused
print("Classification Report:\n", classification_report(y_test, y_pred_labels))

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.95      0.93     23134
           1       0.94      0.92      0.93     23291

    accuracy                           0.93     46425
   macro avg       0.93      0.93      0.93     46425
weighted avg       0.93      0.93      0.93     46425

