# **----- Import Libraries -----**

In [4]:
import pandas as pd
from tqdm import tqdm
import ast

# import train test split
from sklearn.model_selection import train_test_split

# import the vectorizers, scaler, smote
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# import the model including logistic regression, decision tree, bagged decision tree, random forest, ada boost, gradient boost, xgboost, svm 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# import grid search cv
from sklearn.model_selection import GridSearchCV

# import make scorer
from sklearn.metrics import make_scorer

# import the metrics including accuracy score, precision score, recall score, f1 score, roc auc score, confusion matrix as well as classification report, roc curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve



# **----- Import Data -----**

In [5]:
#import the file "df_pantip_posts_suicide_labeled_processed.csv" from output folder
df_pantip_posts_suicide_labeled_processed = pd.read_csv('../output - final/df_pantip_posts_suicide_labeled_processed.csv')

# **----- Define X,y and Do Train-test Split -----**

In [6]:
# check column and data type of the dataframe df_pantip_posts_suicide_labeled_processed
print(df_pantip_posts_suicide_labeled_processed.dtypes)

Unnamed: 0         int64
index              int64
title             object
url               object
label              int64
tags              object
text              object
profile           object
time              object
sub_label          int64
all_text          object
tokenized_text    object
day_week          object
day_month          int64
month_year        object
year               int64
time_day          object
title_len          int64
text_len           int64
title_emoji        int64
text_emoji         int64
dtype: object


In [7]:
df_pantip_posts_suicide_labeled_processed.drop(columns=['Unnamed: 0'], inplace = True)

In [8]:
# check NaN
df_pantip_posts_suicide_labeled_processed.isnull().sum()

index             0
title             0
url               0
label             0
tags              0
text              0
profile           0
time              0
sub_label         0
all_text          0
tokenized_text    0
day_week          0
day_month         0
month_year        0
year              0
time_day          0
title_len         0
text_len          0
title_emoji       0
text_emoji        0
dtype: int64

In [9]:
# define X including time category feature, text numeric feature, text category features, tags, and tokenized text, and define y which is column 'label
y = df_pantip_posts_suicide_labeled_processed['label']
X = df_pantip_posts_suicide_labeled_processed.drop(columns=['index', 'title', 'url', 'label', 'text','all_text',
       'profile', 'time'])
print(X.columns)

Index(['tags', 'sub_label', 'tokenized_text', 'day_week', 'day_month',
       'month_year', 'year', 'time_day', 'title_len', 'text_len',
       'title_emoji', 'text_emoji'],
      dtype='object')


In [18]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42, stratify= y)

# **----- Feature Engineering -----**

In [19]:
#Step 1: One-Hot Encoding for Categorical Data

cols = ['day_week', 'day_month', 'month_year', 'year', 'time_day', 'title_emoji', 'text_emoji']
X_train_categorical = pd.get_dummies(X_train[cols], columns=cols, drop_first=True, dtype=int)
X_test_categorical = pd.get_dummies(X_test[cols], columns=cols, drop_first=True, dtype=int)

print(X_train_categorical.columns)
print(type(X_train_categorical))
print(X_train_categorical.shape)

Index(['day_week_Monday', 'day_week_Saturday', 'day_week_Sunday',
       'day_week_Thursday', 'day_week_Tuesday', 'day_week_Wednesday',
       'day_month_2', 'day_month_3', 'day_month_4', 'day_month_5',
       'day_month_6', 'day_month_7', 'day_month_8', 'day_month_9',
       'day_month_10', 'day_month_11', 'day_month_12', 'day_month_13',
       'day_month_14', 'day_month_15', 'day_month_16', 'day_month_17',
       'day_month_18', 'day_month_19', 'day_month_20', 'day_month_21',
       'day_month_22', 'day_month_23', 'day_month_24', 'day_month_25',
       'day_month_26', 'day_month_27', 'day_month_28', 'day_month_29',
       'day_month_30', 'day_month_31', 'month_year_August',
       'month_year_December', 'month_year_February', 'month_year_January',
       'month_year_July', 'month_year_June', 'month_year_March',
       'month_year_May', 'month_year_November', 'month_year_October',
       'month_year_September', 'year_2017', 'year_2018', 'year_2019',
       'year_2020', 'year_2021', 'y

In [20]:
# Step 2: Special Treatment for 'tags' Column

# Function to parse the string representation of a list
def parse_tags(tag_str):
    return ast.literal_eval(tag_str)

X_train_tag = pd.DataFrame(columns=["parsed_tags"])
X_test_tag = pd.DataFrame(columns=["parsed_tags"])

# Process 'tags' for both train and test sets
X_train_tag['parsed_tags'] = X_train['tags'].apply(parse_tags)
X_test_tag['parsed_tags'] = X_test['tags'].apply(parse_tags)

# Extract and flatten all tags from the training set
all_tags = set(tag for sublist in X_train_tag['parsed_tags'] for tag in sublist)

# Create one-hot encoding for each tag in both datasets
for tag in all_tags:
    X_train_tag[tag] = X_train_tag['parsed_tags'].apply(lambda tags: 1 if tag in tags else 0)
    X_test_tag[tag] = X_test_tag['parsed_tags'].apply(lambda tags: 1 if tag in tags else 0)

X_train_tag.drop(columns=['parsed_tags'], inplace=True)
X_test_tag.drop(columns=['parsed_tags'], inplace=True)

print(X_train_tag.columns)
print(X_train_tag.shape)

  X_train_tag[tag] = X_train_tag['parsed_tags'].apply(lambda tags: 1 if tag in tags else 0)
  X_test_tag[tag] = X_test_tag['parsed_tags'].apply(lambda tags: 1 if tag in tags else 0)
  X_train_tag[tag] = X_train_tag['parsed_tags'].apply(lambda tags: 1 if tag in tags else 0)
  X_test_tag[tag] = X_test_tag['parsed_tags'].apply(lambda tags: 1 if tag in tags else 0)
  X_train_tag[tag] = X_train_tag['parsed_tags'].apply(lambda tags: 1 if tag in tags else 0)
  X_test_tag[tag] = X_test_tag['parsed_tags'].apply(lambda tags: 1 if tag in tags else 0)
  X_train_tag[tag] = X_train_tag['parsed_tags'].apply(lambda tags: 1 if tag in tags else 0)
  X_test_tag[tag] = X_test_tag['parsed_tags'].apply(lambda tags: 1 if tag in tags else 0)
  X_train_tag[tag] = X_train_tag['parsed_tags'].apply(lambda tags: 1 if tag in tags else 0)
  X_test_tag[tag] = X_test_tag['parsed_tags'].apply(lambda tags: 1 if tag in tags else 0)
  X_train_tag[tag] = X_train_tag['parsed_tags'].apply(lambda tags: 1 if tag in tags else 0

Index(['แต่งนิทาน', 'สโมสรฟุตบอลอังกฤษ', 'ข้อเสนอแนะถึงพันทิป', 'นักศึกษา',
       'Joker (ภาพยนตร์)', 'รายการข่าว', 'ศีล 5', 'ติวเตอร์',
       'สหภาพแรงงาน (Trade Union)', 'วัตถุอันตราย',
       ...
       'ห้องทำงาน', 'แต่งหน้า', 'จังหวัดลพบุรี', 'ศิลปะประยุกต์ (Applied Art)',
       'โรคความดัน', 'มหาสติปัฏฐาน 4', 'ซีรีย์', 'กระต่าย', 'กรุงเทพมหานคร',
       'เรื่องเล่าจากผู้สูงอายุ'],
      dtype='object', length=941)
(19067, 941)


In [21]:
#Step 3: Vectorization of 'tokenized_text'

vectorizer = TfidfVectorizer()
X_train_text = vectorizer.fit_transform(X_train['tokenized_text'])
X_test_text = vectorizer.transform(X_test['tokenized_text'])

print(type(X_train_text))
print(X_train_text.shape)

print(type(X_test_text))
print(X_test_text.shape)

<class 'scipy.sparse._csr.csr_matrix'>
(19067, 17505)
<class 'scipy.sparse._csr.csr_matrix'>
(4767, 17505)


In [22]:
# Step 4: Scaling of Numerical Data

scaler = StandardScaler()
X_train_numerical = scaler.fit_transform(X_train[['title_len', 'text_len']])
X_test_numerical = scaler.transform(X_test[['title_len', 'text_len']])

print(type(X_train_numerical))
X_train_numerical.shape

<class 'numpy.ndarray'>


(19067, 2)

In [23]:
# Step 5: Concatenate All Features for Training Data

X_train_all = pd.concat([
    pd.DataFrame(X_train_numerical, columns=['title_len', 'text_len'], index=X_train.index),
    pd.DataFrame(X_train_text.todense(), columns=vectorizer.get_feature_names_out(), index=X_train.index),
    X_train_tag,
    X_train_categorical
], axis=1)

print(X_train_all.shape)


(19067, 18511)


In [24]:
# Step 5: Concatenate All Features for Test Data
X_test_all = pd.concat([
    pd.DataFrame(X_test_numerical, columns=['title_len', 'text_len'], index=X_test.index),
    pd.DataFrame(X_test_text.todense(), columns=vectorizer.get_feature_names_out(), index=X_test.index),
    X_test_tag,
    X_test_categorical
], axis=1)

print(X_test_all.shape)

(4767, 18511)


In [25]:
# Step 6: smote X_train and y_train
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train_all, y_train)

print(X_train_sm.shape)


(36116, 18511)


In [26]:
# Step 7: Checking and Renaming Duplicate Features

# Function to rename duplicate columns by adding a suffix
def rename_duplicates(df):
    cols = pd.Series(df.columns)
    for dup in cols[cols.duplicated()].unique():
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
    df.columns = cols
    return df

# Apply it to your training data
rename_duplicates(X_train_sm)

# If you're transforming your test data similarly, apply it there too
rename_duplicates(X_test_all)

Unnamed: 0,title_len,text_len,__,___,____,_____,________,_________,__________,____________,...,year_2023,time_day_12-15,time_day_15-18,time_day_18-21,time_day_21-24,time_day_3-6,time_day_6-9,time_day_9-12,title_emoji_1,text_emoji_1
22119,-0.486915,-0.526686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,1
13481,2.447432,1.550348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
5832,0.446741,0.141960,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
23105,0.357821,-0.154417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,1,0,0,0,0,0,0
21166,-0.220156,-0.707228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1802,0.446741,-0.763148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
1646,-0.531375,-0.402064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
10575,0.580120,0.093229,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
18182,0.446741,-0.607370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0


# **----- Modeling & Evaluating -----**

## Logistic Regression Modeling

In [27]:
# Run Logistic Regression Model to test compute time
lr = LogisticRegression(random_state=42, class_weight={0:1, 1:3})
lr.fit(X_train_sm, y_train_sm)
y_pred_lr = lr.predict(X_test_all)
y_pred_lr_proba = lr.predict_proba(X_test_all)[:,1]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
# Set threshold for Logistic Regression

def get_pred_by_proba(proba, threshold=0.5):
    return [1 if p >= threshold else 0 for p in proba]


y_pred_lr = get_pred_by_proba(y_pred_lr_proba, threshold=0.1)

In [33]:
# evaluate the model both test and train model comparison for Logistic Regression
print('Logistic Regression')
print('Accuracy score: ', accuracy_score(y_test, y_pred_lr))
print('Precision score: ', precision_score(y_test, y_pred_lr))
print('Recall score: ', recall_score(y_test, y_pred_lr))
print('F1 score: ', f1_score(y_test, y_pred_lr))
print('ROC AUC score: ', roc_auc_score(y_test, y_pred_lr))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred_lr))
print('Classification Report: \n', classification_report(y_test, y_pred_lr))


Logistic Regression
Accuracy score:  0.7860289490245438
Precision score:  0.1751269035532995
Recall score:  0.8214285714285714
F1 score:  0.28870292887029286
ROC AUC score:  0.8027408637873754
Confusion Matrix: 
 [[3540  975]
 [  45  207]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.99      0.78      0.87      4515
           1       0.18      0.82      0.29       252

    accuracy                           0.79      4767
   macro avg       0.58      0.80      0.58      4767
weighted avg       0.94      0.79      0.84      4767



## Decision Tree Modeling

In [34]:
# Run Decision Tree Model

dt = DecisionTreeClassifier(class_weight={0:1, 1:3}, random_state=42)
dt.fit(X_train_sm, y_train_sm)
y_pred_dt = dt.predict(X_test_all)
y_pred_dt_proba = dt.predict_proba(X_test_all)[:,1]


In [35]:
# Set threshold

def get_pred_by_proba(proba, threshold=0.5):
    return [1 if p >= threshold else 0 for p in proba]

y_pred_dt = get_pred_by_proba(y_pred_dt_proba, threshold=0.1)

In [36]:
# evaluate the model both test and train model comparison

print('Decision Tree')
print('Accuracy score: ', accuracy_score(y_test, y_pred_dt))
print('Precision score: ', precision_score(y_test, y_pred_dt))
print('Recall score: ', recall_score(y_test, y_pred_dt))
print('F1 score: ', f1_score(y_test, y_pred_dt))
print('ROC AUC score: ', roc_auc_score(y_test, y_pred_dt))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred_dt))
print('Classification Report: \n', classification_report(y_test, y_pred_dt))


Decision Tree
Accuracy score:  0.9053912313824208
Precision score:  0.26697892271662765
Recall score:  0.4523809523809524
F1 score:  0.3357879234167894
ROC AUC score:  0.6915282392026578
Confusion Matrix: 
 [[4202  313]
 [ 138  114]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.93      0.95      4515
           1       0.27      0.45      0.34       252

    accuracy                           0.91      4767
   macro avg       0.62      0.69      0.64      4767
weighted avg       0.93      0.91      0.92      4767

