# **----- Import Libraries -----**

In [1]:
import pandas as pd
from tqdm import tqdm
import ast
import pickle

# import train test split
from sklearn.model_selection import train_test_split

# import the vectorizers, scaler, smote
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# import the model including logistic regression, decision tree, bagged decision tree, random forest, ada boost, gradient boost, xgboost, svm 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# import grid search cv
from sklearn.model_selection import GridSearchCV

# import make scorer
from sklearn.metrics import make_scorer

# import the metrics including accuracy score, precision score, recall score, f1 score, roc auc score, confusion matrix as well as classification report, roc curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve



# **----- Import Data -----**

In [2]:
#import the file "df_pantip_posts_suicide_labeled_processed.csv" from output folder
df_pantip_posts_suicide_labeled_processed = pd.read_csv('../output - final/df_pantip_posts_suicide_labeled_processed.csv')

# **----- Define X,y and Do Train-test Split -----**

In [3]:
# list columns
print(df_pantip_posts_suicide_labeled_processed.columns)

Index(['Unnamed: 0', 'index', 'title', 'url', 'label', 'tags', 'text',
       'profile', 'time', 'sub_label', 'all_text', 'tokenized_text',
       'day_week', 'day_month', 'month_year', 'year', 'time_day', 'title_len',
       'text_len', 'title_emoji', 'text_emoji'],
      dtype='object')


In [4]:
# check NaN
df_pantip_posts_suicide_labeled_processed.isnull().sum()

Unnamed: 0        0
index             0
title             0
url               0
label             0
tags              0
text              0
profile           0
time              0
sub_label         0
all_text          0
tokenized_text    0
day_week          0
day_month         0
month_year        0
year              0
time_day          0
title_len         0
text_len          0
title_emoji       0
text_emoji        0
dtype: int64

In [5]:
# Filter the DataFrame to include only rows where 'sub_label' is 11 or 12
filtered_df = df_pantip_posts_suicide_labeled_processed[df_pantip_posts_suicide_labeled_processed['sub_label'].isin([11, 12])]

# Define 'X' to contain only the 'sub_label' column of the filtered DataFrame
y = filtered_df[['sub_label']]

# Define 'y' to include the specified columns of the filtered DataFrame
X = filtered_df.drop(columns=['Unnamed: 0', 'tags', 'index', 'title', 'url', 'label', 'text','all_text',
       'profile', 'time', 'title_len','title_emoji' ])

# Print the columns of 'X' and 'y'
print("Columns in X:", X.columns)
print('shape in X:', X.shape)
print("Columns in y:", y.columns)
print('shape in y', y.shape)
print(y.value_counts())

Columns in X: Index(['sub_label', 'tokenized_text', 'day_week', 'day_month', 'month_year',
       'year', 'time_day', 'text_len', 'text_emoji'],
      dtype='object')
shape in X: (1261, 9)
Columns in y: Index(['sub_label'], dtype='object')
shape in y (1261, 1)
sub_label
11           779
12           482
Name: count, dtype: int64


In [6]:
# transform value in column 'sub_label'
# 11 = 1
# 12 = 0

#create mapping value
mapping_value = {11:1, 12:0}

#map value on column 'sub_label'(
y['sub_label'] = y['sub_label'].map(mapping_value)

#show result 
y['sub_label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['sub_label'] = y['sub_label'].map(mapping_value)


sub_label
1    779
0    482
Name: count, dtype: int64

In [7]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42, stratify= y)

# **----- Feature Engineering -----**

In [8]:
#Step 1: One-Hot Encoding for Categorical Data

cols = ['day_week', 'day_month', 'month_year', 'year', 'time_day','text_emoji']
X_train_categorical = pd.get_dummies(X_train[cols], columns=cols, drop_first=True, dtype=int)
X_test_categorical = pd.get_dummies(X_test[cols], columns=cols, drop_first=True, dtype=int)

print(X_train_categorical.columns)
print(type(X_train_categorical))
print(X_train_categorical.shape)

Index(['day_week_Monday', 'day_week_Saturday', 'day_week_Sunday',
       'day_week_Thursday', 'day_week_Tuesday', 'day_week_Wednesday',
       'day_month_2', 'day_month_3', 'day_month_4', 'day_month_5',
       'day_month_6', 'day_month_7', 'day_month_8', 'day_month_9',
       'day_month_10', 'day_month_11', 'day_month_12', 'day_month_13',
       'day_month_14', 'day_month_15', 'day_month_16', 'day_month_17',
       'day_month_18', 'day_month_19', 'day_month_20', 'day_month_21',
       'day_month_22', 'day_month_23', 'day_month_24', 'day_month_25',
       'day_month_26', 'day_month_27', 'day_month_28', 'day_month_29',
       'day_month_30', 'day_month_31', 'month_year_August',
       'month_year_December', 'month_year_February', 'month_year_January',
       'month_year_July', 'month_year_June', 'month_year_March',
       'month_year_May', 'month_year_November', 'month_year_October',
       'month_year_September', 'year_2017', 'year_2018', 'year_2019',
       'year_2020', 'year_2021', 'y

In [9]:
#Step 2: Vectorization of 'tokenized_text'

vectorizer = TfidfVectorizer()
X_train_text = vectorizer.fit_transform(X_train['tokenized_text'])
X_test_text = vectorizer.transform(X_test['tokenized_text'])

print(type(X_train_text))
print(X_train_text.shape)

print(type(X_test_text))
print(X_test_text.shape)

<class 'scipy.sparse._csr.csr_matrix'>
(1008, 4678)
<class 'scipy.sparse._csr.csr_matrix'>
(253, 4678)


In [10]:
# Step 3: Scaling of Numerical Data

scaler = StandardScaler()
X_train_numerical = scaler.fit_transform(X_train[['text_len']])
X_test_numerical = scaler.transform(X_test[['text_len']])

print(type(X_train_numerical))
X_train_numerical.shape

<class 'numpy.ndarray'>


(1008, 1)

In [11]:
# Step 4: Concatenate All Features for Training Data

X_train_all = pd.concat([
    pd.DataFrame(X_train_numerical, columns=['text_len'], index=X_train.index),
    pd.DataFrame(X_train_text.todense(), columns=vectorizer.get_feature_names_out(), index=X_train.index),
    X_train_categorical
], axis=1)

print(X_train_all.shape)


(1008, 4741)


In [12]:
# Step 4: Concatenate All Features for Test Data
X_test_all = pd.concat([
    pd.DataFrame(X_test_numerical, columns=['text_len'], index=X_test.index),
    pd.DataFrame(X_test_text.todense(), columns=vectorizer.get_feature_names_out(), index=X_test.index),
    X_test_categorical
], axis=1)

print(X_test_all.shape)

(253, 4741)


In [13]:
# Step 5: smote X_train and y_train
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train_all, y_train)

print(X_train_sm.shape)


(1246, 4741)


In [14]:
# Step 6: Checking and Renaming Duplicate Features

# Function to rename duplicate columns by adding a suffix
def rename_duplicates(df):
    cols = pd.Series(df.columns)
    for dup in cols[cols.duplicated()].unique():
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
    df.columns = cols
    return df

# Apply it to your training data
rename_duplicates(X_train_sm)

# If you're transforming your test data similarly, apply it there too
rename_duplicates(X_test_all)

Unnamed: 0,text_len,__,_อ,admid,adobe,advice,alert,all,alone,amilykatze,...,year_2022,year_2023,time_day_12-15,time_day_15-18,time_day_18-21,time_day_21-24,time_day_3-6,time_day_6-9,time_day_9-12,text_emoji_1
14461,0.285336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
23799,-0.525559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
23356,-0.373309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,1,0,0,0,0,0
7864,0.177768,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
14118,-0.558657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17989,-0.594237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
15986,0.392076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
8571,-0.474257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9171,-0.403925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0


## **Do the simple modeling & evalutaing to test computing time**

## Logistic Regression

In [15]:
# Run Logistic Regression Model to test compute time
lr = LogisticRegression(random_state=42, class_weight={1:1, 0:1})
lr.fit(X_train_sm, y_train_sm)
y_pred_lr = lr.predict(X_test_all)
y_pred_lr_proba = lr.predict_proba(X_test_all)[:,1]


  y = column_or_1d(y, warn=True)


In [16]:
# Set threshold for Logistic Regression

def get_pred_by_proba(proba, threshold=0.5):
    return [1 if p >= threshold else 0 for p in proba]


y_pred_lr = get_pred_by_proba(y_pred_lr_proba, threshold=0.5)

In [17]:
# evaluate the model both test and train model comparison for Logistic Regression
print('Logistic Regression')
print('ROC AUC score: ', roc_auc_score(y_test, y_pred_lr))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred_lr))
print('Classification Report: \n', classification_report(y_test, y_pred_lr))


Logistic Regression
ROC AUC score:  0.7517182130584195
Confusion Matrix: 
 [[ 65  32]
 [ 26 130]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.71      0.67      0.69        97
           1       0.80      0.83      0.82       156

    accuracy                           0.77       253
   macro avg       0.76      0.75      0.75       253
weighted avg       0.77      0.77      0.77       253



## Decision Tree

In [21]:
# Run Decision Tree Model

dt = DecisionTreeClassifier(class_weight={0:1, 1:3}, random_state=42)
dt.fit(X_train_sm, y_train_sm)
y_pred_dt = dt.predict(X_test_all)
y_pred_dt_proba = dt.predict_proba(X_test_all)[:,1]


In [22]:
# Set threshold

def get_pred_by_proba(proba, threshold=0.5):
    return [1 if p >= threshold else 0 for p in proba]

y_pred_dt = get_pred_by_proba(y_pred_dt_proba, threshold=0.1)

In [23]:
# evaluate the model both test and train model comparison

print('Decision Tree')
print('Accuracy score: ', accuracy_score(y_test, y_pred_dt))
print('Precision score: ', precision_score(y_test, y_pred_dt))
print('Recall score: ', recall_score(y_test, y_pred_dt))
print('F1 score: ', f1_score(y_test, y_pred_dt))
print('ROC AUC score: ', roc_auc_score(y_test, y_pred_dt))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred_dt))
print('Classification Report: \n', classification_report(y_test, y_pred_dt))


Decision Tree
Accuracy score:  0.8023715415019763
Precision score:  0.8486842105263158
Recall score:  0.8269230769230769
F1 score:  0.8376623376623378
ROC AUC score:  0.7949048374306106
Confusion Matrix: 
 [[ 74  23]
 [ 27 129]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.73      0.76      0.75        97
           1       0.85      0.83      0.84       156

    accuracy                           0.80       253
   macro avg       0.79      0.79      0.79       253
weighted avg       0.80      0.80      0.80       253

