# **----- Import Libraries -----**

In [1]:
import pandas as pd
import ast, json

# import the libraries for importing the model 
import pickle

# import train test split
from sklearn.model_selection import train_test_split

# import the vectorizers, scaler, smote
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# import the model including logistic regression, decision tree, bagged decision tree, random forest, ada boost, gradient boost, xgboost, svm 
from sklearn.linear_model import LogisticRegression

# import the metrics including accuracy score, precision score, recall score, f1 score, roc auc score, confusion matrix as well as classification report, roc curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve



# **----- Import Model -----**

In [2]:
#import the file '../model/pantip_post_model_train_simplified_label_lr.pkl', so we can use the model to predict the new data
with open('../model/pantip_post_model_train_simplified_label_lr.pkl', 'rb') as file:
    pantip_post_model_train_simplified_label_lr = pickle.load(file)

# import the file '../model/pantip_post_model_train_simplified_label_vectorizer.pkl', so we can use the vectorizer to transform the new data
with open('../model/pantip_post_model_train_simplified_label_vectorizer.pkl', 'rb') as file:
    pantip_post_model_train_simplified_label_vectorizer = pickle.load(file)

# import the file '../model/pantip_post_model_train_simplified_label_scaler.pkl', so we can use the scaler to transform the new data
with open('../model/pantip_post_model_train_simplified_label_scaler.pkl', 'rb') as file:
    pantip_post_model_train_simplified_label_scaler = pickle.load(file)

# **----- Import Data -----**

In [3]:
#import the data from '../output - final/df_youtube_comment_suicide_labeled_processed.csv', so we can use the model to predict the new data
df_youtube_comment_suicide_labeled_processed = pd.read_csv('../output - final/df_youtube_comment_suicide_labeled_processed.csv')

# import the data from '../model/X_train_sm_columns.csv', so we can use the data in the dataframe to match the X_sm and use the model 'pantip_post_model_train_simplified_label_lr.pkl' to predict
with open('../model/X_train_sm_label_columns.json') as f:
    X_train_sm_columns = json.load(f)

# **----- Define X,y  -----**

In [4]:
# list the column of the dataframe df_youtube_comment_suicide_labeled_processed 
print(df_youtube_comment_suicide_labeled_processed.shape)
print(df_youtube_comment_suicide_labeled_processed.columns)
print(df_youtube_comment_suicide_labeled_processed['label'].value_counts())
print(df_youtube_comment_suicide_labeled_processed['sub_label'].value_counts())

(13360, 15)
Index(['Unnamed: 0', 'videoId', 'title', 'comment', 'date', 'label',
       'sub_label', 'tokenized_text', 'day_week', 'day_month', 'month_year',
       'year', 'time_day', 'text_len', 'text_emoji'],
      dtype='object')
label
0    12883
1      477
Name: count, dtype: int64
sub_label
0     12883
12      271
11      206
Name: count, dtype: int64


In [5]:
# Define X,y which include the following columns ['tokenized_text', 'day_week', 'day_month', 'month_year', 'year', 'time_day', 'text_len', 'text_emoji']

X = df_youtube_comment_suicide_labeled_processed[['tokenized_text', 'day_week', 'day_month', 'month_year', 'year', 'time_day', 'text_len', 'text_emoji']]
y = df_youtube_comment_suicide_labeled_processed['label']

# Check X,y
print(X.shape)
print(y.shape)
print(y.head())
X.head()


(13360, 8)
(13360,)
0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64


Unnamed: 0,tokenized_text,day_week,day_month,month_year,year,time_day,text_len,text_emoji
0,['ขอบคุณ'],Sunday,26,November,2023,9-12,10,0
1,"['อย่า', 'ยึด', 'ติดกับ', 'ความดี', 'ตีเส้น', ...",Friday,10,November,2023,21-24,130,0
2,['ขอบคุณ'],Thursday,7,September,2023,9-12,14,1
3,"['อีฟ', 'อ่านใจ']",Monday,7,August,2023,3-6,37,0
4,['ขอบคุณ'],Sunday,23,July,2023,3-6,13,0


# **----- Feature Engineering  -----**

In [6]:
#Step 1: One-Hot Encoding for Categorical Data

cols = ['day_week', 'day_month', 'month_year', 'year', 'time_day', 'text_emoji']
X_categorical = pd.get_dummies(X[cols], columns=cols, drop_first=True, dtype=int)

print(X_categorical.columns)
print(type(X_categorical))
print(X_categorical.shape)

Index(['day_week_Monday', 'day_week_Saturday', 'day_week_Sunday',
       'day_week_Thursday', 'day_week_Tuesday', 'day_week_Wednesday',
       'day_month_2', 'day_month_3', 'day_month_4', 'day_month_5',
       'day_month_6', 'day_month_7', 'day_month_8', 'day_month_9',
       'day_month_10', 'day_month_11', 'day_month_12', 'day_month_13',
       'day_month_14', 'day_month_15', 'day_month_16', 'day_month_17',
       'day_month_18', 'day_month_19', 'day_month_20', 'day_month_21',
       'day_month_22', 'day_month_23', 'day_month_24', 'day_month_25',
       'day_month_26', 'day_month_27', 'day_month_28', 'day_month_29',
       'day_month_30', 'day_month_31', 'month_year_August',
       'month_year_December', 'month_year_February', 'month_year_January',
       'month_year_July', 'month_year_June', 'month_year_March',
       'month_year_May', 'month_year_November', 'month_year_October',
       'month_year_September', 'year_2022', 'year_2023', 'time_day_12-15',
       'time_day_15-18', 'time

In [7]:
#Step 2: use the imported vectorizer to vecdtorize the data

X_text = pantip_post_model_train_simplified_label_vectorizer.transform(X['tokenized_text'])

print(type(X_text))
X_text.shape

<class 'scipy.sparse._csr.csr_matrix'>


(13360, 17505)

In [8]:
# Step 3: use the imported scaler to scaling the data

X_numerical = pantip_post_model_train_simplified_label_scaler.transform(X[['text_len']])

print(type(X_numerical))
X_numerical.shape

<class 'numpy.ndarray'>


(13360, 1)

In [9]:
# Step 4: Concatenate All Features for Training Data

X_all = pd.concat([
    pd.DataFrame(X_numerical, columns=['text_len'], index=X.index),
    pd.DataFrame(X_text.todense(), columns=pantip_post_model_train_simplified_label_vectorizer.get_feature_names_out(), index=X.index),
    X_categorical
], axis=1)

print(X_all.shape)

(13360, 17563)


In [10]:
# Step 5: Checking and Renaming Duplicate Features

# Function to rename duplicate columns by adding a suffix
def rename_duplicates(df):
    cols = pd.Series(df.columns)
    for dup in cols[cols.duplicated()].unique():
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
    df.columns = cols
    return df

# Apply it to your training data
rename_duplicates(X_all)

# check the shape of X_all
print(X_all.shape)

(13360, 17563)


In [11]:
# step 6: match the X_all columns with X_train_sm_columns, 
# so we can use X_all to predict the new data, using the trained model 'pantip_post_model_train_simplified_label_lr.pkl'

# check the shape of X_sm and X_train_sm_columns
print(X_all.shape)
X_train_sm_columns

(13360, 17563)


['text_len',
 '__',
 '___',
 '____',
 '_____',
 '________',
 '_________',
 '__________',
 '____________',
 '__________________',
 '________________________',
 '_____________________________________',
 '______________________________________',
 '___________________________________________',
 '____________________________________________',
 '____________________________________________________',
 '______________________________________________________',
 '______________________________________________________________',
 '_______________________________________________________________',
 '______________________________________________________________________',
 '____________________________________________________________________________________',
 '_____________________________________________________________________________________________',
 '____________________________________________________________________________________________________',
 '________________________________________

In [12]:
# match the X_all.columns with columns_names_list 
# because columns_names_list is the list of columns of X_train_sm_columns which is used to train the model 'pantip_post_model_train_simplified_label_lr.pkl'
# column_names_list is the feature list of the trained model 'pantip_post_model_train_simplified_label_lr.pkl'
# x_all number of columns should the same as column_names_list's length

# 1. Adding missing columns from column_names_list to X_sm and imputing with 0
for col in X_train_sm_columns:
    if col not in X_all.columns:
        X_all[col] = 0

# 2. Dropping columns in X_sm that are not in column_names_list
X_all = X_all[X_train_sm_columns]

# This results in X_sm having the same columns as column_names_list, in the same order.
# Missing columns are added and imputed with 0, and extra columns are dropped.

# check the shape of X_all
print(X_all.shape)

(13360, 17568)


In [13]:
# check whether all column names in X_sm are strings. If all are strings, it will return true
print(X_all.columns.dtype == 'object')

True


In [14]:
X_all

Unnamed: 0,text_len,__,___,____,_____,________,_________,__________,____________,__________________,...,year_2022,year_2023,time_day_12-15,time_day_15-18,time_day_18-21,time_day_21-24,time_day_3-6,time_day_6-9,time_day_9-12,text_emoji_1
0,-0.768740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
1,-0.672877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,1,0,0,0,0
2,-0.765545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,1
3,-0.747171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,0,0,0
4,-0.766343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13355,-0.679268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,1,0,0,0,0
13356,-0.517898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,1,0,0,0,0
13357,-0.541864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
13358,-0.211935,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,1,0,0,0,0


# **----- Use the trained model to test the data -----**

In [15]:
# the trained model to test the processed data X_sm and y_sm
y_pred = pantip_post_model_train_simplified_label_lr.predict(X_all)



In [16]:
#evaluate the test model using the metrics including accuracy score, precision score, recall score, f1 score, roc auc score, confusion matrix as well as classification report, roc curve
print('accuracy score: ', accuracy_score(y, y_pred))
print('precision score: ', precision_score(y, y_pred))
print('recall score: ', recall_score(y, y_pred))
print('f1 score: ', f1_score(y, y_pred))
print('roc auc score: ', roc_auc_score(y, y_pred))
print('confusion matrix: ', confusion_matrix(y, y_pred))
print('classification report: ', classification_report(y, y_pred))
print('roc curve: ', roc_curve(y, y_pred))


accuracy score:  0.9449850299401198
precision score:  0.3379396984924623
recall score:  0.5639412997903563
f1 score:  0.42262372348782407
roc auc score:  0.7615173393308686
confusion matrix:  [[12356   527]
 [  208   269]]
classification report:                precision    recall  f1-score   support

           0       0.98      0.96      0.97     12883
           1       0.34      0.56      0.42       477

    accuracy                           0.94     13360
   macro avg       0.66      0.76      0.70     13360
weighted avg       0.96      0.94      0.95     13360

roc curve:  (array([0.        , 0.04090662, 1.        ]), array([0.       , 0.5639413, 1.       ]), array([inf,  1.,  0.]))
