# **----- Import Libraries -----**

In [1]:
import pandas as pd
import ast, json

# import the libraries for importing the model 
import pickle

# import train test split
from sklearn.model_selection import train_test_split

# import the vectorizers, scaler, smote
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# import the model including logistic regression, decision tree, bagged decision tree, random forest, ada boost, gradient boost, xgboost, svm 
from sklearn.linear_model import LogisticRegression

# import the metrics including accuracy score, precision score, recall score, f1 score, roc auc score, confusion matrix as well as classification report, roc curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve



# **----- Import Model -----**

In [2]:
#import the file '../model/pantip_post_model_train_simplified_label_lr.pkl', so we can use the model to predict the new data
with open('../model/pantip_post_model_train_simplified_sublabel_lr.pkl', 'rb') as file:
    pantip_post_model_train_simplified_sublabel_lr = pickle.load(file)

# import the file '../model/pantip_post_model_train_simplified_label_vectorizer.pkl', so we can use the vectorizer to transform the new data
with open('../model/pantip_post_model_train_simplified_sublabel_vectorizer.pkl', 'rb') as file:
    pantip_post_model_train_simplified_sublabel_vectorizer = pickle.load(file)

# import the file '../model/pantip_post_model_train_simplified_label_scaler.pkl', so we can use the scaler to transform the new data
with open('../model/pantip_post_model_train_simplified_sublabel_scaler.pkl', 'rb') as file:
    pantip_post_model_train_simplified_sublabel_scaler = pickle.load(file)

# **----- Import Data -----**

In [3]:
#import the data from '../output - final/df_youtube_comment_suicide_labeled_processed.csv', so we can use the model to predict the new data
df_youtube_comment_suicide_labeled_processed = pd.read_csv('../output - final/df_youtube_comment_suicide_labeled_processed.csv')

# import the data from '../model/X_train_sm_columns.json', so we can use the data in the dataframe to match the X_sm and use the model 'pantip_post_model_train_simplified_label_lr.pkl' to predict
with open('../model/X_train_sm_sublabel_columns.json') as f:
    X_train_sm_columns = json.load(f)

# **----- Define X,y -----**

In [4]:
# list the column of the dataframe df_youtube_comment_suicide_labeled_processed 
print(df_youtube_comment_suicide_labeled_processed.shape)
print(df_youtube_comment_suicide_labeled_processed.columns)
df_youtube_comment_suicide_labeled_processed['sub_label'].value_counts()

(13360, 15)
Index(['Unnamed: 0', 'videoId', 'title', 'comment', 'date', 'label',
       'sub_label', 'tokenized_text', 'day_week', 'day_month', 'month_year',
       'year', 'time_day', 'text_len', 'text_emoji'],
      dtype='object')


sub_label
0     12883
12      271
11      206
Name: count, dtype: int64

In [5]:
# Filter the DataFrame to include only rows where 'sub_label' is 11 or 12
filtered_df = df_youtube_comment_suicide_labeled_processed[df_youtube_comment_suicide_labeled_processed['sub_label'].isin([11, 12])]

In [6]:
# Define X,y which include the following columns ['tokenized_text', 'day_week', 'day_month', 'month_year', 'year', 'time_day', 'text_len', 'text_emoji']

X = filtered_df[['tokenized_text', 'day_week', 'day_month', 'month_year', 'year', 'time_day', 'text_len', 'text_emoji']]
y = filtered_df['sub_label']

# Check X,y
print(X.shape)
print(y.shape)
print(y.head())
X.head()

(477, 8)
(477,)
12883    11
12884    11
12885    11
12886    11
12887    11
Name: sub_label, dtype: int64


Unnamed: 0,tokenized_text,day_week,day_month,month_year,year,time_day,text_len,text_emoji
12883,"['แม่', 'เหนื่อย', 'อารมณ์', 'แม่', 'คนใน', 'ค...",Sunday,30,October,2022,21-24,187,0
12884,"['ตาย', 'มาด', 'ตาย', 'เบื่อ', 'ครอบครัว', 'เบ...",Thursday,16,November,2023,18-21,109,0
12885,"['วิธี', 'รักษา', 'จำเป็นต้อง', 'คุย', 'โทรศัพ...",Thursday,15,June,2023,21-24,2151,0
12886,"['อันดับ', 'กระเพาะ', 'ท้องอืด', 'อาหาร', 'เปล...",Saturday,5,February,2022,15-18,201,0
12887,"['ขอบคุณ', 'รับฟัง', 'ช่อง', 'คิดสั้น']",Sunday,25,December,2022,9-12,51,0


# **----- Feature Engineering -----**

In [7]:
#Step 1: One-Hot Encoding for Categorical Data

cols = ['day_week', 'day_month', 'month_year', 'year', 'time_day', 'text_emoji']
X_categorical = pd.get_dummies(X[cols], columns=cols, drop_first=True, dtype=int)

print(X_categorical.columns)
print(type(X_categorical))
print(X_categorical.shape)

Index(['day_week_Monday', 'day_week_Saturday', 'day_week_Sunday',
       'day_week_Thursday', 'day_week_Tuesday', 'day_week_Wednesday',
       'day_month_2', 'day_month_3', 'day_month_4', 'day_month_5',
       'day_month_6', 'day_month_7', 'day_month_8', 'day_month_9',
       'day_month_10', 'day_month_11', 'day_month_12', 'day_month_13',
       'day_month_14', 'day_month_15', 'day_month_16', 'day_month_17',
       'day_month_18', 'day_month_19', 'day_month_20', 'day_month_21',
       'day_month_22', 'day_month_23', 'day_month_24', 'day_month_25',
       'day_month_26', 'day_month_27', 'day_month_28', 'day_month_29',
       'day_month_30', 'day_month_31', 'month_year_August',
       'month_year_December', 'month_year_February', 'month_year_January',
       'month_year_July', 'month_year_June', 'month_year_March',
       'month_year_May', 'month_year_November', 'month_year_October',
       'month_year_September', 'year_2022', 'year_2023', 'time_day_12-15',
       'time_day_15-18', 'time

In [8]:
#Step 2: use the imported vectorizer to vecdtorize the data

X_text = pantip_post_model_train_simplified_sublabel_vectorizer.transform(X['tokenized_text'])

print(type(X_text))
X_text.shape

<class 'scipy.sparse._csr.csr_matrix'>


(477, 4678)

In [9]:
# Step 3: use the imported scaler to scaling the data

X_numerical = pantip_post_model_train_simplified_sublabel_scaler.transform(X[['text_len']])

print(type(X_numerical))
X_numerical.shape

<class 'numpy.ndarray'>


(477, 1)

In [10]:
# Step 4: Concatenate All Features for Training Data

X_all = pd.concat([
    pd.DataFrame(X_numerical, columns=['text_len'], index=X.index),
    pd.DataFrame(X_text.todense(), columns=pantip_post_model_train_simplified_sublabel_vectorizer.get_feature_names_out(), index=X.index),
    X_categorical
], axis=1)

print(X_all.shape)


(477, 4736)


In [11]:
# Step 5: Checking and Renaming Duplicate Features

# Function to rename duplicate columns by adding a suffix
def rename_duplicates(df):
    cols = pd.Series(df.columns)
    for dup in cols[cols.duplicated()].unique():
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
    df.columns = cols
    return df

# Apply it to your training data
rename_duplicates(X_all)

# check the shape of X_all
print(X_all.shape)

(477, 4736)


In [12]:
# step 6: match the X_all columns with X_train_sm_columns, 
# so we can use X_all to predict the new data, using the trained model 'pantip_post_model_train_simplified_label_lr.pkl'

# check the shape of X_sm and X_train_sm_columns
print(X_all.shape)
len(X_train_sm_columns)

(477, 4736)


4742

In [13]:
# match the X_all.columns with columns_names_list 
# because columns_names_list is the list of columns of X_train_sm_columns which is used to train the model 'pantip_post_model_train_simplified_label_lr.pkl'
# column_names_list is the feature list of the trained model 'pantip_post_model_train_simplified_label_lr.pkl'
# x_all number of columns should the same as column_names_list's length

# 1. Adding missing columns from column_names_list to X_sm and imputing with 0
for col in X_train_sm_columns:
    if col not in X_all.columns:
        X_all[col] = 0

# 2. Dropping columns in X_sm that are not in column_names_list
X_all = X_all[X_train_sm_columns]

# This results in X_sm having the same columns as column_names_list, in the same order.
# Missing columns are added and imputed with 0, and extra columns are dropped.

# check the shape of X_all
print(X_all.shape)

(477, 4742)


In [14]:
# check whether all column names in X_sm are strings. If all are strings, it will return true
print(X_all.columns.dtype == 'object')

True


In [15]:
# check X_all data
X_all

Unnamed: 0,text_len,__,_อ,admid,adobe,advice,alert,all,alone,amilykatze,...,year_2023,time_day_12-15,time_day_15-18,time_day_18-21,time_day_21-24,time_day_3-6,time_day_6-9,time_day_9-12,title_emoji_1,text_emoji_1
12883,-0.593409,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
12884,-0.657950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,1,0,0,0,0,0,0
12885,1.031690,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,1,0,0,0,0,0
12886,-0.581825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
12887,-0.705942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13355,-0.647193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
13356,-0.480049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
13357,-0.504873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
13358,-0.163139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,1,0,0,0,0,0


In [16]:
y.head()

12883    11
12884    11
12885    11
12886    11
12887    11
Name: sub_label, dtype: int64

In [17]:
# step 7: transform y

# transform value in column 'sub_label'
# 11 = 1
# 12 = 0

#create mapping value
mapping_value = {11:1, 12:0}

#map value on column 'sub_label'
y = y.map(mapping_value)

#show result 
y.value_counts()

sub_label
0    271
1    206
Name: count, dtype: int64

# **----- Use the trained model to test the data -----**

In [18]:
# the trained model to test the processed data X_sm and y_sm
y_pred = pantip_post_model_train_simplified_sublabel_lr.predict(X_all)

In [19]:
#evaluate the test model using the metrics including accuracy score, precision score, recall score, f1 score, roc auc score, confusion matrix as well as classification report, roc curve
print('accuracy score: ', accuracy_score(y, y_pred))
print('precision score: ', precision_score(y, y_pred))
print('recall score: ', recall_score(y, y_pred))
print('f1 score: ', f1_score(y, y_pred))
print('roc auc score: ', roc_auc_score(y, y_pred))
print('confusion matrix: ', confusion_matrix(y, y_pred))
print('classification report: ', classification_report(y, y_pred))


accuracy score:  0.7651991614255765
precision score:  0.7175925925925926
recall score:  0.7524271844660194
f1 score:  0.7345971563981043
roc auc score:  0.763667466771755
confusion matrix:  [[210  61]
 [ 51 155]]
classification report:                precision    recall  f1-score   support

           0       0.80      0.77      0.79       271
           1       0.72      0.75      0.73       206

    accuracy                           0.77       477
   macro avg       0.76      0.76      0.76       477
weighted avg       0.77      0.77      0.77       477

