In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# import pandas as pd
# from distributed import Client, LocalCluster
# cluster = LocalCluster(memory_limit='8GB')
# client = Client(cluster)
import modin.pandas as pd
import modin.config as modin_cfg
modin_cfg.Engine.put("ray")  # Modin will use Ray
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [3]:
# kaggle_data = pd.read_csv('/content/drive/MyDrive/NUS_MSBA/BT5153_Final Group Project_Shared Folder/Data/kaggle_data.csv')
# wiki_data = pd.read_csv('/content/drive/MyDrive/NUS_MSBA/BT5153_Final Group Project_Shared Folder/Data/wiki_data.csv')
# essay = pd.read_csv('/content/drive/MyDrive/NUS_MSBA/BT5153_Final Group Project_Shared Folder/Data/competition_essay.csv')
# test_data = essay.drop(columns='Remark')

train_data_processed = pd.read_csv("train_preprocessed_v2.csv")
test_data_processed = pd.read_csv("new_essay_val_preprocessed.csv")
test_data_dep_processed = pd.read_csv("test_data_preprocessed.csv")

2024-05-05 15:20:33,235	INFO worker.py:1752 -- Started a local Ray instance.


In [4]:
# for each data, keep only text and label columns
train_data_processed = train_data_processed[['text', 'label']]
test_data_processed = test_data_processed[['text', 'label']]
test_data_dep_processed = test_data_dep_processed[['text', 'label']]

train_data_processed.head()

Unnamed: 0,text,label
0,school homework clubs become increasingly popu...,1.0
1,widely accepted knowledge great source power s...,1.0
2,first impressions great power shape interactio...,1.0
3,name address city state zip code email address...,1.0
4,limiting car usage numerous advantages benefit...,1.0


## Data Preprocessing

In [5]:
# Data Preprocessing
# !pip install --upgrade pip
# !pip install transformers
# !pip install ftfy
# !pip install ax-platform

In [6]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import utils
import gensim.models
from ftfy import fix_text
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jiajiazhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jiajiazhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
def data_preprocessing(df):
    # Remove rows with any missing values
    df = df.dropna()
    # Drop duplicates where both 'text' and 'label' are the same
    df = df.drop_duplicates(subset=['text', 'label'])
    # Drop all entries where 'text' appears more than once (across different labels)
    df = df.drop_duplicates(subset='text', keep=False)
    return df

In [8]:
train_data = data_preprocessing(train_data_processed)
test_data = data_preprocessing(test_data_processed)
test_dep_data = data_preprocessing(test_data_dep_processed)

In [9]:
# combine the text into corpus
df_list = [train_data, test_data]
text_corpus = pd.concat(df_list)
text_corpus.tail()

Unnamed: 0,text,label
35,wake mohamed bouazizis desperate act selfimmol...,1.0
36,utilitarianism emphasis maximizing overall uti...,1.0
37,rich countries long grappled question meaningf...,1.0
38,literature change anything question whether ti...,1.0
39,nurturing stability prosperity strengthening u...,1.0


In [10]:
# define meta feature function
class TextFeatureExtractor:
    def __init__(self):
        self.stopwords = set(stopwords.words('english'))

    def transform(self, df):
        # Store original columns to keep after transformation
        original_columns = df.columns.tolist()
        # Compute various text-related features
        df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))
        df['unique_word_count'] = df['text'].apply(lambda x: len(set(str(x).split())))
        df['%unique_word_total']= df['unique_word_count']/df['word_count']
        df['stop_word_count'] = df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in self.stopwords]))
        df['%stop_word_total']=df['stop_word_count']/df['word_count']
        df['mean_word_length'] = df['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
        df['char_count'] = df['text'].apply(lambda x: len(str(x)))
        df['mean_char_count_per_word']=df['char_count']/df['word_count']
        columns_to_keep = original_columns + ['%unique_word_total', '%stop_word_total','mean_word_length', 'mean_char_count_per_word']
        df = df[columns_to_keep]
        return df

In [11]:
meta_feature_list = ['word_count', 'unique_word_count', 'stop_word_count', 'url_count', 'mean_word_length', 'char_count']
text_feature_extractor = TextFeatureExtractor()

train_data = text_feature_extractor.transform(train_data)
test_data = text_feature_extractor.transform(test_data)
test_dep_data = text_feature_extractor.transform(test_dep_data)

In [12]:
def normalise_text(text):
    text = fix_text(text)
    text = text.lower()  # lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))    # remove punctuation
    text = re.sub(r'\s{2,}', ' ', text)   # replace more than or equal to two white spaces into one white space.
    return text

In [13]:
# fix text
text_corpus['text'] = text_corpus['text'].apply(lambda text: fix_text(text))
# normalise text
text_corpus['text'] = text_corpus['text'].apply(lambda text: normalise_text(text))

In [14]:
text_corpus.head()

Unnamed: 0,text,label
0,school homework clubs become increasingly popu...,1.0
1,widely accepted knowledge great source power s...,1.0
2,first impressions great power shape interactio...,1.0
3,name address city state zip code email address...,1.0
4,limiting car usage numerous advantages benefit...,1.0


## Word2Vec + Logistic Regression

In [15]:
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

# Splitting data back into training and test datasets
test_data = test_data  # already defined in the previous code
train_data.shape, test_data.shape
train_data.head()

Unnamed: 0,text,label,%unique_word_total,%stop_word_total,mean_word_length,mean_char_count_per_word
0,school homework clubs become increasingly popu...,1.0,0.625,0.0,6.8125,7.802083
1,widely accepted knowledge great source power s...,1.0,0.642857,0.0,6.980519,7.974026
2,first impressions great power shape interactio...,1.0,0.803738,0.0,7.168224,8.158879
3,name address city state zip code email address...,1.0,0.59292,0.0,7.274336,8.271386
4,limiting car usage numerous advantages benefit...,1.0,0.632867,0.0,7.202797,8.199301


In [16]:
# Tokenize the text in each dataset
train_data['tokenized'] = train_data['text'].apply(word_tokenize)
test_data['tokenized'] = test_data['text'].apply(word_tokenize)
test_dep_data['tokenized'] = test_dep_data['text'].apply(word_tokenize)

In [17]:
# Train a Word2Vec model
model_w2v = Word2Vec(sentences=train_data['tokenized'], vector_size=100, window=5, min_count=1, workers=6)

# Convert text to a mean vector
def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.index_to_key]
    if len(doc) == 0:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(word2vec_model.wv[doc], axis=0)

In [18]:
train_data['doc_vector'] = train_data['tokenized'].apply(lambda x: document_vector(model_w2v, x))
test_data['doc_vector'] = test_data['tokenized'].apply(lambda x: document_vector(model_w2v, x))
test_dep_data['doc_vector'] = test_dep_data['tokenized'].apply(lambda x: document_vector(model_w2v, x))
train_data.head()

# save the train and test data as pickle file
# train_data.to_pickle('train_data.pkl')
# test_data.to_pickle('test_data.pkl')

Unnamed: 0,text,label,%unique_word_total,%stop_word_total,mean_word_length,mean_char_count_per_word,tokenized,doc_vector
0,school homework clubs become increasingly popu...,1.0,0.625,0.0,6.8125,7.802083,"[school, homework, clubs, become, increasingly...","[0.57162076, -0.06686906, 0.72704774, 0.841681..."
1,widely accepted knowledge great source power s...,1.0,0.642857,0.0,6.980519,7.974026,"[widely, accepted, knowledge, great, source, p...","[0.6422472, -0.5788288, 1.2735754, 0.17931908,..."
2,first impressions great power shape interactio...,1.0,0.803738,0.0,7.168224,8.158879,"[first, impressions, great, power, shape, inte...","[0.5557356, -0.5109672, 1.1807613, -0.03543265..."
3,name address city state zip code email address...,1.0,0.59292,0.0,7.274336,8.271386,"[name, address, city, state, zip, code, email,...","[-0.67589957, 0.2583637, 0.5094211, -0.0832677..."
4,limiting car usage numerous advantages benefit...,1.0,0.632867,0.0,7.202797,8.199301,"[limiting, car, usage, numerous, advantages, b...","[0.59118575, 0.024846703, 0.9679171, 1.8717972..."


In [19]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.metrics import precision_recall_curve, precision_score, recall_score

# load pickles to save time from running all above code
# train_data = pd.read_pickle('train_data.pkl')
# test_data = pd.read_pickle('test_data.pkl')

# Extract feature vectors for training and testing
X_train = np.array(list(train_data['doc_vector']))
X_test = np.array(list(test_data['doc_vector']))
X_test_dep = np.array(list(test_dep_data['doc_vector']))
y_train = train_data['label'].values
y_test = test_data['label'].values
y_test_dep = test_dep_data['label'].values

In [20]:
# Train a logistic regression model
log_reg = LogisticRegression(max_iter=50000) # set max_iter=50000 as the model does not converge with default value
log_reg.fit(X_train, y_train)
import pickle

# save the model to disk
filename = 'logisticregression_model_trainv2.sav'
pickle.dump(log_reg, open(filename, 'wb'))

# Predictions and evaluations
y_pred = log_reg.predict(X_test)
print("Logistic Regression - Classification Report:")
print(classification_report(y_test, y_pred))
print("Logistic Regression - Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print('AUC:', roc_auc_score(y_test, y_pred))
# print precision, recall, and precision-recall curve
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)

# print("\n=========== test_dep =================\n")

# y_pred_dep = log_reg.predict(X_test_dep)
# print("Logistic Regression - Classification Report:")
# print(classification_report(y_test_dep, y_pred_dep))
# print("Logistic Regression - Confusion Matrix:")
# print(confusion_matrix(y_test_dep, y_pred_dep))
# print('AUC:', roc_auc_score(y_test_dep, y_pred_dep))


Logistic Regression - Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.45      0.55        20
           1       0.59      0.80      0.68        20

    accuracy                           0.62        40
   macro avg       0.64      0.62      0.61        40
weighted avg       0.64      0.62      0.61        40

Logistic Regression - Confusion Matrix:
[[ 9 11]
 [ 4 16]]
AUC: 0.625
Precision: 0.5925925925925926
Recall: 0.8


In [21]:
# # Train a LightGBM model
# # choose LightGBM over SVC & RandomForest as it is more scalable and faster to train
# import lightgbm as lgb
# lgbm = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.05, max_depth=-1)
# lgbm.fit(X_train, y_train)

# # Predictions and evaluations
# y_pred = lgbm.predict(X_test)
# print("LightGBM - Classification Report:")
# print(classification_report(y_test, y_pred))
# print("LightGBM - Confusion Matrix:")
# print(confusion_matrix(y_test, y_pred))
# print('AUC:', roc_auc_score(y_test, y_pred))

# print("\n=========== test_dep =================\n")

# y_pred_dep = lgbm.predict(X_test_dep)
# print("LightGBM - Classification Report:")
# print(classification_report(y_test_dep, y_pred_dep))
# print("LightGBM - Confusion Matrix:")
# print(confusion_matrix(y_test_dep, y_pred_dep))
# print('AUC:', roc_auc_score(y_test_dep, y_pred_dep))


In [22]:
# # Train a CatBoost model
# from catboost import CatBoostClassifier
# catboost = CatBoostClassifier(iterations=100, learning_rate=0.05, depth=5)
# catboost.fit(X_train, y_train)

# # Predictions and evaluations
# y_pred = catboost.predict(X_test)
# print("CatBoost - Classification Report:")
# print(classification_report(y_test, y_pred))
# print("CatBoost - Confusion Matrix:")
# print(confusion_matrix(y_test, y_pred))
# print('AUC:', roc_auc_score(y_test, y_pred))

# print("\n=========== test_dep =================\n")

# y_pred_dep = catboost.predict(X_test_dep)
# print("CatBoost - Classification Report:")
# print(classification_report(y_test_dep, y_pred_dep))
# print("CatBoost - Confusion Matrix:")
# print(confusion_matrix(y_test_dep, y_pred_dep))
# print('AUC:', roc_auc_score(y_test_dep, y_pred_dep))
