In [10]:
# from google.colab import drive
# drive.mount('/content/drive')

In [11]:
# import pandas as pd
# from distributed import Client, LocalCluster
# cluster = LocalCluster(memory_limit='8GB')
# client = Client(cluster)
import modin.pandas as pd
import modin.config as modin_cfg
modin_cfg.Engine.put("ray")  # Modin will use Ray
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [12]:
# kaggle_data = pd.read_csv('/content/drive/MyDrive/NUS_MSBA/BT5153_Final Group Project_Shared Folder/Data/kaggle_data.csv')
# wiki_data = pd.read_csv('/content/drive/MyDrive/NUS_MSBA/BT5153_Final Group Project_Shared Folder/Data/wiki_data.csv')
# essay = pd.read_csv('/content/drive/MyDrive/NUS_MSBA/BT5153_Final Group Project_Shared Folder/Data/competition_essay.csv')
# test_data = essay.drop(columns='Remark')

train_data_processed = pd.read_csv("kaggle_preprocessed.csv")
test_data_processed = pd.read_csv("new_essay_val_preprocessed.csv")
test_data_dep_processed = pd.read_csv("test_data_preprocessed.csv")

In [13]:
# for each data, keep only text and label columns
train_data_processed = train_data_processed[['text', 'label']]
test_data_processed = test_data_processed[['text', 'label']]
test_data_dep_processed = test_data_dep_processed[['text', 'label']]

train_data_processed.head()

Unnamed: 0,text,label
0,carfree cities become subject increasing inter...,1
1,car free cities carfree cities concept gaining...,1
2,sustainable urban future carfree cities emergi...,1
3,pioneering sustainable urban living era marked...,1
4,path sustainable urban living age rapid urbani...,1


## Data Preprocessing

In [14]:
# Data Preprocessing
# !pip install --upgrade pip
# !pip install transformers
# !pip install ftfy
# !pip install ax-platform



In [15]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import utils
import gensim.models
from ftfy import fix_text
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/senyuuri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/senyuuri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
def data_preprocessing(df):
    # Remove rows with any missing values
    df = df.dropna()
    # Drop duplicates where both 'text' and 'label' are the same
    df = df.drop_duplicates(subset=['text', 'label'])
    # Drop all entries where 'text' appears more than once (across different labels)
    df = df.drop_duplicates(subset='text', keep=False)
    return df

In [17]:
train_data = data_preprocessing(train_data_processed)
test_data = data_preprocessing(test_data_processed)
test_dep_data = data_preprocessing(test_data_dep_processed)

In [18]:
# combine the text into corpus
df_list = [train_data, test_data]
text_corpus = pd.concat(df_list)
text_corpus.tail()

Unnamed: 0,text,label
35,wake mohamed bouazizis desperate act selfimmol...,1
36,utilitarianism emphasis maximizing overall uti...,1
37,rich countries long grappled question meaningf...,1
38,literature change anything question whether ti...,1
39,nurturing stability prosperity strengthening u...,1


In [19]:
# define meta feature function
class TextFeatureExtractor:
    def __init__(self):
        self.stopwords = set(stopwords.words('english'))

    def transform(self, df):
        # Store original columns to keep after transformation
        original_columns = df.columns.tolist()
        # Compute various text-related features
        df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))
        df['unique_word_count'] = df['text'].apply(lambda x: len(set(str(x).split())))
        df['%unique_word_total']= df['unique_word_count']/df['word_count']
        df['stop_word_count'] = df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in self.stopwords]))
        df['%stop_word_total']=df['stop_word_count']/df['word_count']
        df['mean_word_length'] = df['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
        df['char_count'] = df['text'].apply(lambda x: len(str(x)))
        df['mean_char_count_per_word']=df['char_count']/df['word_count']
        columns_to_keep = original_columns + ['%unique_word_total', '%stop_word_total','mean_word_length', 'mean_char_count_per_word']
        df = df[columns_to_keep]
        return df

In [21]:
meta_feature_list = ['word_count', 'unique_word_count', 'stop_word_count', 'url_count', 'mean_word_length', 'char_count']
text_feature_extractor = TextFeatureExtractor()

train_data = text_feature_extractor.transform(train_data)
test_data = text_feature_extractor.transform(test_data)
test_dep_data = text_feature_extractor.transform(test_dep_data)

In [22]:
def normalise_text(text):
    text = fix_text(text)
    text = text.lower()  # lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))    # remove punctuation
    text = re.sub(r'\s{2,}', ' ', text)   # replace more than or equal to two white spaces into one white space.
    return text

In [23]:
# fix text
text_corpus['text'] = text_corpus['text'].apply(lambda text: fix_text(text))
# normalise text
text_corpus['text'] = text_corpus['text'].apply(lambda text: normalise_text(text))

In [24]:
text_corpus.head()

Unnamed: 0,text,label
0,carfree cities become subject increasing inter...,1
1,car free cities carfree cities concept gaining...,1
2,sustainable urban future carfree cities emergi...,1
3,pioneering sustainable urban living era marked...,1
4,path sustainable urban living age rapid urbani...,1


## Word2Vec + Logistic Regression

In [25]:
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

# Splitting data back into training and test datasets
test_data = test_data  # already defined in the previous code
train_data.shape, test_data.shape
train_data.head()

Unnamed: 0,text,label,%unique_word_total,%stop_word_total,mean_word_length,mean_char_count_per_word
0,carfree cities become subject increasing inter...,1,0.609499,0.0,7.627968,8.62533
1,car free cities carfree cities concept gaining...,1,0.652542,0.0,7.632768,8.629944
2,sustainable urban future carfree cities emergi...,1,0.605042,0.0,7.77591,8.773109
3,pioneering sustainable urban living era marked...,1,0.647399,0.0,7.679191,8.676301
4,path sustainable urban living age rapid urbani...,1,0.622857,0.0,7.645714,8.642857


In [26]:
# Tokenize the text in each dataset
train_data['tokenized'] = train_data['text'].apply(word_tokenize)
test_data['tokenized'] = test_data['text'].apply(word_tokenize)
test_dep_data['tokenized'] = test_dep_data['text'].apply(word_tokenize)

In [27]:
# Train a Word2Vec model
model_w2v = Word2Vec(sentences=train_data['tokenized'], vector_size=100, window=5, min_count=1, workers=6)

# Convert text to a mean vector
def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.index_to_key]
    if len(doc) == 0:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(word2vec_model.wv[doc], axis=0)

In [28]:
train_data['doc_vector'] = train_data['tokenized'].apply(lambda x: document_vector(model_w2v, x))
test_data['doc_vector'] = test_data['tokenized'].apply(lambda x: document_vector(model_w2v, x))
test_dep_data['doc_vector'] = test_dep_data['tokenized'].apply(lambda x: document_vector(model_w2v, x))
train_data.head()

# save the train and test data as pickle file
# train_data.to_pickle('train_data.pkl')
# test_data.to_pickle('test_data.pkl')

Unnamed: 0,text,label,%unique_word_total,%stop_word_total,mean_word_length,mean_char_count_per_word,tokenized,doc_vector
0,carfree cities become subject increasing inter...,1,0.609499,0.0,7.627968,8.62533,"[carfree, cities, become, subject, increasing,...","[-0.014706382, -0.5769689, -0.8302832, 0.54603..."
1,car free cities carfree cities concept gaining...,1,0.652542,0.0,7.632768,8.629944,"[car, free, cities, carfree, cities, concept, ...","[-0.054762553, -0.52706367, -0.80999386, 0.430..."
2,sustainable urban future carfree cities emergi...,1,0.605042,0.0,7.77591,8.773109,"[sustainable, urban, future, carfree, cities, ...","[0.04172768, -0.54994303, -0.74033755, 0.37289..."
3,pioneering sustainable urban living era marked...,1,0.647399,0.0,7.679191,8.676301,"[pioneering, sustainable, urban, living, era, ...","[-0.014413038, -0.53128386, -0.66398543, 0.356..."
4,path sustainable urban living age rapid urbani...,1,0.622857,0.0,7.645714,8.642857,"[path, sustainable, urban, living, age, rapid,...","[-0.01715994, -0.5561988, -0.68364155, 0.43044..."


In [34]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, mean_squared_error

# load pickles to save time from running all above code
# train_data = pd.read_pickle('train_data.pkl')
# test_data = pd.read_pickle('test_data.pkl')

# Extract feature vectors for training and testing
X_train = np.array(list(train_data['doc_vector']))
X_test = np.array(list(test_data['doc_vector']))
X_test_dep = np.array(list(test_dep_data['doc_vector']))
y_train = train_data['label'].values
y_test = test_data['label'].values
y_test_dep = test_dep_data['label'].values

In [35]:
# Train a logistic regression model
log_reg = LogisticRegression(max_iter=50000) # set max_iter=50000 as the model does not converge with default value
log_reg.fit(X_train, y_train)

# Predictions and evaluations
y_pred = log_reg.predict(X_test)
print("Logistic Regression - Classification Report:")
print(classification_report(y_test, y_pred))
print("Logistic Regression - Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print('AUC:', roc_auc_score(y_test, y_pred))

print("\n=========== test_dep =================\n")

y_pred_dep = log_reg.predict(X_test_dep)
print("Logistic Regression - Classification Report:")
print(classification_report(y_test_dep, y_pred_dep))
print("Logistic Regression - Confusion Matrix:")
print(confusion_matrix(y_test_dep, y_pred_dep))
print('AUC:', roc_auc_score(y_test_dep, y_pred_dep))


Logistic Regression - Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.20      0.31        20
           1       0.53      0.90      0.67        20

    accuracy                           0.55        40
   macro avg       0.60      0.55      0.49        40
weighted avg       0.60      0.55      0.49        40

Logistic Regression - Confusion Matrix:
[[ 4 16]
 [ 2 18]]
AUC: 0.55


Logistic Regression - Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.84      0.86        25
           1       0.85      0.88      0.86        25

    accuracy                           0.86        50
   macro avg       0.86      0.86      0.86        50
weighted avg       0.86      0.86      0.86        50

Logistic Regression - Confusion Matrix:
[[21  4]
 [ 3 22]]
AUC: 0.86


In [37]:
# Train a LightGBM model
# choose LightGBM over SVC & RandomForest as it is more scalable and faster to train
import lightgbm as lgb
lgbm = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.05, max_depth=-1)
lgbm.fit(X_train, y_train)

# Predictions and evaluations
y_pred = lgbm.predict(X_test)
print("LightGBM - Classification Report:")
print(classification_report(y_test, y_pred))
print("LightGBM - Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print('AUC:', roc_auc_score(y_test, y_pred))

print("\n=========== test_dep =================\n")

y_pred_dep = lgbm.predict(X_test_dep)
print("LightGBM - Classification Report:")
print(classification_report(y_test_dep, y_pred_dep))
print("LightGBM - Confusion Matrix:")
print(confusion_matrix(y_test_dep, y_pred_dep))
print('AUC:', roc_auc_score(y_test_dep, y_pred_dep))


[LightGBM] [Info] Number of positive: 11183, number of negative: 16035
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005524 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 27218, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.410868 -> initscore=-0.360379
[LightGBM] [Info] Start training from score -0.360379
LightGBM - Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.05      0.10        20
           1       0.51      1.00      0.68        20

    accuracy                           0.53        40
   macro avg       0.76      0.53      0.39        40
weighted avg       0.76      0.53      0.39        40

LightGBM - Confusion Matrix:
[[ 1 19]
 [ 0 20]]
AUC: 0.525


LightGBM - Classification Report:
              precision    recall  f1-score   su

In [41]:
# Train a CatBoost model
from catboost import CatBoostClassifier
catboost = CatBoostClassifier(iterations=100, learning_rate=0.05, depth=5)
catboost.fit(X_train, y_train)

# Predictions and evaluations
y_pred = catboost.predict(X_test)
print("CatBoost - Classification Report:")
print(classification_report(y_test, y_pred))
print("CatBoost - Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print('AUC:', roc_auc_score(y_test, y_pred))

print("\n=========== test_dep =================\n")

y_pred_dep = catboost.predict(X_test_dep)
print("CatBoost - Classification Report:")
print(classification_report(y_test_dep, y_pred_dep))
print("CatBoost - Confusion Matrix:")
print(confusion_matrix(y_test_dep, y_pred_dep))
print('AUC:', roc_auc_score(y_test_dep, y_pred_dep))


0:	learn: 0.6250764	total: 26.4ms	remaining: 2.61s
1:	learn: 0.5694539	total: 35.1ms	remaining: 1.72s
2:	learn: 0.5200941	total: 42.2ms	remaining: 1.36s
3:	learn: 0.4807498	total: 50.2ms	remaining: 1.21s
4:	learn: 0.4444753	total: 56.2ms	remaining: 1.07s
5:	learn: 0.4096700	total: 62.3ms	remaining: 976ms
6:	learn: 0.3833262	total: 67.7ms	remaining: 900ms
7:	learn: 0.3596298	total: 73.5ms	remaining: 846ms
8:	learn: 0.3405934	total: 80.1ms	remaining: 810ms
9:	learn: 0.3198184	total: 85.5ms	remaining: 770ms
10:	learn: 0.3051654	total: 91.4ms	remaining: 740ms
11:	learn: 0.2883879	total: 96.7ms	remaining: 709ms
12:	learn: 0.2778217	total: 102ms	remaining: 682ms
13:	learn: 0.2645782	total: 107ms	remaining: 659ms
14:	learn: 0.2530048	total: 113ms	remaining: 640ms
15:	learn: 0.2432794	total: 118ms	remaining: 619ms
16:	learn: 0.2355100	total: 123ms	remaining: 601ms
17:	learn: 0.2268649	total: 129ms	remaining: 588ms
18:	learn: 0.2175869	total: 134ms	remaining: 573ms
19:	learn: 0.2104616	total: 1