In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# import pandas as pd
# from distributed import Client, LocalCluster
# cluster = LocalCluster(memory_limit='8GB')
# client = Client(cluster)
import modin.pandas as pd
import modin.config as modin_cfg
modin_cfg.Engine.put("ray")  # Modin will use Ray

import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [3]:
# kaggle_data = pd.read_csv('/content/drive/MyDrive/NUS_MSBA/BT5153_Final Group Project_Shared Folder/Data/kaggle_data.csv')
# wiki_data = pd.read_csv('/content/drive/MyDrive/NUS_MSBA/BT5153_Final Group Project_Shared Folder/Data/wiki_data.csv')
# essay = pd.read_csv('/content/drive/MyDrive/NUS_MSBA/BT5153_Final Group Project_Shared Folder/Data/competition_essay.csv')
# test_data = essay.drop(columns='Remark')

# Load the three CSV files from the current working directory
# (local equivalents of the commented-out Google Drive paths above).
kaggle_data = pd.read_csv('kaggle_data.csv')
wiki_data = pd.read_csv('wiki_data.csv')
essay = pd.read_csv('competition_essay.csv')
# Drop the 'Remark' column; the remaining essay columns serve as the test set further down.
test_data = essay.drop(columns='Remark')

2024-04-16 22:31:16,726	INFO worker.py:1752 -- Started a local Ray instance.


In [4]:
kaggle_data.head()

Unnamed: 0,text,label
0,Car-free cities have become a subject of incre...,1
1,"Car Free Cities Car-free cities, a concept ga...",1
2,A Sustainable Urban Future Car-free cities ...,1
3,Pioneering Sustainable Urban Living In an e...,1
4,The Path to Sustainable Urban Living In an ...,1


In [5]:
wiki_data.head()

Unnamed: 0,text,label
0,Sexhow railway station was a railway station b...,0.0
1,"In Finnish folklore, all places and things, an...",0.0
2,"In mathematics, specifically differential calc...",0.0
3,is a Japanese shōjo manga series written and i...,0.0
4,"Robert Milner ""Rob"" Bradley, Jr. (born August ...",0.0


In [6]:
test_data.head()


Unnamed: 0,Essay,Label
0,"In this new century, there are a daunting numb...",0
1,Many associate the word courage with a comic b...,0
2,"In the unfolding story of the 21st century, ou...",1
3,Courage often conjures images of gallant knigh...,1
4,Americans have rewritten the Declaration of In...,0


In [7]:
# Positional rename: the two remaining columns (displayed above as Essay, Label) become
# text, label so the essay set uses the same column names as kaggle_data / wiki_data.
# This relies on the column order staying (essay text, label).
test_data.columns=['text','label']
test_data.head()

Unnamed: 0,text,label
0,"In this new century, there are a daunting numb...",0
1,Many associate the word courage with a comic b...,0
2,"In the unfolding story of the 21st century, ou...",1
3,Courage often conjures images of gallant knigh...,1
4,Americans have rewritten the Declaration of In...,0


## Data Preprocessing

In [8]:
# Data Preprocessing
# !pip install --upgrade pip
# !pip install transformers
# !pip install ftfy
# !pip install ax-platform

In [9]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import utils
import gensim.models
from ftfy import fix_text
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jiajiazhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jiajiazhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
def data_preprocessing(df):
    """Return a cleaned copy of ``df`` with incomplete and repeated rows removed.

    Steps, applied in order:
      1. drop every row that contains a missing value;
      2. collapse exact repeats of a (text, label) pair down to one row;
      3. drop every remaining row whose text still occurs more than once,
         i.e. the same text carrying different labels.

    The input frame is not modified; a new frame is returned.
    """
    return (
        df.dropna()
        .drop_duplicates(subset=['text', 'label'])
        .drop_duplicates(subset='text', keep=False)
    )

In [11]:
kaggle_data = data_preprocessing(kaggle_data)
wiki_data = data_preprocessing(wiki_data)
test_data = data_preprocessing(test_data)

In [12]:
# combine the text into corpus
# The three cleaned frames are stacked row-wise. pd.concat keeps each frame's own index
# labels here, so labels repeat across sources (tail() shows the essay rows).
df_list = [kaggle_data, wiki_data, test_data]
text_corpus = pd.concat(df_list)
text_corpus.tail()

Unnamed: 0,text,label
45,Amidst the bustling corridors of power in Wash...,1
46,On an unusually warm Saturday in April 2003 at...,0
47,On the first blush of a crisp autumn morning i...,1
48,"U.S. President John F. Kennedy told us, “A nat...",0
49,"In the swirling currents of political life, wh...",1


In [13]:
# define meta feature function
class TextFeatureExtractor:
    """Derive simple length / vocabulary ratio features from a ``text`` column."""

    def __init__(self):
        # NLTK's English stop-word list, stored as a set for constant-time membership tests.
        self.stopwords = set(stopwords.words('english'))

    def transform(self, df):
        """Return a new frame: the columns of ``df`` plus four text meta-features.

        Added columns:
          * ``%unique_word_total``       - distinct whitespace tokens / total tokens
          * ``%stop_word_total``         - lower-cased tokens found in ``self.stopwords`` / total tokens
          * ``mean_word_length``         - mean character length of the whitespace tokens
          * ``mean_char_count_per_word`` - total characters (whitespace included) / total tokens

        Parameters
        ----------
        df : DataFrame
            Must contain a ``text`` column; each value is passed through ``str()``.

        Returns
        -------
        DataFrame
            A new frame. ``df`` itself is left untouched (no helper columns are
            written into the caller's frame), and calling ``transform`` again on
            the result overwrites the four features instead of duplicating them.

        Notes
        -----
        Texts with no tokens (empty or whitespace-only) get NaN for all four
        features rather than a division-by-zero ``inf``.
        """
        stop_words = self.stopwords

        # Split each document once and reuse the tokens for the token-based features.
        tokens = df['text'].apply(lambda x: str(x).split())
        word_count = tokens.apply(lambda words: len(words))
        unique_word_count = tokens.apply(lambda words: len(set(words)))
        stop_word_count = df['text'].apply(
            lambda x: sum(1 for w in str(x).lower().split() if w in stop_words)
        )
        char_count = df['text'].apply(lambda x: len(str(x)))
        mean_word_length = tokens.apply(
            lambda words: np.mean([len(w) for w in words]) if words else np.nan
        )

        # A zero token count becomes NaN so every ratio below is NaN instead of inf.
        safe_word_count = word_count.where(word_count > 0)

        # assign() builds a new frame and appends the features in this order.
        return df.assign(**{
            '%unique_word_total': unique_word_count / safe_word_count,
            '%stop_word_total': stop_word_count / safe_word_count,
            'mean_word_length': mean_word_length,
            'mean_char_count_per_word': char_count / safe_word_count,
        })

In [14]:
# NOTE(review): meta_feature_list is not referenced by any cell shown in this notebook, and
# 'url_count' is not a feature that TextFeatureExtractor.transform computes - confirm whether
# this list is still needed.
meta_feature_list = ['word_count', 'unique_word_count', 'stop_word_count', 'url_count', 'mean_word_length', 'char_count']
text_feature_extractor = TextFeatureExtractor()
# Each frame is rebound to transform()'s result: its original columns plus the four
# ratio / mean features.
kaggle_data = text_feature_extractor.transform(kaggle_data)
wiki_data = text_feature_extractor.transform(wiki_data)
test_data = text_feature_extractor.transform(test_data)

In [15]:
def normalise_text(text):
    """Return ``text`` repaired by ftfy, lower-cased, stripped of ASCII punctuation,
    with every run of two or more whitespace characters collapsed to one space.
    """
    # Translation table that deletes every character in string.punctuation.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = fix_text(text).lower().translate(punctuation_remover)
    # Collapse after punctuation removal, since deleting punctuation can leave double spaces.
    return re.sub(r'\s{2,}', ' ', cleaned)

In [16]:
# fix + normalise text in a single pass
# normalise_text() already begins by calling fix_text(), so running fix_text over the whole
# corpus first only repeated that work.
# NOTE(review): the models further down are fitted on train_data / test_data, not on this
# normalised text_corpus - confirm that is intended.
text_corpus['text'] = text_corpus['text'].apply(normalise_text)

In [17]:
text_corpus.head()

Unnamed: 0,text,label
0,carfree cities have become a subject of increa...,1.0
1,car free cities carfree cities a concept gaini...,1.0
2,a sustainable urban future carfree cities are...,1.0
3,pioneering sustainable urban living in an era...,1.0
4,the path to sustainable urban living in an ag...,1.0


## Word2Vec + Logistic Regression

In [18]:
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

# Splitting data back into training and test datasets
# Training set = Kaggle + Wikipedia frames (with meta-features); test_data is reused as
# built in the feature-extraction cell, so no reassignment is needed.
train_data = pd.concat([kaggle_data, wiki_data])
# Print the shapes explicitly: a bare tuple that is not the cell's last expression is discarded.
print(train_data.shape, test_data.shape)
train_data.head()

Unnamed: 0,text,label,%unique_word_total,%stop_word_total,mean_word_length,mean_char_count_per_word
0,Car-free cities have become a subject of incre...,1.0,0.573913,0.34087,6.083478,7.114783
1,"Car Free Cities Car-free cities, a concept ga...",1.0,0.623782,0.309942,6.298246,7.323587
2,A Sustainable Urban Future Car-free cities ...,1.0,0.580078,0.302734,6.445312,7.476562
3,Pioneering Sustainable Urban Living In an e...,1.0,0.585742,0.333333,6.175337,7.204239
4,The Path to Sustainable Urban Living In an ...,1.0,0.579256,0.315068,6.207436,7.236791


In [19]:
# Tokenize the text in each dataset
# NOTE(review): this tokenizes the raw 'text' column. normalise_text() was only applied to
# text_corpus earlier, so case and punctuation are still present in these tokens - confirm
# that is intended.
train_data['tokenized'] = train_data['text'].apply(word_tokenize)
test_data['tokenized'] = test_data['text'].apply(word_tokenize)

In [20]:
# Train a Word2Vec model
# Fitted on the training tokens only: 100-dimensional vectors, context window of 5,
# every token kept (min_count=1), 6 worker threads.
# NOTE(review): no seed is passed and training is multi-threaded, so the learned vectors are
# presumably not reproducible between runs - confirm whether that matters for the scores below.
model_w2v = Word2Vec(sentences=train_data['tokenized'], vector_size=100, window=5, min_count=1, workers=6)

# Convert text to a mean vector
def document_vector(word2vec_model, doc):
    """Average the Word2Vec vectors of the in-vocabulary tokens of one document.

    Parameters
    ----------
    word2vec_model : trained gensim Word2Vec model (uses ``.wv`` and ``.vector_size``).
    doc : iterable of str
        The document's tokens.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the tokens found in the model vocabulary, or a zero
        vector of length ``vector_size`` when no token is in the vocabulary.
    """
    # key_to_index is a dict, so each membership test is a hash lookup; the previous
    # `word in wv.index_to_key` scanned the whole vocabulary list for every token.
    vocabulary = word2vec_model.wv.key_to_index
    known_words = [word for word in doc if word in vocabulary]
    if not known_words:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(word2vec_model.wv[known_words], axis=0)

In [21]:
train_data['doc_vector'] = train_data['tokenized'].apply(lambda x: document_vector(model_w2v, x))
test_data['doc_vector'] = test_data['tokenized'].apply(lambda x: document_vector(model_w2v, x))
train_data.head()

# save the train and test data as pickle file
# train_data.to_pickle('train_data.pkl')
# test_data.to_pickle('test_data.pkl')

[36m(raylet)[0m Spilled 2232 MiB, 159 objects, write throughput 830 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.


In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# load pickles to save time from running all above code
# The to_pickle() calls that would write these files are commented out above, so only load
# the pickles when both exist; otherwise keep using the train_data / test_data frames built
# earlier in this session. Fail with a clear message when neither source is available.
# NOTE: unpickling can execute arbitrary code - only load files produced by this notebook.
import os

if os.path.exists('train_data.pkl') and os.path.exists('test_data.pkl'):
    train_data = pd.read_pickle('train_data.pkl')
    test_data = pd.read_pickle('test_data.pkl')
elif 'train_data' not in globals() or 'test_data' not in globals():
    raise FileNotFoundError(
        "train_data.pkl / test_data.pkl not found and train_data / test_data are not in memory; "
        "run the feature-building cells above first."
    )

# Extract feature vectors for training and testing
# Each 'doc_vector' cell holds one fixed-length array; stacking them gives a 2-D feature matrix.
X_train = np.array(list(train_data['doc_vector']))
X_test = np.array(list(test_data['doc_vector']))
y_train = train_data['label'].values
y_test = test_data['label'].values

In [10]:
# Train a logistic regression model
log_reg = LogisticRegression(max_iter=50000) # set max_iter=50000 as the model does not converge with default value
log_reg.fit(X_train, y_train)

# Predictions and evaluations
# zero_division=0 reports an undefined precision (a class that is never predicted) as 0.0,
# the same value shown by default, without emitting UndefinedMetricWarning.
y_pred = log_reg.predict(X_test)
print("Logistic Regression - Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))
print("Logistic Regression - Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Logistic Regression - Classification Report:
              precision    recall  f1-score   support

           0       0.51      1.00      0.68        25
           1       0.00      0.00      0.00        24

    accuracy                           0.51        49
   macro avg       0.26      0.50      0.34        49
weighted avg       0.26      0.51      0.34        49

Logistic Regression - Confusion Matrix:
[[25  0]
 [24  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Train a Naive Bayes model
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predictions and evaluations
# zero_division=0 reports an undefined precision (a class that is never predicted) as 0.0,
# the same value shown by default, without emitting UndefinedMetricWarning.
y_pred = gnb.predict(X_test)
print("Naive Bayes - Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))
print("Naive Bayes - Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Naive Bayes - Classification Report:
              precision    recall  f1-score   support

           0       0.51      1.00      0.68        25
           1       0.00      0.00      0.00        24

    accuracy                           0.51        49
   macro avg       0.26      0.50      0.34        49
weighted avg       0.26      0.51      0.34        49

Naive Bayes - Confusion Matrix:
[[25  0]
 [24  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Train a LightGBM model
# choose LightGBM over SVC & RandomForest as it is more scalable and faster to train
import lightgbm as lgb
# 100 boosting rounds at learning rate 0.05; max_depth=-1 leaves tree depth unlimited.
lgbm = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.05, max_depth=-1)
lgbm.fit(X_train, y_train)

# Predictions and evaluations
# zero_division=0 keeps the report well-defined (0.0, no warning) if a class is never predicted.
y_pred = lgbm.predict(X_test)
print("LightGBM - Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))
print("LightGBM - Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

LightGBM - Classification Report:
              precision    recall  f1-score   support

           0       0.52      1.00      0.68        25
           1       1.00      0.04      0.08        24

    accuracy                           0.53        49
   macro avg       0.76      0.52      0.38        49
weighted avg       0.76      0.53      0.39        49

LightGBM - Confusion Matrix:
[[25  0]
 [23  1]]
