In [1]:
from google.colab import drive
drive.mount('/content/drive')#connect with the google drive to get the data
import pandas as pd

# Load dataset
mbti_df = pd.read_csv('/content/drive/MyDrive/Comp8535_Group_Project-gennie-dev/mbti_1.csv')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# 1. Text normalization
# (i) convert to lowercase;
# (ii) remove url;
# (iii) remove numbers;
# (iv) remove non-alphanumeric characters (punctuation, special characters);
# (v) remove underscores and signs;
# (vi) replace multiple spaces with single spaces;
# (vii) remove stopwords;
# (viii) remove one-letter words;
def clean_text(text):
    text = str(text)
    text = text.lower()
    pattern = re.compile(r'https?://[a-zA-Z0-9./-]*/[a-zA-Z0-9?=_.]*[_0-9.a-zA-Z/-]*')
    text = re.sub(pattern, ' ', text)
    pattern = re.compile(r'[0-9]')
    text = re.sub(pattern, ' ', text)
    pattern = re.compile(r'\W+')
    text = re.sub(pattern, ' ', text)
    pattern = re.compile(r'[_+]')
    text = re.sub(pattern, ' ', text)
    pattern = re.compile(r'\s+')
    text = re.sub(pattern, ' ', text).strip()
    stop_words = stopwords.words("english")
    text = " ".join([w for w in text.split() if w not in stop_words])
    text = ' '.join([word for word in text.split() if len(word) > 1])
    return text

In [3]:
# 2. Lemmatization
# (i) use NLTK's lemmatizer
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

In [4]:
# 3. Select classification dimension (16-class classification or binary classification on single dimension)
def select_classification_dimension(df, num=0):
    if num == 16:
        return df['type']
    elif 1 <= num <= 4:
        return df['type'].str[num-1]
    else:
        print("selection error of classification dimension!")
        return df['type']

In [5]:
# 4. Label encoding
# (i) creates an array corresponding to the type labels.
def encode_labels(column):
    le = LabelEncoder()
    y = le.fit_transform(column)
    mapping = dict(zip(le.classes_, range(len(le.classes_))))
    print(f"Label encoding mapping: {mapping}")
    print(f"Encoded label examples: {y[:5]}")
    print(f"Unique encoded labels: {np.unique(y)}")
    return y, le

In [6]:
mbti_df["posts"] = mbti_df["posts"].str.lower()       #converts text in posts to lowercase as it is preferred in nlp

In [7]:
for i in range(len(mbti_df)):
  post_temp=mbti_df._get_value(i, 'posts')
  pattern = re.compile(r'https?://[a-zA-Z0-9./-]*/[a-zA-Z0-9?=_.]*[_0-9.a-zA-Z/-]*')    #to match url links present in the post
  post_temp= re.sub(pattern, ' ', post_temp)                                            #to replace that url link with space
  mbti_df._set_value(i, 'posts',post_temp)

In [8]:
for i in range(len(mbti_df)):
  post_temp=mbti_df._get_value(i, 'posts')
  pattern = re.compile(r'[0-9]')                                    #to match numbers from 0 to 9
  post_temp= re.sub(pattern, ' ', post_temp)                        #to replace them with space
  pattern = re.compile('\W+')                                       #to match alphanumeric characters
  post_temp= re.sub(pattern, ' ', post_temp)                        #to replace them with space
  pattern = re.compile(r'[_+]')
  post_temp= re.sub(pattern, ' ', post_temp)
  mbti_df._set_value(i, 'posts',post_temp)

In [9]:
for i in range(len(mbti_df)):
  post_temp=mbti_df._get_value(i, 'posts')
  pattern = re.compile('\s+')                                     #to match multiple whitespaces
  post_temp= re.sub(pattern, ' ', post_temp)                      #to replace them with single whitespace
  mbti_df._set_value(i, 'posts', post_temp)


In [10]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
remove_words = stopwords.words("english")
for i in range(mbti_df.shape[0]):
  post_temp=mbti_df._get_value(i, 'posts')
  post_temp=" ".join([w for w in post_temp.split(' ') if w not in remove_words])    #to remove stopwords
  mbti_df._set_value(i, 'posts', post_temp)

In [12]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
for i in range(mbti_df.shape[0]):
  post_temp=mbti_df._get_value(i, 'posts')
  post_temp=" ".join([lemmatizer.lemmatize(w) for w in post_temp.split(' ')])   #to implement lemmetization i.e. to group together different forms of a word
  mbti_df._set_value(i, 'posts', post_temp)


In [14]:
print(mbti_df)

      type                                              posts
0     INFJ   enfp intj moment sportscenter top ten play pr...
1     ENTP   finding lack post alarming sex boring positio...
2     INTP   good one course say know blessing curse absol...
3     INTJ   dear intp enjoyed conversation day esoteric g...
4     ENTJ   fired another silly misconception approaching...
...    ...                                                ...
8670  ISFP   ixfp always think cat fi doms reason especial...
8671  ENFP   thread already exists someplace else post hec...
8672  INTP   many question thing would take purple pill pi...
8673  INFP   conflicted right come wanting child honestly ...
8674  INFP   long since personalitycafe although seem chan...

[8675 rows x 2 columns]


In [15]:
from sklearn.model_selection import train_test_split
train_data,test_data=train_test_split(mbti_df,test_size=0.2,random_state=42,stratify=mbti_df.type)

print(test_data)

      type                                              posts
7814  INFP   macona depends big family extroverted people ...
2233  ENFJ   blodsmak sveltihel brilliant episode regenera...
7261  INFJ   heylena lol compliment accepted thank jeesh f...
7794  INFJ   pac right rocket coffin like packed warhead r...
2950  INTJ   title thread misleading mention world dominat...
...    ...                                                ...
2006  INTJ   one sentence restrictive accurately portray d...
7137  ISTJ   wanted like odd hybrid dr james wilson house ...
6091  ENTP   took cognitive process test got cognitive pro...
2997  INFJ   get caught fantacy relationship better forget...
5458  ENTJ   doll love movie listed make think tritype one...

[1735 rows x 2 columns]


In [16]:
vectorizer=TfidfVectorizer( max_features=5000,stop_words='english')
vectorizer.fit(train_data.posts)
train_post=vectorizer.transform(train_data.posts).toarray()
test_post=vectorizer.transform(test_data.posts).toarray()

In [17]:
from sklearn.preprocessing import LabelEncoder
target_encoder=LabelEncoder()
train_target=target_encoder.fit_transform(train_data.type)
test_target=target_encoder.fit_transform(test_data.type)

# # Set classification mode
# classification_mode = 3

# # Get binary classification labels
# train_labels = select_classification_dimension(train_data, classification_mode)
# test_labels = select_classification_dimension(test_data, classification_mode)

# # Encode labels
# target_encoder = LabelEncoder()
# train_target = target_encoder.fit_transform(train_labels)
# test_target = target_encoder.fit_transform(test_labels)

# # Verify labels
# print(f"Label classes: {target_encoder.classes_}")
# print(f"Encoded train labels (first 5): {train_target[:5]}")
# print(f"Unique encoded labels: {np.unique(train_target)}")

In [18]:
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')
nltk.download('punkt_tab')

def extract_stylometric_features(text):
    # print(f"Input text: {text}")
    sentences = sent_tokenize(text)
    # print(f"Split sentences: {sentences}")
    words = word_tokenize(text)
    # print(f"Words: {words}")
    num_sentences = len(sentences)
    # print(f"Number of sentences: {num_sentences}")
    num_words = len(words)
    # print(f"Number of words: {num_words}")
    num_chars = len(text)
    # print(f"Number of characters: {num_chars}")
    num_exclamations = text.count('!')
    # print(f"Number of exclamation marks: {num_exclamations}")
    num_questions = text.count('?')
    # print(f"Number of question marks: {num_questions}")
    num_uppercase_words = sum(1 for w in words if w.isupper())
    # print(f"Number of uppercase words: {num_uppercase_words}")
    lexical_diversity = len(set(words)) / num_words if num_words > 0 else 0
    # print(f"Lexical diversity: {lexical_diversity}")
    avg_word_length = np.mean([len(w) for w in words]) if words else 0
    # print(f"Average word length: {avg_word_length}")
    avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0
    # print(f"Average sentence length: {avg_sentence_length}")

    return [
        num_sentences,
        num_words,
        num_chars,
        avg_word_length,
        avg_sentence_length,
        num_exclamations,
        num_questions,
        num_uppercase_words,
        lexical_diversity
    ]

# Apply to both train and test
train_stylo = train_data["posts"].apply(extract_stylometric_features).tolist()
test_stylo = test_data["posts"].apply(extract_stylometric_features).tolist()
# print("First 3 elements of train_stylo:", train_stylo[:3])
# print("First 3 elements of test_stylo:", test_stylo[:3])
# Convert to numpy arrays
train_stylo_np = np.array(train_stylo)
test_stylo_np = np.array(test_stylo)
# print("Shape of train_stylo_np:", train_stylo_np.shape)
# print("Shape of test_stylo_np:", test_stylo_np.shape)
# print("First 3 rows of train_stylo_np:", train_stylo_np[:3, :])

X_train_combined_stylo = np.hstack([train_post, train_stylo_np])
X_test_combined_stylo = np.hstack([test_post, test_stylo_np])
# print("Shape of train_post:", train_post.shape)
# print("Shape of test_post:", test_post.shape)
# print("Shape of X_train_combined:", X_train_combined.shape)
# print("Shape of X_test_combined:", X_test_combined.shape)

# sample_post = train_data["posts"].iloc[0]
# features = extract_stylometric_features(sample_post)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [19]:
from collections import Counter

# Define MBTI traits
traits = ['I', 'E', 'N', 'S', 'T', 'F', 'J', 'P']

# 1: Group posts by trait to build trait_keywords
trait_groups = {trait: [] for trait in traits}
for i, row in mbti_df.iterrows():
    for t in row['type']:
        if t in traits:
            trait_groups[t].append(row['posts'])

# print("trait_groups['I']:")
# for post in trait_groups['I']:
#     print(f"  - {post[:30]}...")
# print("\ntrait_groups['E']:")
# for post in trait_groups['E']:
#     print(f"  - {post[:30]}...")
# print("\ntrait_groups['N']:")
# for post in trait_groups['N']:
#     print(f"  - {post[:30]}...")

# 2: Extract top TF-IDF keywords for each trait
def clean_tokenizer(text):
    custom_stopwords = set([
        'like', 'just', 'don', 'com', 'http', 'www', 'youtube', 'watch', 'infp',
        'intj', 'infj', 'intp', 'enfp', 'entp', 'type', 'https', 've', 'istp'
    ])
    tokens = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
    return [t for t in tokens if t not in custom_stopwords]

from sklearn.feature_extraction.text import TfidfVectorizer
trait_keywords = {}
top_k = 20

for trait, posts in trait_groups.items():
    vectorizer = TfidfVectorizer(
        tokenizer=clean_tokenizer,
        stop_words='english',
        max_features=1000
    )
    tfidf = vectorizer.fit_transform(posts)
    mean_scores = tfidf.mean(axis=0).A1
    vocab = vectorizer.get_feature_names_out()
    top_indices = mean_scores.argsort()[::-1][:top_k]
    top_words = [vocab[i] for i in top_indices]
    trait_keywords[trait] = top_words

# 3: Build transition matrix
co_matrix = np.zeros((8, 8))
trait_index = {t: i for i, t in enumerate(traits)}
for mbti in mbti_df['type']:
    chars = list(mbti)
    for t1 in chars:
        for t2 in chars:
            if t1 != t2:
                i, j = trait_index[t1], trait_index[t2]
                co_matrix[i][j] += 1
row_sums = co_matrix.sum(axis=1, keepdims=True)
transition_matrix = co_matrix / row_sums

# 4: Define vector extraction function
def extract_trait_vector(text, trait_keywords, use_transition=False, transition_matrix=None, normalize=True):
    words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
    word_counts = Counter(words)
    base_vector = np.array([
        sum(word_counts.get(w, 0) for w in trait_keywords[t]) for t in traits
    ])
    # Print base_vector for debugging
    # Print base_vector.shape # Sum the word counts of the top keywords for each trait, resulting in a vector of shape (8) for each post
    if not use_transition:
        return base_vector
    if base_vector.sum() == 0:
        return np.zeros(8)
    if normalize:
        base_vector = base_vector / base_vector.sum()
    return np.dot(base_vector, transition_matrix)



In [20]:
# vector (using transition matrix)
train_trait_vector = np.vstack(
    train_data['posts'].apply(lambda x: extract_trait_vector(x, trait_keywords, use_transition=True, transition_matrix=transition_matrix, normalize=True))
)
# print(train_trait_vector.shape) #construct the shape of （6940，8）

test_trait_vector = np.vstack(
    test_data['posts'].apply(lambda x: extract_trait_vector(x, trait_keywords, use_transition=True, transition_matrix=transition_matrix, normalize=True))
)

In [21]:

train_combined = np.hstack([train_post, train_trait_vector])
# print(train_combined.shape)
test_combined = np.hstack([test_post, test_trait_vector])
train_combined_stylo = np.hstack([X_train_combined_stylo, train_trait_vector])
test_combined_stylo = np.hstack([X_test_combined_stylo, test_trait_vector])


In [21]:
# Classifier 1: XGBoost Classifier
# Use TF-IDF features only

import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score

model_xgb_tfidf = xgb.XGBClassifier(tree_method='hist', device='cuda')
model_xgb_tfidf.fit(train_post, train_target)
pred_tfidf = model_xgb_tfidf.predict(test_post)
print("Test accuracy score for XGBoost Classifier with TF-IDF Features Only:\n", accuracy_score(test_target, pred_tfidf))
print("Classification Report of XGBoost Classifier with TF-IDF Features Only:")
print(classification_report(test_target, pred_tfidf, target_names=target_encoder.classes_, zero_division=0))

Test accuracy score for XGBoost Classifier with TF-IDF Features Only:
 0.6587896253602306
Classification Report of XGBoost Classifier with TF-IDF Features Only:
              precision    recall  f1-score   support

        ENFJ       0.59      0.42      0.49        38
        ENFP       0.69      0.59      0.64       135
        ENTJ       0.67      0.39      0.49        46
        ENTP       0.59      0.59      0.59       137
        ESFJ       1.00      0.11      0.20         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       1.00      0.25      0.40         8
        ESTP       0.46      0.33      0.39        18
        INFJ       0.68      0.72      0.70       294
        INFP       0.66      0.80      0.72       366
        INTJ       0.63      0.67      0.65       218
        INTP       0.70      0.77      0.73       261
        ISFJ       0.64      0.55      0.59        33
        ISFP       0.66      0.35      0.46        54
        ISTJ       0.65     

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [22]:
# Classifier 1: XGBoost Classifier
# Use TF-IDF features + Trait Vector

model_xgb_combined = xgb.XGBClassifier(tree_method='hist', device='cuda')
model_xgb_combined.fit(train_combined, train_target)
pred_combined = model_xgb_combined.predict(test_combined)
print("Test accuracy score for XGBoost Classifier with TF-IDF Features+Trait Vector:\n", accuracy_score(test_target, pred_combined))
print("Classification Report of XGBoost Classifier with TF-IDF Features+Trait Vector:")
print(classification_report(test_target, pred_combined, target_names=target_encoder.classes_, zero_division=0))

Test accuracy score for XGBoost Classifier with TF-IDF Features+Trait Vector:
 0.6651296829971182
Classification Report of XGBoost Classifier with TF-IDF Features+Trait Vector:
              precision    recall  f1-score   support

        ENFJ       0.57      0.42      0.48        38
        ENFP       0.70      0.59      0.64       135
        ENTJ       0.64      0.39      0.49        46
        ENTP       0.61      0.62      0.61       137
        ESFJ       1.00      0.11      0.20         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.75      0.38      0.50         8
        ESTP       0.42      0.28      0.33        18
        INFJ       0.69      0.72      0.71       294
        INFP       0.66      0.81      0.73       366
        INTJ       0.65      0.64      0.65       218
        INTP       0.70      0.79      0.74       261
        ISFJ       0.59      0.48      0.53        33
        ISFP       0.65      0.37      0.47        54
        ISTJ

In [23]:
# Classifier 1: XGBoost Classifier
# Use TF-IDF + Stylometric features
model_xgb_stylo = xgb.XGBClassifier(tree_method='hist', device='cuda')
model_xgb_stylo.fit(X_train_combined_stylo, train_target)
pred_stylo = model_xgb_stylo.predict(X_test_combined_stylo)
print("Test accuracy score for XGBoost Classifier with TF-IDF + stylometric features:\n", accuracy_score(test_target, pred_stylo))
print("Classification Report for XGBoost Classifier with TF-IDF + stylometric features:\n")
print(classification_report(test_target, pred_stylo, target_names=target_encoder.classes_, zero_division=0))


Test accuracy score for XGBoost Classifier with TF-IDF + stylometric features:
 0.6582132564841499
Classification Report for XGBoost Classifier with TF-IDF + stylometric features:

              precision    recall  f1-score   support

        ENFJ       0.61      0.45      0.52        38
        ENFP       0.68      0.58      0.62       135
        ENTJ       0.66      0.46      0.54        46
        ENTP       0.58      0.59      0.59       137
        ESFJ       1.00      0.11      0.20         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       1.00      0.38      0.55         8
        ESTP       0.50      0.28      0.36        18
        INFJ       0.67      0.70      0.69       294
        INFP       0.66      0.80      0.72       366
        INTJ       0.63      0.67      0.65       218
        INTP       0.70      0.78      0.74       261
        ISFJ       0.68      0.58      0.62        33
        ISFP       0.65      0.37      0.47        54
        

In [24]:
# Classifier 1: XGBoost Classifier
# Use TF-IDF + Stylometric features + Trait Vector
model_xgb_all = xgb.XGBClassifier(tree_method='hist', device='cuda')
model_xgb_all.fit(train_combined_stylo, train_target)
pred_all = model_xgb_all.predict(test_combined_stylo)
print("Test accuracy score for XGBoost with TF-IDF + Stylometric + trait vector features:\n", accuracy_score(test_target, pred_all))
print("Classification Report for XGBoost with TF-IDF + Stylometric + trait vector features:\n")
print(classification_report(test_target, pred_all, target_names=target_encoder.classes_, zero_division=0))

Test accuracy score for XGBoost with TF-IDF + Stylometric + trait vector features:
 0.6639769452449568
Classification Report for XGBoost with TF-IDF + Stylometric + trait vector features:

              precision    recall  f1-score   support

        ENFJ       0.65      0.39      0.49        38
        ENFP       0.71      0.63      0.67       135
        ENTJ       0.66      0.41      0.51        46
        ENTP       0.60      0.61      0.61       137
        ESFJ       1.00      0.11      0.20         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       1.00      0.38      0.55         8
        ESTP       0.45      0.28      0.34        18
        INFJ       0.68      0.72      0.70       294
        INFP       0.66      0.78      0.72       366
        INTJ       0.63      0.66      0.64       218
        INTP       0.71      0.77      0.74       261
        ISFJ       0.67      0.48      0.56        33
        ISFP       0.61      0.41      0.49        54


In [25]:
# Classifier 2: LightGBM Classifier
# Use TF-IDF features only
import lightgbm as lgb
from sklearn.metrics import classification_report, accuracy_score

model_lgb_tfidf = lgb.LGBMClassifier(random_state=42, device='gpu',verbose=-1)
model_lgb_tfidf.fit(train_post, train_target)
pred_tfidf = model_lgb_tfidf.predict(test_post)
print("Test accuracy score  for LightGBM Classifier with TF-IDF Features Only:\n", accuracy_score(test_target, pred_tfidf))
print("Classification Report of LightGBM Classifier for TF-IDF Features Only:")
print(classification_report(test_target, pred_tfidf, target_names=target_encoder.classes_, zero_division=0))



Test accuracy score  for LightGBM Classifier with TF-IDF Features Only:
 0.6708933717579251
Classification Report of LightGBM Classifier for TF-IDF Features Only:
              precision    recall  f1-score   support

        ENFJ       0.68      0.45      0.54        38
        ENFP       0.70      0.62      0.66       135
        ENTJ       0.68      0.33      0.44        46
        ENTP       0.61      0.61      0.61       137
        ESFJ       1.00      0.11      0.20         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.57      0.22      0.32        18
        INFJ       0.67      0.75      0.71       294
        INFP       0.65      0.80      0.72       366
        INTJ       0.64      0.68      0.66       218
        INTP       0.70      0.79      0.74       261
        ISFJ       0.80      0.48      0.60        33
        ISFP       0.76      0.48      0.59        54
        ISTJ       0.72   



In [26]:
# Classifier 2: LightGBM Classifier
# Use TF-IDF features + Trait Vector

model_lgb_combined = lgb.LGBMClassifier(random_state=42, device='gpu',verbose=-1)
model_lgb_combined.fit(train_combined, train_target)
pred_combined = model_lgb_combined.predict(test_combined)
print("Test accuracy score  for LightGBM Classifier with TF-IDF Features+Trait Vector:\n", accuracy_score(test_target, pred_combined))
print("Classification Report of LightGBM Classifier with TF-IDF Features+Trait Vector:")
print(classification_report(test_target, pred_combined, target_names=target_encoder.classes_, zero_division=0))



Test accuracy score  for LightGBM Classifier with TF-IDF Features+Trait Vector:
 0.6657060518731989
Classification Report of LightGBM Classifier with TF-IDF Features+Trait Vector:
              precision    recall  f1-score   support

        ENFJ       0.76      0.42      0.54        38
        ENFP       0.69      0.63      0.66       135
        ENTJ       0.64      0.35      0.45        46
        ENTP       0.64      0.61      0.62       137
        ESFJ       0.00      0.00      0.00         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.50      0.17      0.25        18
        INFJ       0.67      0.75      0.71       294
        INFP       0.65      0.82      0.73       366
        INTJ       0.63      0.63      0.63       218
        INTP       0.68      0.78      0.72       261
        ISFJ       0.83      0.45      0.59        33
        ISFP       0.76      0.46      0.57        54
        I



In [27]:
# Classifier 2: LightGBM Classifier
# Use TF-IDF + Stylometric features
model_lgb_stylo = lgb.LGBMClassifier(random_state=42, device='gpu',verbose=-1)
model_lgb_stylo.fit(X_train_combined_stylo, train_target)
pred_stylo = model_lgb_stylo.predict(X_test_combined_stylo)
print("Test accuracy score for LightGBM Classifier with TF-IDF + stylometric features:\n", accuracy_score(test_target, pred_stylo))
print("Classification Report for LightGBM Classifier with TF-IDF + stylometric features:\n")
print(classification_report(test_target, pred_stylo, target_names=target_encoder.classes_, zero_division=0))




Test accuracy score for LightGBM Classifier with TF-IDF + stylometric features:
 0.6622478386167147
Classification Report for LightGBM Classifier with TF-IDF + stylometric features:

              precision    recall  f1-score   support

        ENFJ       0.71      0.45      0.55        38
        ENFP       0.70      0.62      0.66       135
        ENTJ       0.62      0.33      0.43        46
        ENTP       0.62      0.61      0.61       137
        ESFJ       1.00      0.22      0.36         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.57      0.22      0.32        18
        INFJ       0.67      0.74      0.71       294
        INFP       0.65      0.80      0.71       366
        INTJ       0.63      0.66      0.64       218
        INTP       0.68      0.78      0.73       261
        ISFJ       0.75      0.45      0.57        33
        ISFP       0.77      0.44      0.56        54
      



In [28]:
# Classifier 2: LightGBM Classifier
# Use TF-IDF + Stylometric features + Trait Vector
model_lgb_all = lgb.LGBMClassifier(random_state=42, device='gpu',verbose=-1)
model_lgb_all.fit(train_combined_stylo, train_target)
pred_all = model_lgb_all.predict(test_combined_stylo)
print("Test accuracy score for LightGBM Classifier with TF-IDF + Stylometric + trait vector features:\n", accuracy_score(test_target, pred_all))
print("Classification Report for LightGBM Classifier with TF-IDF + Stylometric + trait vector features:\n")
print(classification_report(test_target, pred_all, target_names=target_encoder.classes_, zero_division=0))



Test accuracy score for LightGBM Classifier with TF-IDF + Stylometric + trait vector features:
 0.6680115273775216
Classification Report for LightGBM Classifier with TF-IDF + Stylometric + trait vector features:

              precision    recall  f1-score   support

        ENFJ       0.71      0.39      0.51        38
        ENFP       0.67      0.61      0.64       135
        ENTJ       0.73      0.35      0.47        46
        ENTP       0.66      0.63      0.64       137
        ESFJ       1.00      0.22      0.36         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.60      0.17      0.26        18
        INFJ       0.68      0.77      0.72       294
        INFP       0.65      0.81      0.72       366
        INTJ       0.62      0.64      0.63       218
        INTP       0.69      0.79      0.73       261
        ISFJ       0.76      0.39      0.52        33
        ISFP       0.75      0



In [22]:
# Classifier 3: Logistic Regression Classifier
# Use TF-IDF features only
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

model_lr_tfidf = LogisticRegression()
model_lr_tfidf.fit(train_post, train_target)
pred_tfidf = model_lr_tfidf.predict(test_post)

print("Test accuracy score for model trained on Logistic Regression for TF-IDF Features Only:\n",
      accuracy_score(test_target, pred_tfidf))
print("Classification Report of Logistic Regression Classifier for TF-IDF Features Only:")
print(classification_report(test_target, pred_tfidf, target_names=target_encoder.classes_, zero_division=0))

Test accuracy score for model trained on Logistic Regression for TF-IDF Features Only:
 0.6495677233429394
Classification Report of Logistic Regression Classifier for TF-IDF Features Only:
              precision    recall  f1-score   support

        ENFJ       0.64      0.18      0.29        38
        ENFP       0.75      0.59      0.66       135
        ENTJ       0.60      0.26      0.36        46
        ENTP       0.66      0.53      0.59       137
        ESFJ       1.00      0.11      0.20         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       1.00      0.11      0.20        18
        INFJ       0.65      0.71      0.68       294
        INFP       0.60      0.86      0.71       366
        INTJ       0.62      0.67      0.64       218
        INTP       0.68      0.84      0.75       261
        ISFJ       0.67      0.18      0.29        33
        ISFP       0.80      0.30      0.43        54


In [23]:
# Classifier 3: Logistic Regression Classifier
# Use TF-IDF features + Trait Vector
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

model_lr_combined = LogisticRegression()
model_lr_combined.fit(train_combined, train_target)
pred_combined = model_lr_combined.predict(test_combined)

print("Test accuracy score for model trained on Logistic Regression TF-IDF Features + Trait Vector:\n",
      accuracy_score(test_target, pred_combined))
print("Classification Report of Logistic Regression Classifier for TF-IDF Features + Trait Vector:")
print(classification_report(test_target, pred_combined, target_names=target_encoder.classes_, zero_division=0))

Test accuracy score for model trained on Logistic Regression TF-IDF Features + Trait Vector:
 0.6495677233429394
Classification Report of Logistic Regression Classifier for TF-IDF Features + Trait Vector:
              precision    recall  f1-score   support

        ENFJ       0.64      0.18      0.29        38
        ENFP       0.75      0.59      0.66       135
        ENTJ       0.60      0.26      0.36        46
        ENTP       0.66      0.53      0.59       137
        ESFJ       1.00      0.11      0.20         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       1.00      0.11      0.20        18
        INFJ       0.66      0.71      0.68       294
        INFP       0.60      0.86      0.71       366
        INTJ       0.62      0.67      0.64       218
        INTP       0.68      0.84      0.75       261
        ISFJ       0.67      0.18      0.29        33
        ISFP       0.80      0.30     

In [24]:
# Classifier 3: Logistic Regression Classifier
# Use TF-IDF + Stylometric features
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

model_lr_stylo = LogisticRegression()
model_lr_stylo.fit(X_train_combined_stylo, train_target)
pred_stylo = model_lr_stylo.predict(X_test_combined_stylo)
print("Test accuracy score for Logistic Regression Classifier with TF-IDF + stylometric features:\n", accuracy_score(test_target, pred_stylo))
print("Classification Report for Logistic Regression Classifier with TF-IDF + stylometric features:\n")
print(classification_report(test_target, pred_stylo, target_names=target_encoder.classes_, zero_division=0))


Test accuracy score for Logistic Regression Classifier with TF-IDF + stylometric features:
 0.2195965417867435
Classification Report for Logistic Regression Classifier with TF-IDF + stylometric features:

              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00        38
        ENFP       0.07      0.02      0.03       135
        ENTJ       0.00      0.00      0.00        46
        ENTP       0.00      0.00      0.00       137
        ESFJ       0.00      0.00      0.00         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.00      0.00      0.00        18
        INFJ       0.00      0.00      0.00       294
        INFP       0.22      0.86      0.35       366
        INTJ       0.24      0.06      0.09       218
        INTP       0.23      0.19      0.21       261
        ISFJ       0.00      0.00      0.00        33
        ISFP       0.00      0.00     

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
# Classifier 3: Logistic Regression Classifier
# Use TF-IDF + Stylometric features + Trait Vector
model_lr_all = LogisticRegression()
model_lr_all.fit(train_combined_stylo, train_target)
pred_all = model_lr_all.predict(test_combined_stylo)
print("Test accuracy score for Logistic Regression Classifier with TF-IDF + Stylometric + trait vector features:\n", accuracy_score(test_target, pred_all))
print("Classification Report for Logistic Regression Classifier with TF-IDF + Stylometric + trait vector features:\n")
print(classification_report(test_target, pred_all, target_names=target_encoder.classes_, zero_division=0))

Test accuracy score for Logistic Regression Classifier with TF-IDF + Stylometric + trait vector features:
 0.21671469740634006
Classification Report for Logistic Regression Classifier with TF-IDF + Stylometric + trait vector features:

              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00        38
        ENFP       0.08      0.02      0.03       135
        ENTJ       0.00      0.00      0.00        46
        ENTP       0.00      0.00      0.00       137
        ESFJ       0.00      0.00      0.00         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.00      0.00      0.00        18
        INFJ       0.00      0.00      0.00       294
        INFP       0.22      0.87      0.35       366
        INTJ       0.23      0.06      0.09       218
        INTP       0.23      0.16      0.19       261
        ISFJ       0.00      0.00      0.00        33
       

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
# Classifier 4: Support Vector Classifier
# Use TF-IDF features only
from cuml.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

model_svc_tfidf = SVC(random_state=42)
model_svc_tfidf.fit(train_post, train_target)
pred_tfidf = model_svc_tfidf.predict(test_post)
pred_training_tfidf = model_svc_tfidf.predict(train_post)

print("Test accuracy score for model trained on Support Vector Classifier TF-IDF Features Only:\n",
      accuracy_score(test_target, pred_tfidf))
print("Test classification report of Support Vector Classifier for TF-IDF Features Only:",
      classification_report(test_target, pred_tfidf, target_names=target_encoder.classes_, zero_division=0))

Test accuracy score for model trained on Support Vector Classifier (TF-IDF only):
 0.6512968299711815
Test classification report of Support Vector Classifier for TF-IDF Features Only:               precision    recall  f1-score   support

        ENFJ       0.67      0.26      0.38        38
        ENFP       0.77      0.57      0.66       135
        ENTJ       0.75      0.26      0.39        46
        ENTP       0.68      0.52      0.59       137
        ESFJ       0.33      0.11      0.17         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.80      0.22      0.35        18
        INFJ       0.67      0.69      0.68       294
        INFP       0.59      0.86      0.70       366
        INTJ       0.65      0.64      0.65       218
        INTP       0.65      0.84      0.74       261
        ISFJ       0.83      0.30      0.44        33
        ISFP       0.78      0.33      0.47        54
     

In [54]:
# Classifier 4: Support Vector Classifier
# Use TF-IDF features + Trait Vector
model_svc_combined = SVC(random_state=42, verbose=False)
model_svc_combined.fit(train_combined, train_target)
pred_combined = model_svc_combined.predict(test_combined)
pred_training_combined = model_svc_combined.predict(train_combined)

print("Test accuracy score for model trained on Support Vector Classifier with TF-IDF Features + Trait Vector:",
      accuracy_score(test_target, pred_combined))

print("Test classification report of Support Vector Classifier with TF-IDF Features + Trait Vector:\n",
      classification_report(test_target, pred_combined, target_names=target_encoder.classes_, zero_division=0))

Test accuracy score for model trained on Support Vector Classifier (Combined features): 0.6547550432276658
Test classification report of Support Vector Classifier for TF-IDF Features + Trait Vector:
               precision    recall  f1-score   support

        ENFJ       0.69      0.29      0.41        38
        ENFP       0.76      0.58      0.66       135
        ENTJ       0.72      0.28      0.41        46
        ENTP       0.68      0.53      0.59       137
        ESFJ       0.33      0.11      0.17         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.83      0.28      0.42        18
        INFJ       0.67      0.70      0.69       294
        INFP       0.59      0.86      0.70       366
        INTJ       0.65      0.64      0.65       218
        INTP       0.65      0.84      0.74       261
        ISFJ       0.83      0.30      0.44        33
        ISFP       0.78      0.33      0.47

In [40]:
# Classifier 4: Support Vector Classifier
# Use TF-IDF + Stylometric features
model_svc_stylo = SVC(random_state=42)
model_svc_stylo.fit(X_train_combined_stylo, train_target)
pred_stylo = model_lgb_stylo.predict(X_test_combined_stylo)
print("Test accuracy score for Support Vector Classifier with TF-IDF + stylometric features:\n", accuracy_score(test_target, pred_stylo))
print("Classification Report for Support Vector Classifier with TF-IDF + stylometric features:\n")
print(classification_report(test_target, pred_stylo, target_names=target_encoder.classes_, zero_division=0))


Test accuracy score for Support Vector Classifier with TF-IDF + stylometric features:
 0.6622478386167147
Classification Report for Support Vector Classifier with TF-IDF + stylometric features:

              precision    recall  f1-score   support

        ENFJ       0.71      0.45      0.55        38
        ENFP       0.70      0.62      0.66       135
        ENTJ       0.62      0.33      0.43        46
        ENTP       0.62      0.61      0.61       137
        ESFJ       1.00      0.22      0.36         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.57      0.22      0.32        18
        INFJ       0.67      0.74      0.71       294
        INFP       0.65      0.80      0.71       366
        INTJ       0.63      0.66      0.64       218
        INTP       0.68      0.78      0.73       261
        ISFJ       0.75      0.45      0.57        33
        ISFP       0.77      0.44      0.56     



In [41]:
# Classifier 4: Support Vector Classifier
# Use TF-IDF + Stylometric features + Trait Vector
model_svc_all = SVC(random_state=42)
model_svc_all.fit(train_combined_stylo, train_target)
pred_all = model_lgb_all.predict(test_combined_stylo)
print("Test accuracy score for Support Vector Classifier with TF-IDF + Stylometric + trait vector features:\n", accuracy_score(test_target, pred_all))
print("Classification Report for Support Vector Classifier with TF-IDF + Stylometric + trait vector features:\n")
print(classification_report(test_target, pred_all, target_names=target_encoder.classes_, zero_division=0))

Test accuracy score for Support Vector Classifier with TF-IDF + Stylometric + trait vector features:
 0.6680115273775216
Classification Report for Support Vector Classifier with TF-IDF + Stylometric + trait vector features:

              precision    recall  f1-score   support

        ENFJ       0.71      0.39      0.51        38
        ENFP       0.67      0.61      0.64       135
        ENTJ       0.73      0.35      0.47        46
        ENTP       0.66      0.63      0.64       137
        ESFJ       1.00      0.22      0.36         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.60      0.17      0.26        18
        INFJ       0.68      0.77      0.72       294
        INFP       0.65      0.81      0.72       366
        INTJ       0.62      0.64      0.63       218
        INTP       0.69      0.79      0.73       261
        ISFJ       0.76      0.39      0.52        33
        ISFP      

