In [1]:
# Load dataset
import pandas as pd
mbti_df = pd.read_csv('mbti_1.csv')

import os
os.environ["CUML_LOG_LEVEL"] = "ERROR"


In [2]:
# data preprocess
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('stopwords')

# 1. Text normalization
# (i) convert to lowercase;
# (ii) remove url;
# (iii) remove numbers;
# (iv) remove non-alphanumeric characters (punctuation, special characters);
# (v) remove underscores and signs;
# (vi) replace multiple spaces with single spaces;
# (vii) remove stopwords;
# (viii) remove one-letter words;
def clean_text(text):
    text = str(text)
    text = text.lower()
    pattern = re.compile(r'https?://[a-zA-Z0-9./-]*/[a-zA-Z0-9?=_.]*[_0-9.a-zA-Z/-]*')
    text = re.sub(pattern, ' ', text)
    pattern = re.compile(r'[0-9]')
    text = re.sub(pattern, ' ', text)
    pattern = re.compile(r'\W+')
    text = re.sub(pattern, ' ', text)
    pattern = re.compile(r'[_+]')
    text = re.sub(pattern, ' ', text)
    pattern = re.compile(r'\s+')
    text = re.sub(pattern, ' ', text).strip()
    stop_words = stopwords.words("english")
    text = " ".join([w for w in text.split() if w not in stop_words])
    text = ' '.join([word for word in text.split() if len(word) > 1])
    return text

# 2. Lemmatization
# (i) use NLTK's lemmatizer
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

# 3. Select classification dimension (16-class classification or binary classification on single dimension)
def select_classification_dimension(df, num=0):
    if num == 16:
        return df['type']
    elif 1 <= num <= 4:
        return df['type'].str[num-1]
    else:
        print("selection error of classification dimension!")
        return df['type']

# 4. Label encoding
# (i) creates an array corresponding to the type labels.
def encode_labels(column):
    le = LabelEncoder()
    y = le.fit_transform(column)
    mapping = dict(zip(le.classes_, range(len(le.classes_))))
    print(f"Label encoding mapping: {mapping}")
    print(f"Encoded label examples: {y[:5]}")
    print(f"Unique encoded labels: {np.unique(y)}")
    return y, le

mbti_df["posts"] = mbti_df["posts"].str.lower()       #converts text in posts to lowercase as it is preferred in nlp

for i in range(len(mbti_df)):
  post_temp=mbti_df._get_value(i, 'posts')
  pattern = re.compile(r'https?://[a-zA-Z0-9./-]*/[a-zA-Z0-9?=_.]*[_0-9.a-zA-Z/-]*')    #to match url links present in the post
  post_temp= re.sub(pattern, ' ', post_temp)                                            #to replace that url link with space
  mbti_df._set_value(i, 'posts',post_temp)

for i in range(len(mbti_df)):
  post_temp=mbti_df._get_value(i, 'posts')
  pattern = re.compile(r'[0-9]')                                    #to match numbers from 0 to 9
  post_temp= re.sub(pattern, ' ', post_temp)                        #to replace them with space
  pattern = re.compile('\W+')                                       #to match alphanumeric characters
  post_temp= re.sub(pattern, ' ', post_temp)                        #to replace them with space
  pattern = re.compile(r'[_+]')
  post_temp= re.sub(pattern, ' ', post_temp)
  mbti_df._set_value(i, 'posts',post_temp)

for i in range(len(mbti_df)):
  post_temp=mbti_df._get_value(i, 'posts')
  pattern = re.compile('\s+')                                     #to match multiple whitespaces
  post_temp= re.sub(pattern, ' ', post_temp)                      #to replace them with single whitespace
  mbti_df._set_value(i, 'posts', post_temp)
remove_words = stopwords.words("english")                         # remove stopwords
for i in range(mbti_df.shape[0]):
  post_temp=mbti_df._get_value(i, 'posts')
  post_temp=" ".join([w for w in post_temp.split(' ') if w not in remove_words])    #to remove stopwords
  mbti_df._set_value(i, 'posts', post_temp)

lemmatizer = WordNetLemmatizer()
for i in range(mbti_df.shape[0]):
  post_temp=mbti_df._get_value(i, 'posts')
  post_temp=" ".join([lemmatizer.lemmatize(w) for w in post_temp.split(' ')])   #to implement lemmetization i.e. to group together different forms of a word
  mbti_df._set_value(i, 'posts', post_temp)


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/meitongliu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/meitongliu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from sklearn.model_selection import train_test_split
train_data,test_data=train_test_split(mbti_df,test_size=0.2,random_state=42,stratify=mbti_df.type)

print(test_data)

      type                                              posts
7814  INFP   macona depends big family extroverted people ...
2233  ENFJ   blodsmak sveltihel brilliant episode regenera...
7261  INFJ   heylena lol compliment accepted thank jeesh f...
7794  INFJ   pac right rocket coffin like packed warhead r...
2950  INTJ   title thread misleading mention world dominat...
...    ...                                                ...
2006  INTJ   one sentence restrictive accurately portray d...
7137  ISTJ   wanted like odd hybrid dr james wilson house ...
6091  ENTP   took cognitive process test got cognitive pro...
2997  INFJ   get caught fantacy relationship better forget...
5458  ENTJ   doll love movie listed make think tritype one...

[1735 rows x 2 columns]


In [4]:
from sklearn.preprocessing import LabelEncoder
# vectorize the data with TF-IDF

vectorizer=TfidfVectorizer( max_features=5000,stop_words='english')
vectorizer.fit(train_data.posts)
train_post=vectorizer.transform(train_data.posts).toarray()
test_post=vectorizer.transform(test_data.posts).toarray()
target_encoder=LabelEncoder()
train_target=target_encoder.fit_transform(train_data.type)
test_target = target_encoder.transform(test_data.type)        


In [5]:
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')
nltk.download('punkt_tab')

def extract_stylometric_features(text):
    # print(f"Input text: {text}")
    sentences = sent_tokenize(text)
    # print(f"Split sentences: {sentences}")
    words = word_tokenize(text)
    # print(f"Words: {words}")
    num_sentences = len(sentences)
    # print(f"Number of sentences: {num_sentences}")
    num_words = len(words)
    # print(f"Number of words: {num_words}")
    num_chars = len(text)
    # print(f"Number of characters: {num_chars}")
    num_exclamations = text.count('!')
    # print(f"Number of exclamation marks: {num_exclamations}")
    num_questions = text.count('?')
    # print(f"Number of question marks: {num_questions}")
    num_uppercase_words = sum(1 for w in words if w.isupper())
    # print(f"Number of uppercase words: {num_uppercase_words}")
    lexical_diversity = len(set(words)) / num_words if num_words > 0 else 0
    # print(f"Lexical diversity: {lexical_diversity}")
    avg_word_length = np.mean([len(w) for w in words]) if words else 0
    # print(f"Average word length: {avg_word_length}")
    avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0
    # print(f"Average sentence length: {avg_sentence_length}")

    return [
        num_sentences,
        num_words,
        num_chars,
        avg_word_length,
        avg_sentence_length,
        num_exclamations,
        num_questions,
        num_uppercase_words,
        lexical_diversity
    ]

# Apply to both train and test
train_stylo = train_data["posts"].apply(extract_stylometric_features).tolist()
test_stylo = test_data["posts"].apply(extract_stylometric_features).tolist()
# Convert to numpy arrays
train_stylo_np = np.array(train_stylo)
test_stylo_np = np.array(test_stylo)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_stylo_scaled = scaler.fit_transform(train_stylo_np)  
test_stylo_scaled = scaler.transform(test_stylo_np)       

# train_stylo_scaled = train_stylo_np
# test_stylo_scaled = test_stylo_np      



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/meitongliu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/meitongliu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [6]:
from collections import Counter

# Define MBTI traits
traits = ['I', 'E', 'N', 'S', 'T', 'F', 'J', 'P']

# 1: Group posts by trait to build trait_keywords
trait_groups = {trait: [] for trait in traits}
for i, row in mbti_df.iterrows():
    for t in row['type']:
        if t in traits:
            trait_groups[t].append(row['posts'])


# 2: Extract top TF-IDF keywords for each trait
def clean_tokenizer(text):
    custom_stopwords = set([
        'like', 'just', 'don', 'com', 'http', 'www', 'youtube', 'watch', 'infp',
        'intj', 'infj', 'intp', 'enfp', 'entp', 'type', 'https', 've', 'istp'
    ])
    tokens = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
    return [t for t in tokens if t not in custom_stopwords]

from sklearn.feature_extraction.text import TfidfVectorizer
trait_keywords = {}
top_k = 20

for trait, posts in trait_groups.items():
    vectorizer = TfidfVectorizer(
        tokenizer=clean_tokenizer,
        stop_words='english',
        max_features=1000
    )
    tfidf = vectorizer.fit_transform(posts)
    mean_scores = tfidf.mean(axis=0).A1
    vocab = vectorizer.get_feature_names_out()
    top_indices = mean_scores.argsort()[::-1][:top_k]
    top_words = [vocab[i] for i in top_indices]
    trait_keywords[trait] = top_words

# 3: Build transition matrix
co_matrix = np.zeros((8, 8))
trait_index = {t: i for i, t in enumerate(traits)}
for mbti in mbti_df['type']:
    chars = list(mbti)
    for t1 in chars:
        for t2 in chars:
            if t1 != t2:
                i, j = trait_index[t1], trait_index[t2]
                co_matrix[i][j] += 1
row_sums = co_matrix.sum(axis=1, keepdims=True)
transition_matrix = co_matrix / row_sums

# 4: Define vector extraction function
def extract_trait_vector(text, trait_keywords, use_transition=False, transition_matrix=None, normalize=True):
    words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
    word_counts = Counter(words)
    base_vector = np.array([
        sum(word_counts.get(w, 0) for w in trait_keywords[t]) for t in traits
    ])
    # Print base_vector for debugging
    # Print base_vector.shape # Sum the word counts of the top keywords for each trait, resulting in a vector of shape (8) for each post
    if not use_transition:
        return base_vector
    if base_vector.sum() == 0:
        return np.zeros(8)
    if normalize:
        base_vector = base_vector / base_vector.sum()
    return np.dot(base_vector, transition_matrix)



In [7]:
# vector (using transition matrix)
train_trait_vector = np.vstack(
    train_data['posts'].apply(lambda x: extract_trait_vector(x, trait_keywords, use_transition=True, transition_matrix=transition_matrix, normalize=True))
)
# print(train_trait_vector.shape) #construct the shape of （6940，8）

test_trait_vector = np.vstack(
    test_data['posts'].apply(lambda x: extract_trait_vector(x, trait_keywords, use_transition=True, transition_matrix=transition_matrix, normalize=True))
)

scaler_trait = StandardScaler()
train_trait_vector_scaled = scaler_trait.fit_transform(train_trait_vector)  
test_trait_vector_scaled = scaler_trait.transform(test_trait_vector)       


# train_trait_vector_scaled = train_trait_vector
# test_trait_vector_scaled = test_trait_vector

In [8]:
print(test_trait_vector_scaled.shape)

(1735, 8)


In [9]:
# interface of the classification model

# tf-idf : train_post  test_post

# trait : train_trait_vector_scaled  test_trait_vector_scaled

# stylo : train_stylo_np  test_stylo_np

# trait+stylo. 17
train_trait_stylo = np.hstack([train_stylo_np, train_trait_vector_scaled])
test_trait_stylo = np.hstack([test_stylo_np, test_trait_vector_scaled])

# tfidf+trait.  5008
train_combined = np.hstack([train_post, train_trait_vector_scaled])
test_combined = np.hstack([test_post, test_trait_vector_scaled])

# tfidf+ stylo.  5009
X_train_combined_stylo = np.hstack([train_post, train_stylo_np])
X_test_combined_stylo = np.hstack([test_post, test_stylo_np])

# all 5017
train_combined_stylo = np.hstack([X_train_combined_stylo, train_trait_vector])
test_combined_stylo = np.hstack([X_test_combined_stylo, test_trait_vector])


In [10]:
# SMOTE application

from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=22)

switch_of_SMOTE = 0   # 0: don't use smote        1: use smote
if switch_of_SMOTE:
    # 1. tfidf only
    X_train_tfidf, y_train_tfidf = smote.fit_resample(train_post, train_target)
    
    # 2. trait only
    X_train_trait , y_train_trait  = smote.fit_resample(train_trait_vector_scaled, train_target)
    
    # 3. stylo only
    X_train_stylo , y_train_stylo  = smote.fit_resample(train_stylo_np, train_target)
    
    # 4. trait + stylo
    X_train_trait_stylo , y_train_trait_stylo  = smote.fit_resample(train_trait_stylo, train_target)
    
    # 5. tfidf + trait
    X_train_combined , y_train_combined  = smote.fit_resample(train_combined, train_target)
    
    # 6. tfidf + stylo
    X_train_combined_stylo , y_train_combined_stylo  = smote.fit_resample(X_train_combined_stylo, train_target)
    
    # 7. tfidf + stylo + trait
    X_train_all , y_train_all  = smote.fit_resample(train_combined_stylo, train_target)

else:
    # 1. tfidf only
    X_train_tfidf, y_train_tfidf = train_post, train_target
    
    # 2. trait only
    X_train_trait , y_train_trait  = train_trait_vector_scaled, train_target
    
    # 3. stylo only
    X_train_stylo , y_train_stylo  = train_stylo_np, train_target
    
    # 4. trait + stylo
    X_train_trait_stylo , y_train_trait_stylo  = train_trait_stylo, train_target
    
    # 5. tfidf + trait
    X_train_combined , y_train_combined  = train_combined, train_target
    
    # 6. tfidf + stylo
    X_train_combined_stylo , y_train_combined_stylo  = X_train_combined_stylo, train_target
    
    # 7. tfidf + stylo + trait
    X_train_all , y_train_all  = train_combined_stylo, train_target


In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

def dir_run_logistic_regression(X_train, y_train, X_test, y_test, label_encoder=None, balance= 0, model_name="Logistic Regression"):
    """
    Train and evaluate a logistic regression model on the given dataset.

    Args:
        X_train: ndarray, training features (SMOTE applied).
        y_train: ndarray, training labels.
        X_test: ndarray, test features.
        y_test: ndarray, test labels.
        label_encoder: sklearn LabelEncoder, used to convert label ids to names.
        model_name: str, used to name the evaluation report.

    Returns:
        model: trained LogisticRegression model.
    """
    print(f"=== Training {model_name} ===")
    model = None
    if balance:
        model = LogisticRegression(class_weight='balanced', max_iter=1000)
    else:
        model = LogisticRegression( max_iter=1000)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    acc = accuracy_score(y_test, preds)
    print(f"\n[Test Accuracy for {model_name}]: {acc:.4f}")

    print("Test accuracy score is ", accuracy_score(test_target, pred_combined))

    print(f"\n[Classification Report for {model_name}]:")
    print(classification_report(y_test, preds, target_names=target_encoder.classes_, zero_division=0))

    return model


In [22]:
model_all = run_logistic_regression(
    X_train_tfidf, y_train_tfidf,
    test_post, test_target,
    model_name="TFIDF "
)

=== Training TFIDF  ===

[Test Accuracy for TFIDF ]: 0.6882

[Classification Report for TFIDF ]:
              precision    recall  f1-score   support

           0       0.60      0.66      0.62        38
           1       0.68      0.65      0.67       135
           2       0.55      0.67      0.61        46
           3       0.60      0.59      0.60       137
           4       0.33      0.33      0.33         9
           5       0.00      0.00      0.00        10
           6       0.71      0.62      0.67         8
           7       0.48      0.67      0.56        18
           8       0.77      0.68      0.72       294
           9       0.75      0.76      0.75       366
          10       0.66      0.65      0.66       218
          11       0.72      0.80      0.76       261
          12       0.65      0.61      0.62        33
          13       0.63      0.57      0.60        54
          14       0.59      0.56      0.57        41
          15       0.64      0.72     

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import os

def run_logistic_regression(X_train, y_train, X_test, y_test,
                            label_encoder=None,
                            model_name="Logistic Regression",
                            log_file_path="LR_non_smote_results_log.txt"):
    """
    Train and evaluate a logistic regression model, print and log results.

    Args:
        X_train, y_train: training data.
        X_test, y_test: testing data.
        label_encoder: optional, for label names in classification report.
        model_name: name for this model variant.
        log_file_path: file to which logs are written.
    """
    model = LogisticRegression(class_weight='balanced', max_iter=1000)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    acc = accuracy_score(y_test, preds)

    if label_encoder is not None:
        target_names = label_encoder.classes_
    else:
        target_names = None

    report = classification_report(y_test, preds, target_names=target_encoder.classes_, zero_division=0)

    # Prepare log text
    log_text = f"\n{'='*60}\nModel: {model_name}\nAccuracy: {acc:.4f}\n{report}\n"

    # Print to console
    print(log_text)

    # Append to log file
    with open(log_file_path, "a") as f:
        f.write(log_text)

    return model


In [12]:
model1 = run_logistic_regression(X_train_tfidf, y_train_tfidf, test_post, test_target,  label_encoder=None, model_name="TF-IDF only")
model2 = run_logistic_regression(X_train_trait, y_train_trait, test_trait_vector_scaled, test_target,  label_encoder=None, model_name="Trait only")
model3 = run_logistic_regression(X_train_stylo, y_train_stylo, test_stylo_np, test_target,  label_encoder=None, model_name="Stylo only")
model4 = run_logistic_regression(X_train_trait_stylo, y_train_trait_stylo, test_trait_stylo, test_target,  label_encoder=None, model_name="Trait + Stylo")
model5 = run_logistic_regression(X_train_combined, y_train_combined, test_combined, test_target,  label_encoder=None, model_name="TF-IDF + Trait")
model6 = run_logistic_regression(X_train_combined_stylo, y_train_combined_stylo, X_test_combined_stylo, test_target,  label_encoder=None, model_name="TF-IDF + Stylo")
model7 = run_logistic_regression(X_train_all, y_train_all, test_combined_stylo, test_target,  label_encoder=None, model_name="TF-IDF + Stylo + Trait")



Model: TF-IDF only
Accuracy: 0.6680
              precision    recall  f1-score   support

        ENFJ       0.45      0.68      0.54        38
        ENFP       0.66      0.63      0.64       135
        ENTJ       0.45      0.67      0.54        46
        ENTP       0.62      0.62      0.62       137
        ESFJ       0.33      0.33      0.33         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.62      0.62      0.62         8
        ESTP       0.44      0.67      0.53        18
        INFJ       0.79      0.64      0.71       294
        INFP       0.76      0.70      0.73       366
        INTJ       0.68      0.63      0.65       218
        INTP       0.74      0.77      0.75       261
        ISFJ       0.59      0.67      0.63        33
        ISFP       0.47      0.57      0.52        54
        ISTJ       0.50      0.56      0.53        41
        ISTP       0.57      0.76      0.65        67

    accuracy                           0.67

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Model: Stylo only
Accuracy: 0.0790
              precision    recall  f1-score   support

        ENFJ       0.03      0.05      0.04        38
        ENFP       0.00      0.00      0.00       135
        ENTJ       0.02      0.09      0.03        46
        ENTP       0.12      0.01      0.03       137
        ESFJ       0.00      0.11      0.01         9
        ESFP       0.01      0.40      0.02        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.02      0.22      0.03        18
        INFJ       0.20      0.17      0.18       294
        INFP       0.00      0.00      0.00       366
        INTJ       0.20      0.28      0.23       218
        INTP       0.25      0.03      0.05       261
        ISFJ       0.00      0.00      0.00        33
        ISFP       0.00      0.00      0.00        54
        ISTJ       0.00      0.00      0.00        41
        ISTP       0.00      0.00      0.00        67

    accuracy                           0.08 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Model: Trait + Stylo
Accuracy: 0.0928
              precision    recall  f1-score   support

        ENFJ       0.02      0.03      0.02        38
        ENFP       0.20      0.05      0.08       135
        ENTJ       0.03      0.15      0.06        46
        ENTP       0.04      0.01      0.01       137
        ESFJ       0.01      0.11      0.02         9
        ESFP       0.01      0.20      0.02        10
        ESTJ       0.01      0.25      0.03         8
        ESTP       0.02      0.11      0.03        18
        INFJ       0.23      0.11      0.15       294
        INFP       0.25      0.01      0.02       366
        INTJ       0.22      0.24      0.23       218
        INTP       0.22      0.09      0.13       261
        ISFJ       0.08      0.24      0.12        33
        ISFP       0.11      0.09      0.10        54
        ISTJ       0.07      0.12      0.09        41
        ISTP       0.06      0.15      0.08        67

    accuracy                           0.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Model: TF-IDF + Stylo
Accuracy: 0.3948
              precision    recall  f1-score   support

        ENFJ       0.35      0.58      0.44        38
        ENFP       0.39      0.44      0.41       135
        ENTJ       0.12      0.50      0.20        46
        ENTP       0.43      0.43      0.43       137
        ESFJ       0.12      0.33      0.18         9
        ESFP       0.05      0.10      0.07        10
        ESTJ       0.35      0.75      0.48         8
        ESTP       0.24      0.72      0.36        18
        INFJ       0.34      0.26      0.29       294
        INFP       0.60      0.40      0.48       366
        INTJ       0.49      0.32      0.39       218
        INTP       0.62      0.44      0.52       261
        ISFJ       0.51      0.58      0.54        33
        ISFP       0.25      0.26      0.25        54
        ISTJ       0.14      0.41      0.21        41
        ISTP       0.55      0.63      0.58        67

    accuracy                           0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

def run_svm_classifier(X_train, y_train, X_test, y_test,
                       label_encoder=None,
                       model_name="SVM",
                       log_file_path="SVM_non_smote_results_log.txt"):
    """
    Train and evaluate a Support Vector Machine classifier, print and log results.

    Args:
        X_train, y_train: training data.
        X_test, y_test: test data.
        label_encoder: optional LabelEncoder to show label names in report.
        model_name: name to label this model variant.
        log_file_path: path to append logs to.

    Returns:
        model: trained sklearn.svm.SVC model.
    """
    model = SVC(random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    acc = accuracy_score(y_test, preds)


    report = classification_report(y_test, preds, target_names=target_encoder.classes_, zero_division=0)

    # Log + print output
    log_text = f"\n{'='*60}\nModel: {model_name}\nAccuracy: {acc:.4f}\n{report}\n"
    print(log_text)

    with open(log_file_path, "a") as f:
        f.write(log_text)

    return model


In [34]:
model_svm = run_svm_classifier(
    X_train_tfidf, y_train_tfidf,
    test_post, test_target,
    label_encoder=target_encoder,
    model_name="SVM with TF-IDF"
)



Model: SVM with TF-IDF
Accuracy: 0.6490
              precision    recall  f1-score   support

        ENFJ       0.65      0.29      0.40        38
        ENFP       0.75      0.57      0.65       135
        ENTJ       0.67      0.26      0.38        46
        ENTP       0.66      0.51      0.58       137
        ESFJ       0.50      0.22      0.31         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.71      0.28      0.40        18
        INFJ       0.67      0.69      0.68       294
        INFP       0.59      0.85      0.70       366
        INTJ       0.67      0.64      0.65       218
        INTP       0.65      0.83      0.73       261
        ISFJ       0.82      0.27      0.41        33
        ISFP       0.80      0.37      0.51        54
        ISTJ       0.76      0.32      0.45        41
        ISTP       0.75      0.54      0.63        67

    accuracy                           

In [14]:
model1 = run_svm_classifier(X_train_tfidf, y_train_tfidf, test_post, test_target,  label_encoder=None, model_name="TF-IDF only")
model2 = run_svm_classifier(X_train_trait, y_train_trait, test_trait_vector_scaled, test_target,  label_encoder=None, model_name="Trait only")
model3 = run_svm_classifier(X_train_stylo, y_train_stylo, test_stylo_np, test_target,  label_encoder=None, model_name="Stylo only")
model4 = run_svm_classifier(X_train_trait_stylo, y_train_trait_stylo, test_trait_stylo, test_target,  label_encoder=None, model_name="Trait + Stylo")
model5 = run_svm_classifier(X_train_combined, y_train_combined, test_combined, test_target,  label_encoder=None, model_name="TF-IDF + Trait")
model6 = run_svm_classifier(X_train_combined_stylo, y_train_combined_stylo, X_test_combined_stylo, test_target,  label_encoder=None, model_name="TF-IDF + Stylo")
model7 = run_svm_classifier(X_train_all, y_train_all, test_combined_stylo, test_target,  label_encoder=None, model_name="TF-IDF + Stylo + Trait")



Model: TF-IDF only
Accuracy: 0.6548
              precision    recall  f1-score   support

        ENFJ       0.67      0.26      0.38        38
        ENFP       0.76      0.59      0.66       135
        ENTJ       0.76      0.28      0.41        46
        ENTP       0.68      0.52      0.59       137
        ESFJ       0.33      0.11      0.17         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.83      0.28      0.42        18
        INFJ       0.67      0.69      0.68       294
        INFP       0.60      0.86      0.70       366
        INTJ       0.65      0.64      0.65       218
        INTP       0.65      0.84      0.74       261
        ISFJ       0.83      0.30      0.44        33
        ISFP       0.78      0.33      0.47        54
        ISTJ       0.75      0.29      0.42        41
        ISTP       0.76      0.57      0.65        67

    accuracy                           0.65