In [24]:
import pandas as pd

# Load the MBTI dataset from a CSV file located in the working directory.
# NOTE(review): later cells read the `type` and `posts` columns of this frame — confirm the file provides both.
mbti_df = pd.read_csv('mbti_1.csv')


In [25]:
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# 1. Text normalization 
# (i) convert to lowercase;
# (ii) remove url; 
# (iii) remove numbers;
# (iv) remove non-alphanumeric characters (punctuation, special characters);
# (v) remove underscores and signs;
# (vi) replace multiple spaces with single spaces;
# (vii) remove stopwords; 
# (viii) remove one-letter words;
# Patterns are compiled once here instead of on every call to clean_text.
_URL_PATTERN = re.compile(r'https?://[a-zA-Z0-9./-]*/[a-zA-Z0-9?=_.]*[_0-9.a-zA-Z/-]*')
_DIGIT_PATTERN = re.compile(r'[0-9]')
_NON_WORD_PATTERN = re.compile(r'\W+')
_UNDERSCORE_PLUS_PATTERN = re.compile(r'[_+]')
_WHITESPACE_PATTERN = re.compile(r'\s+')

# Filled on first use so that defining this cell does not require the NLTK
# stopword corpus to be downloaded yet.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the English stopwords as a frozenset, reading the corpus at most once."""
    if "english" not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORD_CACHE["english"]


def clean_text(text):
    """Normalize a raw post into a space-separated string of lowercase tokens.

    Steps, in order: lowercase; replace URLs, digits, non-word characters,
    underscores/plus signs with a space; collapse whitespace; drop English
    stopwords and one-character tokens.

    Args:
        text: The text to clean. Any object is accepted and converted with
            ``str()`` first.

    Returns:
        The cleaned text as a single string (possibly empty).

    Note:
        The URL pattern requires a "/" after the host part, so a bare
        ``http://host`` is not removed as a URL; its pieces are still split
        apart by the punctuation step.
    """
    text = str(text).lower()
    # Order matters: URLs must be removed before punctuation is stripped,
    # otherwise the URL pattern can no longer match.
    for pattern in (
        _URL_PATTERN,
        _DIGIT_PATTERN,
        _NON_WORD_PATTERN,
        _UNDERSCORE_PLUS_PATTERN,
        _WHITESPACE_PATTERN,
    ):
        text = pattern.sub(' ', text)
    text = text.strip()

    # Set membership instead of scanning the stopword list for every token.
    stop_words = _english_stop_words()
    kept = [w for w in text.split() if w not in stop_words and len(w) > 1]
    return " ".join(kept)

In [26]:
# 2. Lemmatization
# (i) use NLTK's lemmatizer
def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Args:
        text: A string of tokens separated by whitespace.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    wnl = WordNetLemmatizer()
    lemmas = map(wnl.lemmatize, text.split())
    return ' '.join(lemmas)

In [27]:
# 3. Select classification dimension (16-class classification or binary classification on single dimension)
def select_classification_dimension(df, num=0):
    """Pick the target column for the chosen classification task.

    Args:
        df: DataFrame with a ``type`` column holding the type strings.
        num: ``16`` selects the full type label; ``1``..``4`` selects the
            single letter at that (1-based) position of the type string.
            Any other value prints an error message and falls back to the
            full type label.

    Returns:
        A Series with the selected labels.
    """
    if 1 <= num <= 4:
        letter_position = num - 1
        return df['type'].str[letter_position]
    if num != 16:
        # Unsupported selector: report it, then fall back to the full label.
        print("selection error of classification dimension!")
    return df['type']

In [28]:
# 4. Label encoding
# (i) creates an array corresponding to the type labels.
def encode_labels(column):
    """Encode a column of labels as integers and print a short summary.

    Args:
        column: The labels to encode.

    Returns:
        A tuple ``(y, le)`` of the encoded labels and the fitted
        ``LabelEncoder`` that produced them.
    """
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(column)

    # Class label -> integer code, in the encoder's own class order.
    class_to_code = {label: code for code, label in enumerate(encoder.classes_)}

    print(f"Label encoding mapping: {class_to_code}")
    print(f"Encoded label examples: {encoded[:5]}")
    print(f"Unique encoded labels: {np.unique(encoded)}")
    return encoded, encoder

In [29]:
# Lowercase every post so that later matching (stopwords, keywords) is case-insensitive.
# Note: this overwrites the `posts` column in place; the original casing is lost from here on.
mbti_df["posts"] = mbti_df["posts"].str.lower()

In [30]:
# Replace every URL in the posts with a single space.
# Done as one vectorized column operation instead of a per-row loop over the
# private DataFrame._get_value/_set_value accessors, and the pattern is
# compiled once rather than once per row.
pattern = re.compile(r'https?://[a-zA-Z0-9./-]*/[a-zA-Z0-9?=_.]*[_0-9.a-zA-Z/-]*')
mbti_df["posts"] = mbti_df["posts"].str.replace(pattern, ' ', regex=True)

In [31]:
# Replace digits, runs of non-word characters, and underscores/plus signs with spaces.
# Raw strings are used for all patterns: the previous non-raw '\W+' is an
# invalid escape sequence that newer Python versions warn about.
# Vectorized over the column instead of looping with the private
# DataFrame._get_value/_set_value accessors.
mbti_df["posts"] = (
    mbti_df["posts"]
    .str.replace(r'[0-9]', ' ', regex=True)   # digits 0-9
    .str.replace(r'\W+', ' ', regex=True)     # \W = anything that is NOT a letter, digit or underscore
    .str.replace(r'[_+]', ' ', regex=True)    # underscores survive \W, so remove them explicitly
)

In [32]:
# Collapse every run of whitespace into a single space.
# Uses a raw string (the previous non-raw '\s+' is an invalid escape sequence)
# and a vectorized column operation instead of a per-row loop over private
# DataFrame accessors. Leading/trailing spaces are intentionally left as-is,
# matching the previous behaviour.
mbti_df["posts"] = mbti_df["posts"].str.replace(r'\s+', ' ', regex=True)
     

In [33]:
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus that stopwords.words("english") reads in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
# Remove English stopwords from every post.
remove_words = stopwords.words("english")
# Frozenset gives constant-time membership tests; the list above was being
# scanned once per token.
remove_word_set = frozenset(remove_words)
# Splitting on ' ' (not on arbitrary whitespace) keeps empty tokens, so the
# existing spacing of each post is preserved exactly as before.
mbti_df["posts"] = mbti_df["posts"].map(
    lambda post: " ".join(w for w in post.split(' ') if w not in remove_word_set)
)

In [35]:
from nltk.stem import WordNetLemmatizer
# Module-level lemmatizer instance reused by the lemmatization loop in the next cell.
lemmatizer = WordNetLemmatizer()
# Fetch the WordNet corpus.
# NOTE(review): presumably the data lemmatizer.lemmatize() relies on — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/apple/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
# Lemmatize every token of every post, i.e. map inflected forms of a word to a
# common base form.
# Applied with Series.map instead of a per-row loop over the private
# DataFrame._get_value/_set_value accessors. Splitting on ' ' keeps the
# existing spacing of each post unchanged.
mbti_df["posts"] = mbti_df["posts"].map(
    lambda post: " ".join(lemmatizer.lemmatize(w) for w in post.split(' '))
)
     

In [37]:
# Inspect the frame after the cleaning, stopword-removal and lemmatization cells above.
print(mbti_df)

      type                                              posts
0     INFJ   enfp intj moment sportscenter top ten play pr...
1     ENTP   finding lack post alarming sex boring positio...
2     INTP   good one course say know blessing curse absol...
3     INTJ   dear intp enjoyed conversation day esoteric g...
4     ENTJ   fired another silly misconception approaching...
...    ...                                                ...
8670  ISFP   ixfp always think cat fi doms reason especial...
8671  ENFP   thread already exists someplace else post hec...
8672  INTP   many question thing would take purple pill pi...
8673  INFP   conflicted right come wanting child honestly ...
8674  INFP   long since personalitycafe although seem chan...

[8675 rows x 2 columns]


In [38]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set. The split is stratified on the type
# label and seeded so that it is reproducible.
train_data, test_data = train_test_split(
    mbti_df,
    test_size=0.2,
    random_state=42,
    stratify=mbti_df["type"],
)

print(test_data)

      type                                              posts
7814  INFP   macona depends big family extroverted people ...
2233  ENFJ   blodsmak sveltihel brilliant episode regenera...
7261  INFJ   heylena lol compliment accepted thank jeesh f...
7794  INFJ   pac right rocket coffin like packed warhead r...
2950  INTJ   title thread misleading mention world dominat...
...    ...                                                ...
2006  INTJ   one sentence restrictive accurately portray d...
7137  ISTJ   wanted like odd hybrid dr james wilson house ...
6091  ENTP   took cognitive process test got cognitive pro...
2997  INFJ   get caught fantacy relationship better forget...
5458  ENTJ   doll love movie listed make think tritype one...

[1735 rows x 2 columns]


In [39]:
# TF-IDF features over the 5000 highest-ranked terms. The vocabulary and IDF
# weights are learned from the training posts only and then applied to the
# test posts. Results are converted to dense arrays.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
train_post = vectorizer.fit_transform(train_data["posts"]).toarray()
test_post = vectorizer.transform(test_data["posts"]).toarray()

In [40]:
from sklearn.preprocessing import LabelEncoder
# Encode the type labels as integers.
target_encoder = LabelEncoder()
# Learn the label -> integer mapping from the training labels only.
train_target = target_encoder.fit_transform(train_data.type)
# Reuse that mapping for the test labels. Re-fitting on the test set (as was
# done before) would silently produce a different mapping whenever the set of
# classes in the two splits differs, making train and test codes incomparable.
test_target = target_encoder.transform(test_data.type)

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Download the Punkt tokenizer data.
# NOTE(review): presumably required by sent_tokenize / word_tokenize used below; newer NLTK
# releases may additionally need the 'punkt_tab' resource — confirm against the installed version.
nltk.download('punkt')

def extract_stylometric_features(text):
    """Compute nine surface-level style statistics for one text.

    Args:
        text: The text to analyse.

    Returns:
        A list, in this order: number of sentences, number of word tokens,
        number of characters, mean token length, tokens per sentence,
        count of '!', count of '?', number of all-uppercase tokens, and
        lexical diversity (distinct tokens / tokens). Ratios fall back to 0
        when their denominator is zero.
    """
    sentence_count = len(sent_tokenize(text))
    tokens = word_tokenize(text)
    token_count = len(tokens)

    if token_count:
        mean_token_length = np.mean([len(tok) for tok in tokens])
        type_token_ratio = len(set(tokens)) / token_count
    else:
        mean_token_length = 0
        type_token_ratio = 0

    if sentence_count > 0:
        tokens_per_sentence = token_count / sentence_count
    else:
        tokens_per_sentence = 0

    uppercase_token_count = sum(1 for tok in tokens if tok.isupper())

    return [
        sentence_count,
        token_count,
        len(text),
        mean_token_length,
        tokens_per_sentence,
        text.count('!'),
        text.count('?'),
        uppercase_token_count,
        type_token_ratio,
    ]

# Apply to both train and test.
# The `posts` column of train_data/test_data has already been lowercased and
# stripped of punctuation by the cleaning cells, so computing these features
# on it would always give 0 for '!', '?' and uppercase counts and a single
# sentence per post. The stylometric features are therefore computed on the
# raw text, re-read from the CSV and aligned to each split by row label.
# NOTE(review): this relies on mbti_df having been loaded from the same file
# with the default integer index, which the split frames inherit.
raw_posts = pd.read_csv('mbti_1.csv', usecols=['posts'])['posts']
train_stylo = raw_posts.loc[train_data.index].apply(extract_stylometric_features).tolist()
test_stylo = raw_posts.loc[test_data.index].apply(extract_stylometric_features).tolist()

# Convert to numpy arrays of shape (n_rows, 9).
train_stylo_np = np.array(train_stylo)
test_stylo_np = np.array(test_stylo)

# Append the stylometric columns to the right of the TF-IDF features.
X_train_combined = np.hstack([train_post, train_stylo_np])
X_test_combined = np.hstack([test_post, test_stylo_np])


In [42]:
from collections import Counter

# Define MBTI traits
traits = ['I', 'E', 'N', 'S', 'T', 'F', 'J', 'P']

# 1: Group posts by trait to build trait_keywords.
# Only the training split is used: these groups are keyed by the true label,
# so including held-out rows would leak test labels into the keyword
# features that the test set is later scored with.
trait_groups = {trait: [] for trait in traits}
for mbti_type, post in zip(train_data['type'], train_data['posts']):
    for letter in mbti_type:
        # Ignore any character that is not one of the eight trait letters.
        if letter in trait_groups:
            trait_groups[letter].append(post)

# 2: Extract top TF-IDF keywords for each trait
def clean_tokenizer(text):
    """Tokenize ``text`` into lowercase alphabetic words of 3+ letters.

    Tokens found in a fixed list of corpus-specific noise words (URL
    fragments, contraction remnants and some MBTI type names) are dropped.

    Args:
        text: The text to tokenize.

    Returns:
        A list of the remaining tokens, in order of appearance.
    """
    noise_words = {
        'like', 'just', 'don', 'com', 'http', 'www', 'youtube', 'watch', 'infp',
        'intj', 'infj', 'intp', 'enfp', 'entp', 'type', 'https', 've', 'istp',
    }
    kept = []
    for token in re.findall(r'\b[a-zA-Z]{3,}\b', text.lower()):
        if token in noise_words:
            continue
        kept.append(token)
    return kept

from sklearn.feature_extraction.text import TfidfVectorizer
# For each trait, keep the top_k terms with the highest mean TF-IDF score
# across that trait's posts.
trait_keywords = {}
top_k = 20

for trait, posts in trait_groups.items():
    if not posts:
        # No posts for this trait: fitting would fail on an empty corpus.
        trait_keywords[trait] = []
        continue
    # A dedicated name is used on purpose: rebinding `vectorizer` here would
    # overwrite the TF-IDF vectorizer fitted on the training posts earlier in
    # the notebook, so any later vectorizer.transform(...) would silently use
    # the wrong vocabulary.
    trait_vectorizer = TfidfVectorizer(
        tokenizer=clean_tokenizer,
        token_pattern=None,  # unused when a tokenizer is given; None avoids the warning
        stop_words='english',
        max_features=1000
    )
    tfidf = trait_vectorizer.fit_transform(posts)
    # Mean score per term as a flat 1-D array (works for both sparse matrix
    # and sparse array results, unlike the matrix-only `.A1`).
    mean_scores = np.asarray(tfidf.mean(axis=0)).ravel()
    vocab = trait_vectorizer.get_feature_names_out()
    top_indices = mean_scores.argsort()[::-1][:top_k]
    top_words = [vocab[i] for i in top_indices]
    trait_keywords[trait] = top_words

# 3: Build transition matrix.
# co_matrix[i][j] counts how often trait i and trait j appear in the same
# type label; each row is then normalized to sum to 1.
# Labels come from the training split only, so no statistics of the held-out
# labels flow into the features.
trait_index = {t: i for i, t in enumerate(traits)}
co_matrix = np.zeros((len(traits), len(traits)))
for mbti in train_data['type']:
    # Skip characters that are not one of the known trait letters, mirroring
    # the guard used when the posts are grouped by trait.
    chars = [c for c in mbti if c in trait_index]
    for t1 in chars:
        for t2 in chars:
            if t1 != t2:
                co_matrix[trait_index[t1]][trait_index[t2]] += 1
row_sums = co_matrix.sum(axis=1, keepdims=True)
# A trait that never occurs has a zero row sum; leave its row as zeros
# instead of dividing by zero and filling the matrix with NaN.
transition_matrix = np.divide(
    co_matrix,
    row_sums,
    out=np.zeros_like(co_matrix),
    where=row_sums > 0,
)

# 4: Define vector extraction function
def extract_trait_vector(text, trait_keywords, use_transition=False, transition_matrix=None, normalize=True):
    """Turn a text into one score per trait based on keyword counts.

    Args:
        text: The text to score.
        trait_keywords: Mapping from each trait in the module-level ``traits``
            list to an iterable of keywords for that trait.
        use_transition: If False, return the raw keyword counts. If True,
            multiply the (optionally normalized) counts by
            ``transition_matrix``.
        transition_matrix: Square matrix with one row/column per trait.
            Required when ``use_transition`` is True.
        normalize: When ``use_transition`` is True, divide the counts by
            their sum before applying the matrix.

    Returns:
        A 1-D numpy array with one entry per trait, in the order of
        ``traits``. With ``use_transition=True`` and no keyword hits, an
        all-zero vector is returned.

    Raises:
        ValueError: If ``use_transition`` is True but no
            ``transition_matrix`` was supplied.
    """
    # Fail early with a clear message instead of a cryptic error from np.dot
    # (or a silent all-zero result) when the matrix was forgotten.
    if use_transition and transition_matrix is None:
        raise ValueError("transition_matrix is required when use_transition=True")

    words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
    word_counts = Counter(words)
    base_vector = np.array([
        sum(word_counts.get(w, 0) for w in trait_keywords[t]) for t in traits
    ])
    if not use_transition:
        return base_vector
    if base_vector.sum() == 0:
        # No keyword matched: nothing to normalize or propagate.
        return np.zeros(len(traits))
    if normalize:
        base_vector = base_vector / base_vector.sum()
    return np.dot(base_vector, transition_matrix)




In [43]:
# Trait vectors (keyword counts normalized, then passed through the transition matrix)
def _transition_trait_vector(post):
    """Score one post with the module-level keywords and transition matrix."""
    return extract_trait_vector(
        post,
        trait_keywords,
        use_transition=True,
        transition_matrix=transition_matrix,
        normalize=True,
    )


train_trait_vector = np.vstack(train_data['posts'].apply(_transition_trait_vector))

test_trait_vector = np.vstack(test_data['posts'].apply(_transition_trait_vector))

In [44]:
# Append the trait-vector columns to the right of the TF-IDF features.
train_combined = np.concatenate((train_post, train_trait_vector), axis=1)
test_combined = np.concatenate((test_post, test_trait_vector), axis=1)
