In [9]:
import logging
import os
import re
from typing import List, Tuple

import contractions
import emoji
import nltk
import pandas as pd
import spacy
from nltk import ngrams, pos_tag, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from nltk.chunk import RegexpParser

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('stopwords', quiet=True)

STOP_WORDS = set(stopwords.words('english'))
WNL = WordNetLemmatizer()
nlp = spacy.load("en_core_web_sm", disable=["ner"]) 
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x12bd36c50>

In [2]:
# Cleaning functions
def clean_text(text: str) -> str:
    """Clean raw text by removing URLs, emojis, mentions, and MBTI codes."""
    text = text.lower()
    text = re.sub(r'https?\S+|www\S+', '', text)
    text = emoji.replace_emoji(text, replace='')
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r"[^a-z\']", ' ', text)
    # Remove MBTI type codes (e.g., INFJ, ENTP) to avoid leaking information
    text = re.sub(r'\b(I|E)(N|S)(F|T)(J|P)(S)?\b', '', text, flags=re.IGNORECASE)
    # Remove common footer
    text = re.sub(r'\bsent (from )?my \w+(\s\w+)? using tapatalk\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'w w w', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def preprocess_posts(posts_str: str) -> str:
    """Split multi-post string (separated by '|||'), clean each, and join."""
    posts = posts_str.split('|||')
    cleaned_posts = [clean_text(post) for post in posts]
    joined = ' '.join(cleaned_posts)
    return re.sub(r'\s+', ' ', joined).strip()

In [3]:
# Encoding function
def encode_mbti_type(mbti: str) -> Tuple[int, int, int, int]:
    """
    Encode MBTI type as 4 binary dimensions:
    I/E → 1/0, N/S → 1/0, F/T → 1/0, J/P → 1/0
    """
    return (
        1 if mbti[0] == 'I' else 0,
        1 if mbti[1] == 'N' else 0,
        1 if mbti[2] == 'F' else 0,
        1 if mbti[3] == 'J' else 0,
    )

In [4]:

def get_wordnet_pos(tag: str) -> str:
    """Map POS tag to WordNet format for lemmatization."""
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [5]:
# Tokenization variants
def tokens_without_lemma(text: str) -> List[str]:
    fixed = contractions.fix(text)
    tokens = word_tokenize(fixed)
    filtered_tokens = [word for word in tokens if word.lower() not in STOP_WORDS]
    return filtered_tokens

def tokens_with_lemma(text: str) -> List[str]:
    fixed = contractions.fix(text)
    tokens = word_tokenize(fixed)
    filtered_tokens = [word for word in tokens if word.lower() not in STOP_WORDS]
    lemmatized = [WNL.lemmatize(word.lower()) for word in filtered_tokens]
    return lemmatized

def tokens_with_lemma_pos(text: str) -> List[str]:
    fixed = contractions.fix(text)
    tokens = word_tokenize(fixed)
    filtered_tokens = [word for word in tokens if word.lower() not in STOP_WORDS]
    pos_tags = pos_tag(filtered_tokens)
    lemmatized = [
        WNL.lemmatize(token.lower(), pos=get_wordnet_pos(pos))
        for token, pos in pos_tags
    ]
    return lemmatized

def tokens_with_spacy_dep(text: str) -> List[str]:
    doc = nlp(contractions.fix(text))
    relations = []
    for sent in doc.sents:
        for token in sent:
            if token.pos_ == 'VERB':
                subj = next((child.text for child in token.children if child.dep_ == 'nsubj'), None)
                obj = next((child.text for child in token.children if child.dep_ in ('dobj', 'pobj')), None)
                if subj and obj:
                    relations.append(f"{subj}-{token.lemma_}-{obj}")
    return relations

def tokens_with_spacy_chunking(text: str) -> List[str]:
    doc = nlp(contractions.fix(text))
    chunks = [chunk.text for chunk in doc.noun_chunks] + [vp.text for vp in doc if vp.pos_ == 'VERB']
    return chunks

In [6]:
# N-gram generation
def generate_ngrams(tokens: List[str]) -> Tuple[List[Tuple], List[Tuple], List[Tuple]]:
    if not tokens:
        return [], [], []
    unigrams = list(ngrams(tokens, 1))
    bigrams = list(ngrams(tokens, 2))
    trigrams = list(ngrams(tokens, 3))
    return unigrams, bigrams, trigrams

In [10]:
def main():
    # Load data
    data_path = "../data/raw/mbti_1.csv"
    if not os.path.exists(data_path):
        logger.error(f"Dataset not found at {data_path}. Download from Kaggle.")
        return

    df = pd.read_csv(data_path)
    logger.info(f"Loaded dataset with {len(df)} rows.")

    # Clean posts
    logger.info("Cleaning posts...")
    tqdm.pandas()
    df['cleaned_posts'] = df['posts'].progress_apply(preprocess_posts)

    # Encode MBTI
    logger.info("Encoding MBTI types...")
    df[['IE', 'NS', 'FT', 'JP']] = df.progress_apply(
        lambda row: pd.Series(encode_mbti_type(row['type'])), axis=1
    )

    # Tokenization and n-grams for each variant
    variants = [
        ('without_lemma', tokens_without_lemma),
        ('with_lemma', tokens_with_lemma),
        ('with_lemma_pos', tokens_with_lemma_pos),
        ('with_dep_tree', tokens_with_spacy_dep),
        ('with_chunking', tokens_with_spacy_chunking)
    ]

    for var_name, token_func in variants:
        logger.info(f"Processing variant: {var_name}")
        df[f'tokens_{var_name}'] = df['cleaned_posts'].progress_apply(token_func)

        logger.info(f"Generating n-grams for {var_name}...")
        df[[f'Unigrams_{var_name}', f'Bigrams_{var_name}', f'Trigrams_{var_name}']] = df.progress_apply(
            lambda row: pd.Series(generate_ngrams(row[f'tokens_{var_name}'])), axis=1
        )

    # Split and save
    train_df, test_df = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df['type']
    )
    os.makedirs('../data/processed', exist_ok=True)
    train_df.to_pickle('../data/processed/train.pkl')
    test_df.to_pickle('../data/processed/test.pkl')
    logger.info("Data processed and saved.")

if __name__ == "__main__":
    main()

2025-11-22 12:12:01,100 - INFO - Loaded dataset with 8675 rows.
2025-11-22 12:12:01,102 - INFO - Cleaning posts...
100%|██████████| 8675/8675 [01:28<00:00, 98.17it/s] 
2025-11-22 12:13:29,479 - INFO - Encoding MBTI types...
100%|██████████| 8675/8675 [00:08<00:00, 1039.13it/s]
2025-11-22 12:13:37,833 - INFO - Processing variant: without_lemma
100%|██████████| 8675/8675 [00:38<00:00, 227.37it/s]
2025-11-22 12:14:15,993 - INFO - Generating n-grams for without_lemma...
100%|██████████| 8675/8675 [00:06<00:00, 1349.46it/s]
2025-11-22 12:14:22,427 - INFO - Processing variant: with_lemma
100%|██████████| 8675/8675 [01:07<00:00, 128.88it/s]
2025-11-22 12:15:29,743 - INFO - Generating n-grams for with_lemma...
100%|██████████| 8675/8675 [00:12<00:00, 690.76it/s] 
2025-11-22 12:15:42,308 - INFO - Processing variant: with_lemma_pos
100%|██████████| 8675/8675 [07:23<00:00, 19.54it/s]
2025-11-22 12:23:06,266 - INFO - Generating n-grams for with_lemma_pos...
100%|██████████| 8675/8675 [00:04<00:00,