In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, split, array_join, expr, lower
!pip install numpy
from pyspark.sql.types import StringType
import pyspark.sql.functions as F
! pip install scikit-learn
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from transformers import DistilBertTokenizer
import torch
from sklearn.model_selection import train_test_split


# Create the Spark session for the preprocessing notebook (or reuse the one
# already attached to this kernel). Memory limits are set explicitly because
# later cells count and collect from multi-million-row DataFrames.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.maxResultSize", "2g")
    .appName("BERTDataPreprocessing")
    .getOrCreate()
)







In [28]:
# Datasets that were considered but are not loaded in this notebook:
#df_comments_MB = spark.read.csv("./mbti9k_comments.csv")  # probably won't use this
#df_post = spark.read.csv("./typed_posts.csv")  # numeric data, not suitable for DistilBERT; might try it with a different model if time allows

# Both input files have a header row; column types are inferred by Spark.
_csv_options = {"header": True, "inferSchema": True}

# Load the MBTI posts dataset (mbti_1.csv)
df_mbti_1 = spark.read.csv("mbti_1.csv", **_csv_options)

# Load the typed comments dataset (typed_comments.csv)
df_comments_typed = spark.read.csv("typed_comments.csv", **_csv_options)

# Report the raw size of the comments dataset (this triggers a full scan).
print(f"The DataFrame has {df_comments_typed.count()} rows.")




The DataFrame has 22934562 rows.


                                                                                

In [37]:

# UDF to clean and filter posts
def clean_and_filter_posts(post):
    """Drop short posts and strip links from a '|||'-separated post string.

    The input is split on '|||'. A post is kept only if it has at least five
    whitespace-separated words (links are counted as words for this check).
    From every kept post, words starting with 'http' are removed and the
    remaining words are re-joined with single spaces.

    Args:
        post: The raw posts string, or None when the source cell is null
            (Spark passes nulls to Python UDFs as None).

    Returns:
        The kept, cleaned posts joined with '|||' (an empty string when no
        post qualifies), or None when ``post`` is None.
    """
    # A null cell would otherwise raise AttributeError on .split and fail the job.
    if post is None:
        return None

    cleaned_posts = []
    for raw_post in post.split('|||'):
        words = raw_post.split()
        if len(words) < 5:  # Keep posts with 5 or more words
            continue
        # Remove links
        cleaned_posts.append(' '.join(w for w in words if not w.startswith('http')))
    return '|||'.join(cleaned_posts)

# Wrap the cleaner as a string-returning Spark UDF, then overwrite the
# `posts` column of df_mbti_1 with its cleaned version.
clean_posts_udf = udf(clean_and_filter_posts, StringType())
df_cleaned = df_mbti_1.withColumn("posts", clean_posts_udf(col("posts")))


# UDF to combine posts under a word budget (default 2560 words).
# NOTE(review): a tokenizer emits at least one token per word, so 2560 words
# cannot fit in a 512-token window; the downstream tokenizer has to truncate.
# Pass a smaller `max_length` if the combined text should fit without truncation.
def combine_posts(post, max_length=2560):
    """Concatenate '|||'-separated posts, longest first, within a word budget.

    Posts are ordered by character length (descending) and appended, separated
    by single spaces, while the running word count stays within ``max_length``.
    Processing stops at the first post that would exceed the budget; shorter
    posts after it are not considered.

    Args:
        post: The '|||'-separated posts string, or None when the source cell
            is null (Spark passes nulls to Python UDFs as None).
        max_length: Maximum number of whitespace-separated words in the result.

    Returns:
        The combined text with surrounding whitespace stripped, or None when
        ``post`` is None.
    """
    # A null cell would otherwise raise AttributeError on .split and fail the job.
    if post is None:
        return None

    selected = []
    word_total = 0  # running count; avoids re-splitting the accumulated text each pass
    for p in sorted(post.split('|||'), key=len, reverse=True):
        n_words = len(p.split())
        if word_total + n_words > max_length:
            break
        selected.append(p)
        word_total += n_words

    return " ".join(selected).strip()

# Wrap combine_posts as a string-returning Spark UDF and store its result
# for every row of df_cleaned in a new `post_combined` column.
combine_posts_udf = udf(combine_posts, StringType())
df_combined = df_cleaned.withColumn("post_combined", combine_posts_udf(col("posts")))

# Training only needs the label and the combined text; labels are lower-cased.
df_final = (
    df_combined
    .select("type", "post_combined")
    .withColumn("type", lower(col("type")))
)

# Persist the processed dataset, replacing any previous output at this path.
df_final.write.csv("processed_mbti_dataset.csv", header=True, mode="overwrite")


+----+--------------------+
|type|       post_combined|
+----+--------------------+
|infj|Prozac, wellbruti...|
|entp|Well charlie I wi...|
|intp|So-called Ti-Si l...|
|intj|Very seriously to...|
|entj|I probably would ...|
|intj|There are as many...|
|infj|I think this suck...|
|intj|sometimes i look ...|
|infj|Help or a voice o...|
|intp|definitely Walter...|
|infj|The dirty people ...|
|enfj|I went through a ...|
|infj|As an Fe user, I ...|
|intj|'Fair enough, if ...|
|intp|I enjoy that all ...|
|intp|I saw that earlie...|
|infj|My senses are all...|
|infp|I appreciated tha...|
|infj|I empathize with ...|
|infp|At this, Job got ...|
+----+--------------------+
only showing top 20 rows



                                                                                

In [33]:

# Valid MBTI types (lowercase)
valid_types = ["intj", "intp", "entj", "entp", "infj", "infp", "enfj", "enfp",
               "istj", "isfj", "estj", "esfj", "istp", "isfp", "estp", "esfp"]


# Filter rows.
# NOTE: this cell rebinds df_comments_typed to a two-column frame, so running
# it a second time without re-loading the CSV fails: `lang` and `word_count`
# are no longer present after the final select.
df_comments_typed = (
    df_comments_typed  # was misspelled `df_comments_typedc`, which raises NameError
    .filter(col("comment").isNotNull() & (col("comment") != ""))  # Non-empty comments
    # Normalise the label before filtering so that e.g. "INTJ" and "intj" do
    # not end up as separate classes downstream (matches df_final's lower-casing).
    .withColumn("type", lower(col("type")))
    .filter(col("type").isin(valid_types))  # Valid MBTI types
    .filter(col("lang") == "en")  # English comments only
    .filter(col("word_count") > 5)  # More than 5 words
    .select("type", "comment")  # Keep only what training needs
)

# Number of comments that survive the filters (triggers a full scan).
print(df_comments_typed.count())




11773174


                                                                                

In [46]:
! pip install scikit-learn
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from transformers import DistilBertTokenizer
import torch
from sklearn.model_selection import train_test_split
from pyspark.ml.feature import StringIndexer, OneHotEncoder

from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Drop "type_indexed" and "type_encoded" columns if they already exist
for col_name in ["type_indexed", "type_encoded"]:
    if col_name in df_comments_typed.columns:
        df_comments_typed = df_comments_typed.drop(col_name)

# Apply StringIndexer
indexer = StringIndexer(inputCol="type", outputCol="type_indexed", handleInvalid="keep")
df_comments_typed = indexer.fit(df_comments_typed).transform(df_comments_typed)

# Apply OneHotEncoder
encoder = OneHotEncoder(inputCol="type_indexed", outputCol="type_encoded")
df_comments_typed = encoder.fit(df_comments_typed).transform(df_comments_typed)

# Convert Spark DataFrame to Pandas
df_pandas = df_comments_typed.select("comment", "type_encoded").limit(5000).toPandas()
df_pandas["type_encoded"] = df_pandas["type_encoded"].apply(lambda x: x.toArray())
df_pandas["class_label"] = df_pandas["type_encoded"].apply(lambda x: x.argmax())

# Train-Test Split
train_df, test_df = train_test_split(df_pandas, test_size=0.1, stratify=df_pandas["class_label"], random_state=42)

# Tokenization
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize_text(df): 
    return tokenizer(df["comment"].tolist(), truncation=True, padding=True, max_length=128, return_tensors="pt")

train_encodings, test_encodings = map(tokenize_text, [train_df, test_df])
import numpy as np
import torch

# Convert Pandas arrays to proper NumPy float32 before creating Torch tensors
train_labels, test_labels = map(
    lambda df: torch.tensor(np.stack(df["type_encoded"].values).astype(np.float32)), 
    [train_df, test_df]
)

# Save datasets
for name, enc, lbl in [("train", train_encodings, train_labels), ("test", test_encodings, test_labels)]:
    torch.save({"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"], "labels": lbl}, f"{name}_data.pth", _use_new_zipfile_serialization=False)

print("✅ Datasets saved as train_data.pth & test_data.pth")




                                                                                

✅ Datasets saved as train_data.pth & test_data.pth


In [6]:
# Introvert / extrovert separation