In [1]:
!pip install emot
!pip install contractions

Collecting emot
  Downloading emot-3.1-py3-none-any.whl.metadata (396 bytes)
Downloading emot-3.1-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emot
Successfully installed emot-3.1
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (

In [2]:
import numpy as np
import pandas as pd
import re
import emot
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.utils import tokenize as gensim_tokenize
import contractions
import pickle

# Configuration
pd.options.display.max_rows = 250
pd.options.display.max_colwidth = 200
contractions.add('e.g.', 'for example')
contractions.add('T.G.I.F.', 'thank God it is Friday')


In [3]:
# Emoji and emoticon conversion
UNICODE_EMO = {v: k[1:-1] for v, k in emot.UNICODE_EMOJI.items()}

def convert_emojis(text):
    for emoji, meaning in UNICODE_EMO.items():
        text = text.replace(emoji, meaning)
    return text

def convert_emoticons(text):
    for emoticon, meaning in emot.EMOTICONS_EMO.items():
        emoticon = re.escape(emoticon)
        text = re.sub(emoticon, meaning, text)
    return text

# Text preprocessing
def process_and_tokenize(text):
    """
    1. Replace URLs in text
    2. Replace emojis and emoticons with their meanings
    3. Replace contractions with expanded forms
    4. Tokenize text
    """
    text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 'url', text).strip()
    text = convert_emojis(text)
    text = convert_emoticons(text)
    text = contractions.fix(text)
    return list(gensim_tokenize(text, lower=True))

# Preprocess a new input text
def preprocess_text(text, vocab):
    tokenized = process_and_tokenize(text)
    token_idxs = [vocab[word] for word in tokenized if word in vocab]
    token_idxs_padded = pad_sequences([token_idxs], maxlen=40, value=0., padding='pre', truncating='pre')
    return token_idxs_padded

# Load vocabulary and models
with open('/kaggle/input/my-vocab/vocab.pkl', 'rb') as pkl_file:
    vocab = pickle.load(pkl_file)


In [4]:
# Predict MBTI type
one_slice = ['I', 'S', 'F', 'J']  # MBTI indicators
opposite = ['E', 'N', 'T', 'P']   # Opposite indicators

In [5]:
model_1 = tf.keras.models.load_model('/kaggle/input/all_model/tensorflow2/default/1/GloVe_GRU.h5')
model_2 = tf.keras.models.load_model('/kaggle/input/all_model/tensorflow2/default/1/GloVe_LSTM.h5')
model_3 = tf.keras.models.load_model('/kaggle/input/all_model/tensorflow2/default/1/Word2Vec_GRU.h5')
model_4 = tf.keras.models.load_model('/kaggle/input/all_model/tensorflow2/default/1/Word2Vec_LSTM.h5')



In [6]:
text = "read"
processed_text = preprocess_text(text, vocab)

In [7]:
# Dictionary to store model predictions and their confidence scores
predictions = {}

for i, model in enumerate([model_1, model_2, model_3, model_4], 1):
    # Get prediction probabilities
    prediction = model.predict(processed_text)[0]
    
    # Calculate confidence score as the mean probability across all four MBTI dimensions
    confidence_score = np.mean(prediction)
    
    # Determine MBTI type based on probability thresholds (0.5)
    mbti_type = ''.join([one if prob > 0.5 else opp for one, opp, prob in zip(one_slice, opposite, prediction)])
    
    # Store the MBTI type and its confidence score
    predictions[mbti_type] = confidence_score
    print(f"Model {i} - Predicted MBTI type: {mbti_type} with confidence: {confidence_score:.4f}")

# Find the MBTI type with the highest confidence score
best_mbti_type = max(predictions, key=predictions.get)
best_confidence = predictions[best_mbti_type]

# Output the best MBTI type with the highest confidence
print(f"\nMBTI type with the highest confidence: {best_mbti_type} (Confidence: {best_confidence:.4f})")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 388ms/step
Model 1 - Predicted MBTI type: ISFJ with confidence: 0.6840
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 277ms/step
Model 2 - Predicted MBTI type: ISTJ with confidence: 0.6660
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 302ms/step
Model 3 - Predicted MBTI type: ISTJ with confidence: 0.6790
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 276ms/step
Model 4 - Predicted MBTI type: ISTJ with confidence: 0.6761

MBTI type with the highest confidence: ISFJ (Confidence: 0.6840)


In [8]:
import numpy as np
import tensorflow as tf
import pandas as pd

def analyze_model_predictions(model, processed_text, one_slice=['I', 'S', 'F', 'J'], opposite=['E', 'N', 'T', 'P']):
    """
    Analyze the raw predictions of the model for each dimension.
    """
    # Get raw predictions
    predictions = model.predict(processed_text)[0]
    
    # Analyze each dimension
    for dim_idx, (one, opp, pred) in enumerate(zip(one_slice, opposite, predictions)):
        print(f"\nDimension {dim_idx + 1} ({one}/{opp}):")
        print(f"Raw prediction value: {pred:.4f}")
        print(f"Predicted class: {one if pred > 0.5 else opp}")
        print(f"Confidence: {max(pred, 1-pred):.4f}")

def test_model_with_diverse_inputs(model, vocab, texts):
    """
    Test the model with diverse inputs to check for variety in predictions.
    """
    results = []
    for text in texts:
        processed = preprocess_text(text, vocab)
        pred = model.predict(processed)[0]
        mbti = ''.join([one if p > 0.5 else opp 
                       for one, opp, p in zip(one_slice, opposite, pred)])
        results.append({
            'text': text,
            'prediction': mbti,
            'raw_values': pred
        })
    return pd.DataFrame(results)

# Example usage:
print("Detailed analysis of Model 1:")
analyze_model_predictions(model_1, processed_text)

# Test with diverse inputs
test_texts = [
    "I love spending time alone reading books and thinking deeply",  # Introverted
    "I enjoy parties and meeting new people",                       # Extroverted
    "I focus on concrete facts and details",                        # Sensing
    "I enjoy thinking about abstract theories",                     # Intuitive
    "I make decisions based on logic and analysis",                # Thinking
    "I make decisions based on feelings and values"                # Feeling
]

results_df = test_model_with_diverse_inputs(model_1, vocab, test_texts)
print("\nResults with diverse inputs:")
print(results_df[['text', 'prediction']])

# Add model inspection
print("\nModel Architecture:")
model_1.summary()

# Check output layer activation
output_layer = model_1.layers[-1]
print(f"\nOutput layer activation: {output_layer.activation.__name__}")

Detailed analysis of Model 1:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step

Dimension 1 (I/E):
Raw prediction value: 0.7594
Predicted class: I
Confidence: 0.7594

Dimension 2 (S/N):
Raw prediction value: 0.8494
Predicted class: S
Confidence: 0.8494

Dimension 3 (F/T):
Raw prediction value: 0.5121
Predicted class: F
Confidence: 0.5121

Dimension 4 (J/P):
Raw prediction value: 0.6153
Predicted class: J
Confidence: 0.6153
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step

Results with diverse inputs:
                                                           text prediction
0  I love spe


Output layer activation: sigmoid
