<a href="https://colab.research.google.com/github/MaddiSathwika/PersonalityPrediction/blob/main/GRU%2BLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so the dataset path below resolves.
drive.mount('/content/drive')

# Read the MBTI CSV into a DataFrame and preview its first rows.
data = pd.read_csv(filepath_or_buffer='/content/drive/My Drive/MBTI_500.csv')
data.head(n=5)


Mounted at /content/drive


Unnamed: 0,posts,type
0,know intj tool use interaction people excuse a...,INTJ
1,rap music ehh opp yeah know valid well know fa...,INTJ
2,preferably p hd low except wew lad video p min...,INTJ
3,drink like wish could drink red wine give head...,INTJ
4,space program ah bad deal meing freelance max ...,INTJ


In [None]:
# Model input is the post text; the target is the personality type label.
X, y = data['posts'], data['type']

In [None]:
import joblib

In [None]:
# Map each personality type string to an integer class id.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Persist the fitted encoder so predictions can be mapped back to type names later,
# then read it straight back to confirm the saved file is loadable.
joblib.dump(value=label_encoder, filename='label_encoder1.joblib')
loaded_label_encoder = joblib.load(filename='label_encoder1.joblib')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit the tokenizer on the training texts only; num_words caps the vocabulary and
# '<OOV>' is the placeholder token configured for out-of-vocabulary words.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Convert every post into a list of integer token ids.
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad (at the end of each sequence) up to the longest sequence seen in either split.
max_length = max(
    max(map(len, X_train_sequences)),
    max(map(len, X_test_sequences)),
)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post')


In [None]:
# One-hot encode the target variable.
# num_classes is pinned to the encoder's class count: without it, to_categorical
# infers the width from max(label) + 1 of each array separately, so a split that
# happens to lack the highest class id would yield a matrix narrower than the
# softmax layer (which is sized with len(label_encoder.classes_)).
y_train_one_hot = to_categorical(y_train, num_classes=len(label_encoder.classes_))
y_test_one_hot = to_categorical(y_test, num_classes=len(label_encoder.classes_))

In [None]:
# LSTM classifier: embedding -> two stacked LSTM layers -> softmax over the personality types.
lstm_model = Sequential([
    # input_dim matches the tokenizer's num_words; input_length matches the padded width.
    Embedding(input_dim=5000, output_dim=50, input_length=max_length),
    # The first recurrent layer returns the full sequence so the second one can consume it.
    LSTM(units=50, return_sequences=True),
    LSTM(units=50),
    Dense(units=len(label_encoder.classes_), activation='softmax'),
])
lstm_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# 20% of the training rows are set aside by Keras for validation during fitting.
lstm_model.fit(
    X_train_padded,
    y_train_one_hot,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/10

KeyboardInterrupt: ignored

In [None]:
# Score the held-out posts with the LSTM model, then take the highest-scoring
# class index for each row.
lstm_predictions = lstm_model.predict(X_test_padded)
lstm_predictions_classes = lstm_predictions.argmax(axis=1)

In [None]:
# GRU classifier: embedding -> two stacked GRU layers (100 units each) -> softmax.
gru_model = Sequential([
    # input_dim matches the tokenizer's num_words; input_length matches the padded width.
    Embedding(input_dim=5000, output_dim=50, input_length=max_length),
    # The first recurrent layer returns the full sequence so the second one can consume it.
    GRU(units=100, return_sequences=True),
    GRU(units=100),
    Dense(units=len(label_encoder.classes_), activation='softmax'),
])
gru_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# 20% of the training rows are set aside by Keras for validation during fitting.
gru_model.fit(
    X_train_padded,
    y_train_one_hot,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

In [None]:
# Score the held-out posts with the GRU model, then take the highest-scoring
# class index for each row.
gru_predictions = gru_model.predict(X_test_padded)
gru_predictions_classes = gru_predictions.argmax(axis=1)

In [None]:
# Hybrid classifier: embedding -> LSTM (full sequence out) -> GRU -> softmax.
hybrid_model = Sequential([
    # input_dim matches the tokenizer's num_words; input_length matches the padded width.
    Embedding(input_dim=5000, output_dim=50, input_length=max_length),
    # The LSTM emits one vector per timestep, which the GRU then reduces to a single vector.
    LSTM(units=50, return_sequences=True),
    GRU(units=100, return_sequences=False),
    # Classification head: one probability per personality type.
    Dense(units=len(label_encoder.classes_), activation='softmax'),
])

hybrid_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train, holding out 20% of the training rows for validation.
hybrid_model.fit(
    X_train_padded,
    y_train_one_hot,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

# Score the held-out posts and keep the highest-scoring class index for each row.
hybrid_predictions = hybrid_model.predict(X_test_padded)
hybrid_predictions_classes = hybrid_predictions.argmax(axis=1)

In [None]:
# Metrics for the hybrid ("combined") model are computed up front and kept as named results.
combined_accuracy = accuracy_score(y_test, hybrid_predictions_classes)
combined_classification_report = classification_report(
    y_test,
    hybrid_predictions_classes,
    target_names=label_encoder.classes_,
)

# The LSTM and GRU sections share one layout, so they are printed from a single loop.
for heading, predicted in (
    ("LSTM Model Results:", lstm_predictions_classes),
    ("\nGRU Model Results:", gru_predictions_classes),
):
    print(heading)
    print(f"Accuracy: {accuracy_score(y_test, predicted)}")
    print(f"Classification Report:\n{classification_report(y_test, predicted, target_names=label_encoder.classes_)}")

print("\nCombined Model Results:")
print(f"Accuracy: {combined_accuracy}")
print(f"Classification Report:\n{combined_classification_report}")

In [None]:
import pickle

In [None]:
# Serialize the trained hybrid model to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling raises.
# NOTE(review): Keras provides its own model.save()/load_model() API; pickling a
# Keras model may depend on the installed Keras version -- confirm before relying on it.
with open('/content/Integrated_model', 'wb') as model_file:
    pickle.dump(hybrid_model, model_file)

In [None]:
# Read the pickled hybrid model back from disk. The context manager closes the
# file handle once loading finishes (or fails).
# NOTE(review): pickle.load can execute arbitrary code -- only load files that this
# notebook itself produced.
with open('/content/Integrated_model', 'rb') as model_file:
    model_loaded1 = pickle.load(model_file)

In [None]:
# le = LabelEncoder()
# le.classes_ = np.load('/content/label_encoder.joblib',allow_pickle=True)

In [None]:
# Reload the LabelEncoder used to turn predicted class indices back into type names.
# NOTE(review): the encoder was dumped earlier to the relative path
# 'label_encoder1.joblib'; this absolute path only points at the same file when the
# working directory is /content -- confirm.
loaded_label_encoder = joblib.load('/content/label_encoder1.joblib')

In [None]:
def predict_personality_type(new_posts, tokenizer, model, max_len, loaded_label_encoder):
    """Predict the personality type label for a single text.

    Args:
        new_posts: The text to classify. It is wrapped in a one-element list,
            so it is treated as exactly one sample.
        tokenizer: Object exposing ``texts_to_sequences``, used to convert the
            text into a sequence of token ids.
        model: Object exposing ``predict``; its output is reduced with
            ``argmax`` over axis 1, so one row of class scores per sample is expected.
        max_len: Length passed as ``maxlen`` when padding the token sequence
            (padding is added at the end).
        loaded_label_encoder: Object exposing ``inverse_transform``, used to map
            the winning class index back to its label.

    Returns:
        The label that ``loaded_label_encoder`` associates with the
        highest-scoring class for the given text.
    """
    # Tokenize the single text and pad it to the requested length.
    padded_batch = pad_sequences(
        tokenizer.texts_to_sequences([new_posts]),
        maxlen=max_len,
        padding='post',
    )

    # One row of scores comes back for the single sample; pick its best class.
    class_scores = model.predict(padded_batch)
    best_class = np.argmax(class_scores, axis=1)[0]

    # Translate the class index back into its label.
    return loaded_label_encoder.inverse_transform([best_class])[0]


In [None]:
new_posts = "arthritis really like outside like fact ni te loop go crazy love idea relationship come think provoke amphetamine like meth speed like cause get really smart write three lecture one night invent self detonate smoke"
# Reuse the padded width computed during preprocessing instead of rescanning every
# train/test sequence: the value is the same, and it is the width the models were
# built with (their Embedding input_length).
max_len = max_length
predicted_type = predict_personality_type(new_posts, tokenizer, model_loaded1, max_len, loaded_label_encoder)
print(f'Predicted Personality Type: {predicted_type}')

In [None]:
new_posts = "cram much test hand school picture anything take full body picture people usually mostly hide bore nothing special every person ever meet go lose everything love world would one want anything kind meantime r"
# Reuse the padded width computed during preprocessing instead of rescanning every
# train/test sequence: the value is the same, and it is the width the models were
# built with (their Embedding input_length).
max_len = max_length
predicted_type = predict_personality_type(new_posts, tokenizer, model_loaded1, max_len, loaded_label_encoder)
print(f'Predicted Personality Type: {predicted_type}')