In [None]:
# Install the standalone vaderSentiment package.
# NOTE(review): the sentiment cell later in this notebook imports VADER from
# nltk.sentiment.vader rather than from this package, so this install may be
# unused — confirm before removing.
!pip install vaderSentiment

In [None]:
# Install pyLDAvis pinned to 3.4.1 (used for the interactive topic view).
# NOTE(review): a later cell also runs `pip install ... pyLDAvis` without a
# version pin; consider consolidating the two installs — TODO confirm the pin
# is still required.
!pip install pyLDAvis==3.4.1

In [None]:
# Place the Kaggle API token under ~/.kaggle/.
# Assumes kaggle.json is already present in the current working directory
# (e.g. uploaded manually) — on a re-run the mv will report that the file is
# missing because it has already been moved.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Extract the dataset archive, looked up relative to the current working
# directory.
# NOTE(review): the following cell moves this same archive from / into
# /content, which suggests that cell must run before this one on a fresh
# kernel — confirm the intended order. No step that downloads the archive is
# visible in this notebook.
!unzip mbti-personality-type-twitter-dataset.zip

In [None]:
# Move the dataset archive from the filesystem root into /content.
# NOTE(review): the preceding cell already tries to unzip this archive via a
# relative path; if the working directory is /content, this move has to happen
# first — confirm and reorder if so.
!mv /mbti-personality-type-twitter-dataset.zip /content/

In [None]:
# Install the NLP / topic-modelling dependencies used in this cell.
# No versions are pinned here (an earlier cell pins pyLDAvis==3.4.1).
!pip install nltk gensim pyLDAvis
import nltk
# Fetch the NLTK resources needed by stopwords.words() and WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
import pandas as pd
import re
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Read the MBTI tweet dataset from its fixed location under /content
data = pd.read_csv('/content/twitter_MBTI.csv')

# Filipino function words to filter out in addition to the English list
# (extend this set as needed)
filipino_stop_words = {
    'ako', 'ikaw', 'siya', 'tayo', 'kayo', 'sila',
    'ang', 'ng', 'sa', 'mga', 'ay', 'na', 'at',
    'ni', 'para', 'kay', 'rin', 'din', 'dito',
    'doon', 'kanya', 'kanila',
}

# Combined stop-word set: NLTK's English list plus the Filipino additions
stop_words = set(stopwords.words('english')) | filipino_stop_words

# Shared lemmatizer instance used by the text-cleaning step
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Convert one raw text value into a list of cleaned, lemmatized tokens.

    Processing order: remove URLs, remove @mentions, replace every remaining
    punctuation character with a space, lowercase, split on whitespace, drop
    tokens of length <= 2 and stop words, then lemmatize what is left.

    Args:
        text: Raw text. Anything that is not a ``str`` (for example the NaN
            that pandas uses for an empty CSV cell, or ``None``) is treated
            as having no content.

    Returns:
        list[str]: The surviving lemmatized tokens; empty when nothing
        survives or when ``text`` is not a string.
    """
    # re.sub raises TypeError on non-strings, so bail out early instead of
    # letting a single missing cell abort the whole .apply().
    if not isinstance(text, str):
        return []

    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    # Replace punctuation with a space rather than deleting it, so words that
    # are separated only by punctuation (e.g. "end|||start", "yes,no") are not
    # fused into a single bogus token.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = text.lower()  # Lowercase
    tokens = text.split()  # Tokenize
    # Cheap length test first, then the stop-word lookup, then lemmatize.
    return [lemmatizer.lemmatize(token) for token in tokens
            if len(token) > 2 and token not in stop_words]

# Tokenize every row's text; each entry of `cleaned_tokens` is a list of tokens
data['cleaned_tokens'] = data['text'].apply(clean_text)

# Create dictionary and corpus
dictionary = corpora.Dictionary(data['cleaned_tokens'])
# Drop tokens that occur in fewer than 5 documents or in more than 50% of them
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in data['cleaned_tokens']]

# Build LDA model
num_topics = 3
# Fixed seed: LDA training is randomised, so without it the topics (and every
# table/plot derived from them) change on each run of the notebook.
LDA_SEED = 42
lda_model = LdaModel(corpus=corpus,
                     id2word=dictionary,
                     num_topics=num_topics,
                     passes=15,
                     alpha=0.1,
                     eta=0.005,
                     random_state=LDA_SEED)

# Visualize topics using pyLDAvis
vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

In [None]:
# Expose the 'label' column as 'type'; the grouping below and the later
# sentiment cell both group on 'type'. Re-running is harmless: once the column
# has been renamed, this rename finds nothing to change.
data = data.rename(columns={'label': 'type'})

# One column per LDA topic, in topic-id order, so the table always contains
# every topic and its column order does not depend on which topics happen to
# show up in the first documents.
topic_columns = [f'topic_{i}' for i in range(lda_model.num_topics)]

topic_distribution = []
for mbti_type, tokens in zip(data['type'], data['cleaned_tokens']):
    bow = dictionary.doc2bow(tokens)
    # Start every topic at 0.0: get_document_topics omits topics whose
    # probability falls below its minimum threshold, and those count as 0.
    record = {'type': mbti_type, **dict.fromkeys(topic_columns, 0.0)}
    record.update({f'topic_{i}': prob
                   for i, prob in lda_model.get_document_topics(bow)})
    topic_distribution.append(record)

# Records are already complete, so no fillna is needed. This also avoids
# rewriting a missing 'type' to 0; such rows are simply left out by groupby.
topic_mbti_df = pd.DataFrame(topic_distribution, columns=['type'] + topic_columns)
topic_mbti_df = topic_mbti_df.groupby('type').mean()  # Average topic probabilities for each MBTI type

print(topic_mbti_df)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the per-type average topic probabilities held in topic_mbti_df
fig, ax = plt.subplots(figsize=(12, 6))  # Adjust figure size as needed
sns.heatmap(topic_mbti_df, annot=True, fmt=".3f", cmap="YlGnBu", ax=ax)
ax.set_title("MBTI Type and Topic Association")
ax.set_xlabel("Topic")
ax.set_ylabel("MBTI Type")
plt.show()

In [None]:
!pip install nltk
import nltk
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

data['sentiment'] = data['text'].apply(lambda text: analyzer.polarity_scores(text)['compound'])

# Group by MBTI type and calculate average sentiment
sentiment_by_mbti = data.groupby('type')['sentiment'].mean()

# Print the results
print(sentiment_by_mbti)

# Visualize using a bar plot
import matplotlib.pyplot as plt
sentiment_by_mbti.plot(kind='bar', figsize=(10, 6))
plt.title('Average Sentiment by MBTI Type using VADER')
plt.xlabel('MBTI Type')
plt.ylabel('Average Sentiment Score')
plt.show()