In [92]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import string
import nltk
from nltk.stem import PorterStemmer

In [93]:
# Install necessary libraries
!pip install -U scikit-learn
!pip install xgboost
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [94]:
# Upload your GoEmotions dataset to Colab (replace 'your_dataset.csv' with your actual dataset file)
from google.colab import files

uploaded = files.upload()

Saving go_emotions_dataset.csv to go_emotions_dataset.csv


In [95]:
# Read the uploaded dataset
import io
df = pd.read_csv(io.BytesIO(uploaded['go_emotions_dataset.csv']))

In [96]:
# Assuming 'text' column and emotions start from the 4th column
X = df['text']
y = df.iloc[:, 3:]  # Select columns starting from the 4th column

In [97]:
# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y.idxmax(axis=1))  # Encode the emotion with the highest value

In [98]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [99]:
# Custom transformer for text preprocessing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

    def fit(self, X, y=None):
        # Fit the TF-IDF vectorizer during the fit step
        self.tfidf_vectorizer.fit(X)
        return self

    def transform(self, X):
        # Tokenization
        tokens = X.apply(nltk.word_tokenize)

        # Stop Word Removal
        stop_words = set(nltk.corpus.stopwords.words('english'))
        tokens = tokens.apply(lambda token_list: [token for token in token_list if token.lower() not in stop_words])

        # Remove Punctuation
        tokens = tokens.apply(lambda token_list: [token for token in token_list if token not in string.punctuation])

        # Word Stemming
        tokens = tokens.apply(lambda token_list: [self.stemmer.stem(token) for token in token_list])

        # Vectorization (TF-IDF)
        vectorized_text = self.tfidf_vectorizer.transform(X)

        return vectorized_text

In [100]:
# Train XGBoost model
xgb_model = XGBClassifier(objective='multi:softmax', num_class=len(label_encoder.classes_))

In [101]:
# Create a pipeline for the entire process
pipeline = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('classifier', xgb_model)
])

In [102]:
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

In [103]:
# Make predictions on the test set
y_pred = pipeline.predict(X_test)

In [104]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.4033


In [112]:
# Example: Predict emotion for a new text
new_text = ""

# Preprocess the new text using the transform method
new_text_preprocessed = pipeline.named_steps['preprocessor'].transform(pd.Series([new_text]))

# Make predictions on the preprocessed text
new_text_vectorized = pipeline.named_steps['classifier'].predict(new_text_preprocessed)

# Convert the predicted label to the corresponding emotion class
predicted_emotion = label_encoder.classes_[new_text_vectorized[0]]
print(f"Predicted Emotion: {predicted_emotion}")

Predicted Emotion: neutral


In [117]:
# Example: Predict top N emotions for a new text
new_text = "I feel really excited about this project!"

# Preprocess the new text using the transform method
new_text_preprocessed = pipeline.named_steps['preprocessor'].transform(pd.Series([new_text]))

# Get the top N predicted emotions along with their probabilities
N =28
top_n_probs = pipeline.named_steps['classifier'].predict_proba(new_text_preprocessed)
top_n_emotions_indices = top_n_probs.argsort(axis=1)[:, -N:][:, ::-1]

# Convert the predicted indices to the corresponding emotion classes and their probabilities
top_n_results = [(label_encoder.classes_[i], top_n_probs[0, i]) for i in top_n_emotions_indices[0]]

# Display the top N predicted emotions
print(f"Top {N} Predicted Emotions:")
for emotion, probability in top_n_results:
    print(f"Emotion: {emotion}, Probability: {probability:.4f}")


Top 28 Predicted Emotions:
Emotion: excitement, Probability: 0.7368
Emotion: neutral, Probability: 0.0791
Emotion: approval, Probability: 0.0223
Emotion: admiration, Probability: 0.0175
Emotion: annoyance, Probability: 0.0169
Emotion: curiosity, Probability: 0.0124
Emotion: disappointment, Probability: 0.0121
Emotion: disapproval, Probability: 0.0114
Emotion: caring, Probability: 0.0102
Emotion: desire, Probability: 0.0099
Emotion: anger, Probability: 0.0089
Emotion: confusion, Probability: 0.0083
Emotion: realization, Probability: 0.0076
Emotion: sadness, Probability: 0.0073
Emotion: optimism, Probability: 0.0049
Emotion: joy, Probability: 0.0047
Emotion: amusement, Probability: 0.0044
Emotion: fear, Probability: 0.0039
Emotion: nervousness, Probability: 0.0037
Emotion: disgust, Probability: 0.0036
Emotion: surprise, Probability: 0.0034
Emotion: embarrassment, Probability: 0.0020
Emotion: gratitude, Probability: 0.0019
Emotion: relief, Probability: 0.0019
Emotion: love, Probability: 0