# Step 0: Importing packages

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
import os
import glob
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import zipfile
from google.colab import drive

# Step 1: Initialize tokenizer

In [None]:
# Tokenizer for the 'bert-base-cased' checkpoint; build_model() in this file
# loads the same checkpoint name, so the two must be kept in sync.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Step 2: Load and prepare the dataset from a zip file or an Excel sheet

## From Zip File

In [None]:
def load_emotion_dataset(zip_path, extract_path='/content/drive/MyDrive/EmotionClassText/'):
    """Load the emotion corpus from a zip archive of per-emotion text folders.

    The archive is extracted to ``extract_path`` only if that path does not
    exist yet. Each ``<extract_path>/<emotion>/*.txt`` file becomes one row
    whose ``Phrase`` is the stripped UTF-8 file content.

    Args:
        zip_path: Path of the zip archive; only opened when ``extract_path``
            does not exist.
        extract_path: Directory holding (or receiving) the extracted folders.

    Returns:
        ``(df, emotions)`` where ``df`` has the columns ``Phrase``,
        ``Emotion`` and ``Label`` (integer index of the emotion inside
        ``emotions``), and ``emotions`` is the ordered list of class names.
        If no text files are found, ``df`` is empty but still has all three
        columns.
    """
    # Check if extraction folder already exists
    if not os.path.exists(extract_path):
        print(f"Extracting zip file to {extract_path}...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
    else:
        print(f"Extraction folder {extract_path} already exists, skipping extraction.")

    emotions = ['surprise', 'sad', 'neutral', 'happy', 'fear', 'disgust', 'anger']
    data = []

    for emotion in emotions:
        folder_path = os.path.join(extract_path, emotion)
        if not os.path.exists(folder_path):
            print(f"Warning: Folder for {emotion} not found in extracted zip")
            continue
        # glob returns files in arbitrary order; sort so the row order (and
        # therefore any later split) is reproducible between runs.
        for file_path in sorted(glob.glob(os.path.join(folder_path, '*.txt'))):
            with open(file_path, 'r', encoding='utf-8') as f:
                data.append({'Phrase': f.read().strip(), 'Emotion': emotion})

    # Explicit columns keep the frame well-formed when nothing was found;
    # otherwise the 'Emotion' lookup below raises KeyError on an empty frame.
    df = pd.DataFrame(data, columns=['Phrase', 'Emotion'])
    # Map emotions to numerical labels. Every row's emotion comes from
    # `emotions`, so the mapping never yields NaN and the int cast is safe.
    emotion_to_idx = {emotion: idx for idx, emotion in enumerate(emotions)}
    df['Label'] = df['Emotion'].map(emotion_to_idx).astype('int64')
    return df, emotions

## From Excel Sheet

In [None]:
def load_emotion_dataset(excel_path):
    """Load the emotion corpus from an Excel sheet.

    The sheet must provide a ``Phrase`` and an ``Emotion`` column. Rows whose
    emotion is not one of the expected class names are reported once and
    then dropped.

    Args:
        excel_path: Path of the Excel workbook to read.

    Returns:
        ``(df, classes)`` where ``df`` is the sheet content plus an integer
        ``Label`` column (index of the emotion inside ``classes``), and
        ``classes`` is the ordered list of class names.

    Raises:
        FileNotFoundError: If ``excel_path`` does not exist.
        ValueError: If the ``Phrase`` or ``Emotion`` column is missing.
    """
    # Read Excel file
    if not os.path.exists(excel_path):
        raise FileNotFoundError(f"Excel file not found at {excel_path}")

    df = pd.read_excel(excel_path)

    # Verify required columns
    if 'Phrase' not in df.columns or 'Emotion' not in df.columns:
        raise ValueError("Excel file must contain 'Phrase' and 'Emotion' columns")

    # Define emotions in the specified order
    expected_emotions = ['surprise', 'sad', 'neutral', 'happy', 'fear', 'disgust', 'anger']

    # Validate emotions: anything outside the expected set is reported
    unexpected = set(df['Emotion'].unique()) - set(expected_emotions)
    if unexpected:
        print(f"Warning: Found unexpected emotions in dataset: {unexpected}")

    # Use expected emotions as classes to maintain consistent order
    classes = expected_emotions

    # Map emotions to numerical labels (unexpected emotions map to NaN)
    emotion_to_idx = {emotion: idx for idx, emotion in enumerate(classes)}
    df['Label'] = df['Emotion'].map(emotion_to_idx)

    # Drop rows with missing labels, then restore an integer dtype: the NaN
    # placeholders force the column to float, and float labels cannot be used
    # as array indices when the one-hot targets are built.
    df = df.dropna(subset=['Label']).astype({'Label': 'int64'})

    return df, classes

# Step 3: Mount Google Drive and load dataset

## Mounting Google Drive

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used by the loading
# cells in this file all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

## Loading Data from Google Drive

### From Zip File

In [None]:
# Load the corpus from the zip archive on Drive.
# NOTE(review): this file defines load_emotion_dataset twice (zip and Excel
# variants); the definition executed last is the one called here, so run the
# zip variant's definition cell right before this cell.
zip_path = '/content/drive/MyDrive/EmotionClassText.zip'
df, classes = load_emotion_dataset(zip_path)

### From Excel Sheet

In [None]:
# Load the corpus from the Excel sheet on Drive.
# NOTE(review): this file defines load_emotion_dataset twice (zip and Excel
# variants); the definition executed last is the one called here, so run the
# Excel variant's definition cell right before this cell.
excel_path = '/content/drive/MyDrive/EmotionText.xlsx'
df, classes = load_emotion_dataset(excel_path)

## Ensuring Data is loaded and checking dataset

In [None]:
# Sanity-check the loaded frame: schema, first rows, class balance, full dump.
df.info()
print(df.head())
# value_counts() only returns a Series; it is not the last expression of the
# cell, so it has to be printed explicitly for the class balance to show up.
print(df['Emotion'].value_counts())
print(df)

# Step 4: Data Preprocessing and tokenization functions

## Preparing Data

In [None]:
def prepare_data(text, tokenizer):
    """Tokenize one text into fixed-length model inputs.

    The text is encoded with special tokens, truncated and padded to a
    length of 256, and returned as TensorFlow tensors.

    Args:
        text: The phrase to encode.
        tokenizer: Tokenizer object exposing ``encode_plus``.

    Returns:
        Dict with the keys ``'input_ids'`` and ``'attention_mask'``, both
        cast to ``tf.int32``.
    """
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )
    ids = tf.cast(encoded.input_ids, tf.int32)
    mask = tf.cast(encoded.attention_mask, tf.int32)
    return {'input_ids': ids, 'attention_mask': mask}

## Generating Training Data

In [None]:
def generate_training_data(df, tokenizer, num_classes=7):
    """Tokenize every phrase of ``df`` and build one-hot label targets.

    Args:
        df: DataFrame with a ``Phrase`` column (texts) and a ``Label`` column
            (class indices).
        tokenizer: Tokenizer forwarded to ``prepare_data``.
        num_classes: Width of the one-hot label matrix. Defaults to 7, the
            number of emotion classes used throughout this file.

    Returns:
        ``(input_ids, attn_masks, labels)`` as int32 NumPy arrays of shape
        ``(len(df), 256)``, ``(len(df), 256)`` and ``(len(df), num_classes)``.

    Raises:
        ValueError: If the ``Label`` column contains missing values.
    """
    n_rows = len(df)
    input_ids = np.zeros((n_rows, 256), dtype=np.int32)
    attn_masks = np.zeros((n_rows, 256), dtype=np.int32)

    for i, text in tqdm(enumerate(df['Phrase']), total=n_rows):
        tokenized = prepare_data(text, tokenizer)
        input_ids[i, :] = tokenized['input_ids']
        attn_masks[i, :] = tokenized['attention_mask']

    raw_labels = df['Label'].to_numpy()
    if pd.isna(raw_labels).any():
        raise ValueError("Column 'Label' contains missing values; drop or fix those rows first")
    # Labels can arrive as float (a column that once held NaN) or object dtype;
    # NumPy only accepts integer arrays as indices, so cast explicitly.
    label_idx = raw_labels.astype(np.int64)

    labels = np.zeros((n_rows, num_classes), dtype=np.int32)
    labels[np.arange(n_rows), label_idx] = 1
    return input_ids, attn_masks, labels

# Step 5: Processing Data

In [None]:
# Tokenize the whole frame once; the resulting arrays feed the tf.data pipeline.
input_ids, attn_masks, labels = generate_training_data(df, tokenizer)

# Step 6: Creating TensorFlow dataset for efficient training

In [None]:
# Build (features dict, one-hot label) pairs keyed by the model's input names.
dataset = tf.data.Dataset.from_tensor_slices((input_ids, attn_masks, labels))
dataset = dataset.map(lambda x, y, z: ({'input_ids': x, 'attention_mask': y}, z))
# This dataset is later split with take()/skip(). By default shuffle() draws a
# new order on every iteration (every epoch), which would move batches between
# the training and validation parts from one epoch to the next. Fix the order
# once instead. The buffer covers the whole dataset so the single shuffle is a
# full one even when there are more than 10000 rows. If per-epoch reshuffling
# of the training data is wanted, apply it to the training split only.
dataset = dataset.shuffle(
    buffer_size=max(len(input_ids), 1),
    reshuffle_each_iteration=False,
).batch(16, drop_remainder=True)

# Step 7: Split data into training and validation sets

In [None]:
# Fraction of the batches used for training; the rest is used for validation.
p = 0.8
# len(df)//16 is the number of full batches (batch size 16, remainder dropped).
train_size = int((len(df)//16) * p)
# NOTE(review): take/skip only yields a stable split if `dataset` produces the
# same batch order on every iteration — confirm the shuffle configuration.
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

# Step 8: Build the neural network model

In [None]:
def build_model():
    """Build and compile the BERT-based 7-class emotion classifier.

    Two int32 inputs of length 256 (``input_ids`` and ``attention_mask``)
    feed a pretrained 'bert-base-cased' encoder. Element 1 of the encoder
    output goes through a 512-unit ReLU layer and a 7-unit softmax layer.

    Returns:
        The compiled ``tf.keras.Model`` (Adam with learning rate 1e-5,
        categorical cross-entropy loss, categorical accuracy metric).
    """
    ids_in = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
    mask_in = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

    encoder = TFBertModel.from_pretrained('bert-base-cased')
    # Index 1 of the encoder output; presumably the pooled sequence
    # representation of TFBertModel — verify against the library docs.
    pooled = encoder(ids_in, attention_mask=mask_in)[1]

    hidden = tf.keras.layers.Dense(512, activation='relu', name='intermediate')(pooled)
    class_probs = tf.keras.layers.Dense(7, activation='softmax', name='output')(hidden)

    classifier = tf.keras.Model(inputs=[ids_in, mask_in], outputs=class_probs)

    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    loss_fn = tf.keras.losses.CategoricalCrossentropy()
    accuracy = tf.keras.metrics.CategoricalAccuracy('accuracy')
    classifier.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy])
    return classifier

# Step 9: Create and train model

## Building Model

In [None]:
# Instantiate the compiled classifier (loads the pretrained BERT checkpoint).
model = build_model()

## Training Model

In [None]:
# Train for 5 epochs; `history` keeps the per-epoch metrics that the plotting
# cell at the end of this file reads.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5
)

# Step 10: Save the trained model

## Colab Model Saving

In [None]:
# Save the trained model under a relative path (file name uses the .keras extension).
model.save('MCTC_Emotion.keras')

## Google Drive Model Saving

In [None]:
# Save a copy to the mounted Google Drive (file name uses the .h5 extension).
model.save("/content/drive/MyDrive/MCTC_Emotion5epoch.h5")

# Step 11: Create Prediction function

In [None]:
def predict_emotion(text, model, tokenizer):
    """Predict the emotion of a single text.

    Args:
        text: The phrase to classify.
        tokenizer: Tokenizer forwarded to ``prepare_data``.
        model: Model whose ``predict`` accepts ``[input_ids, attention_mask]``.

    Returns:
        ``(class_name, probs)``: the entry of the module-level ``classes``
        list at the arg-max position, and the first row of the prediction
        output.
    """
    encoded = prepare_data(text, tokenizer)
    batch_probs = model.predict([encoded['input_ids'], encoded['attention_mask']])
    # A single text was encoded, so only the first row is of interest.
    probs = batch_probs[0]
    best_idx = np.argmax(probs)
    return classes[best_idx], probs

# Step 12: Test the model with example phrases

## Testing Material

In [None]:
# Phrases used for a quick manual check of the trained model. The trailing
# comment on each active phrase names the class it is expected to fall into.
# The commented-out entries are an alternative set of short probes.
test_phrases = [
    #"I love this!",
    #"This is terrible",
    #"It's okay, I guess",
    #"I really Hate this",
    #"Wow I didn't see that coming",
    #"This make me wanna vomit",
    #"My body got colder hearing about that",
    #"happy",
    #"sad",
    #"angry",
    #"disgust",
    #"surprise",
    #"fear",
    #"neutral",
    "Oh my God!!",  # surprise
    "I didn’t make it!",  # sad
    "You know how much I love listening to your music, you know,    but...",  # neutral
    "Alright, you did it! Do we have any fruit?",  # happy
    "i feel insecure and useless",  # fear
    "I reached into the leper colony and felt a fungal decomposing rat cling to my hair, amid the hum of bloated mosquitoes.",  # disgust
    "Did it ever occur to you that I might just be that stupid?"  # anger
]

## Testing Setup & Executing

In [None]:
# Print the class order first so the probability lists below can be read
# position by position.
print(classes)
for phrase in test_phrases:
    pred_class, probs = predict_emotion(phrase, model, tokenizer)
    print(f"\n'{phrase}':")
    print(f"Predicted: {pred_class}")
    # One probability per class, formatted to four decimals.
    print(f"Probabilities: {[f'{p:.4f}' for p in probs]}")

# Step 13: Visualize the training process

In [None]:
import matplotlib.pyplot as plt

# One figure per metric. Each entry lists: training history key, training
# legend label, validation history key, validation legend label, y-axis
# label, and figure title.
for train_key, train_label, val_key, val_label, y_label, title in (
    ('accuracy', 'Train Accuracy', 'val_accuracy', 'Val Accuracy', "Accuracy", "Accuracy Curve"),
    ('loss', 'Loss', 'val_loss', 'Val Loss', "Loss", "Loss Curve"),
):
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.xlabel("Epochs")
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()