# Email Classification with LSTM/GRU and GLEU Evaluation

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.translate.gleu_score import corpus_gleu
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, GRU, Dense, Dropout

## Data Loading & Preprocessing

In [None]:
# Load dataset (expected format: CSV with 'Text' and 'Target' columns).
# Rows missing either field are dropped up front: pandas represents an empty
# cell as a float NaN, which cannot be lower-cased as text, and a NaN target
# would feed NaN into the loss during training.
data = pd.read_csv('email_data.csv').dropna(subset=['Text', 'Target'])
# Coerce to str so every remaining entry supports string operations even when
# pandas inferred a non-string dtype for the column.
texts = data['Text'].astype(str).values
labels = data['Target'].values

# Text cleaning function
def clean_text(text):
    """Return *text* lower-cased with all non-letter, non-whitespace characters removed.

    Characters are deleted, not replaced, so surrounding whitespace is kept
    exactly as it appears in the input (``"a, b"`` becomes ``"a b"``).

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents an
            empty cell) are treated as missing; any other non-string value is
            converted with ``str()`` before cleaning.

    Returns:
        The cleaned string; ``''`` for missing input.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    return re.sub(r'[^a-zA-Z\s]', '', text.lower())

# Apply cleaning
texts = [clean_text(t) for t in texts]

# Tokenization and padding
# MAX_WORDS caps the tokenizer vocabulary; the models' Embedding layers are
# declared with the same vocabulary size (5000), so keep the two in sync.
# MAX_LEN is the fixed length every sequence is padded/truncated to.
MAX_WORDS = 5000
MAX_LEN = 100

# NOTE(review): the tokenizer is fitted on the full corpus before the split,
# so test-set vocabulary influences the word index — confirm this is acceptable.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN)

# Train-test split
# A fixed random_state makes the 80/20 partition (and therefore the reported
# metrics) reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=42
)

## Model Architectures

In [None]:
def _build_binary_rnn(recurrent_cls):
    """Assemble and compile an Embedding -> recurrent -> sigmoid classifier.

    Args:
        recurrent_cls: The recurrent layer class to place between the
            embedding and the output unit (``LSTM`` or ``GRU`` here).

    Returns:
        A compiled ``Sequential`` model using binary cross-entropy, the Adam
        optimizer and accuracy as its metric.
    """
    network = Sequential()
    network.add(Embedding(5000, 128))
    network.add(recurrent_cls(128, dropout=0.2, recurrent_dropout=0.2))
    network.add(Dense(1, activation='sigmoid'))
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network


# LSTM Model
lstm_model = _build_binary_rnn(LSTM)

# GRU Model
gru_model = _build_binary_rnn(GRU)

## Training

In [None]:
# Both networks are fitted with the same hyper-parameters; the last 20% of
# the training data is held out by Keras for per-epoch validation.
fit_options = {
    'batch_size': 64,
    'epochs': 5,
    'validation_split': 0.2,
}

# Train LSTM
lstm_history = lstm_model.fit(X_train, y_train, **fit_options)

# Train GRU
gru_history = gru_model.fit(X_train, y_train, **fit_options)

## Evaluation with GLEU Score

In [None]:
def evaluate_model(model, X_test, y_test, threshold=0.5):
    """Evaluate *model* on the test split and return accuracy plus a GLEU score.

    Each sample is treated as a one-token "sentence" (its class label as a
    string) so that it can be passed to ``corpus_gleu``. With a single token
    per sample the GLEU score is effectively the share of predictions whose
    label matches the reference.

    Args:
        model: A compiled model whose ``evaluate`` returns ``(loss, accuracy)``
            and whose ``predict`` returns one probability per sample.
        X_test: Test inputs, passed unchanged to the model.
        y_test: True binary labels for ``X_test``.
        threshold: Probability above which a sample is assigned class 1.

    Returns:
        A ``(accuracy, gleu_score)`` tuple.
    """
    # Standard metrics
    _, accuracy = model.evaluate(X_test, y_test)

    # GLEU score calculation
    # Flatten and cast both sides to int so a float label such as 1.0 and an
    # int prediction 1 produce the same token ('1') instead of never matching.
    probabilities = np.asarray(model.predict(X_test)).reshape(-1)
    predicted = (probabilities > threshold).astype(int)
    expected = np.asarray(y_test).reshape(-1).astype(int)

    # corpus_gleu expects, per sample, a list of reference token lists and a
    # hypothesis token list (not a bare string).
    list_of_references = [[[str(label)]] for label in expected]
    hypotheses = [[str(label)] for label in predicted]

    gleu_score = corpus_gleu(list_of_references, hypotheses)

    return accuracy, gleu_score

# Score each trained network on the held-out split (LSTM first, then GRU)
lstm_accuracy, lstm_gleu = evaluate_model(lstm_model, X_test, y_test)
gru_accuracy, gru_gleu = evaluate_model(gru_model, X_test, y_test)

# Collect one row per model and tabulate the metrics side by side
summary_rows = [
    ('LSTM', lstm_accuracy, lstm_gleu),
    ('GRU', gru_accuracy, gru_gleu),
]
results = pd.DataFrame(summary_rows, columns=['Model', 'Accuracy', 'GLEU Score'])

print(results)