### Emotion Analysis

#### Import Libraries

In [None]:
import pandas as pd
import re
import emoji
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import tensorflow as tf
from keras import layers, models
# from tensorflow.keras import layers, models

#### Load and Explore Data

In [None]:
def load_data(file_path):
    """Read the CSV at ``file_path`` into a pandas DataFrame and return it."""
    frame = pd.read_csv(file_path)
    return frame

In [None]:
def explore_data(df, text_column='text', label_column='label'):
    """
    Print a data-cleanliness report for a text dataset.

    Parameters:
    - df: Pandas DataFrame
    - text_column: Name of the column containing text data
    - label_column: Name of the column containing class labels

    Returns:
    - None. All findings and recommendations are printed to stdout.
    """
    # Computed once; used both in the summary and in the recommendations.
    duplicate_rows = df.duplicated().sum()

    print("\n--- Basic Information ---")
    print(f"Dataset Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"Missing Values:\n{df.isnull().sum()}")
    print(f"Duplicate Rows: {duplicate_rows}")

    print("\n--- Class Distribution ---")
    if label_column in df.columns:
        print(df[label_column].value_counts())
    else:
        print(f"No '{label_column}' column found!")

    print("\n--- Text Analysis ---")
    raw_texts = df[text_column]

    # Check for empty or blank text. Missing values are counted separately and
    # the rest is cast to str so a non-string column does not break `.str`.
    missing_count = raw_texts.isnull().sum()
    blank_count = raw_texts.dropna().astype(str).str.strip().eq('').sum()
    empty_texts = missing_count + blank_count
    print(f"Empty or Blank Texts: {empty_texts}")

    # Check for punctuation (anything that is neither a word char nor whitespace)
    symbol_pattern = re.compile(r'[^\w\s]')
    punctuations = raw_texts.apply(lambda x: len(symbol_pattern.findall(str(x))))
    print(f"Average Punctuation Count per Entry: {punctuations.mean():.2f}")

    # Check for emojis
    emojis = raw_texts.apply(lambda x: len(emoji.emoji_list(str(x))))
    print(f"Average Emoji Count per Entry: {emojis.mean():.2f}")

    # Check for stop words (whitespace-split tokens, compared case-insensitively)
    stop_words = set(stopwords.words('english'))
    stop_word_counts = raw_texts.apply(
        lambda x: sum(1 for word in str(x).split() if word.lower() in stop_words)
    )
    print(f"Average Stop Words per Entry: {stop_word_counts.mean():.2f}")

    # Check for special characters (non-alphanumeric). This check uses the same
    # pattern as the punctuation check, so the counts are reused, not recomputed.
    special_chars = punctuations
    print(f"Average Special Characters per Entry: {special_chars.mean():.2f}")

    print("\n--- Recommendations ---")
    recommendations = []
    if empty_texts > 0:
        recommendations.append(f"Remove or handle {empty_texts} empty or blank entries.")
    if duplicate_rows > 0:
        recommendations.append("Remove duplicate rows.")
    if emojis.mean() > 0:
        recommendations.append("Consider handling emojis (e.g., replace with words or remove).")
    if punctuations.mean() > 0:
        recommendations.append("Remove or handle punctuation marks appropriately.")
    if special_chars.mean() > 0:
        recommendations.append("Clean special characters from text.")

    if recommendations:
        print("\n".join(recommendations))
    else:
        print("The dataset appears clean!")

In [None]:
# Drop empty rows from our data
def drop_empty_rows(df, text_column='text'):
    """
    Return a copy of ``df`` without rows whose text is missing or blank.

    A row is dropped when its ``text_column`` value is null or contains only
    whitespace -- the same definition of "empty" that explore_data reports.
    The input DataFrame is not modified; the result has a fresh 0..n-1 index.
    """
    texts = df[text_column]
    # notnull() is False for missing values, so they are dropped regardless of
    # what the string comparison yields for them.
    keep = texts.notnull() & texts.astype(str).str.strip().ne('')
    return df[keep].reset_index(drop=True)

#### Preprocess Text

In [6]:
def remove_emojis(text):
    """Return ``text`` with every emoji replaced by an empty string."""
    without_emojis = emoji.replace_emoji(text, replace="")
    return without_emojis

def remove_punctuation_and_symbols(text):
    """Delete every character that is not an ASCII letter or whitespace."""
    non_letter = re.compile(r'[^a-zA-Z\s]')
    return non_letter.sub('', text)

_STOP_WORD_CACHE = {}


def _stop_words(language='english'):
    """Return the NLTK stop words for ``language`` as a frozenset, loaded once."""
    if language not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_CACHE[language]


def preprocess_text(text):
    """
    Clean and preprocess text.

    Steps, in order: strip emojis, drop everything except ASCII letters and
    whitespace, lowercase, tokenize with NLTK, and remove English stop words.

    Returns the remaining tokens joined by single spaces.
    """
    text = remove_emojis(text)
    text = remove_punctuation_and_symbols(text)
    text = text.lower()
    # Look the stop words up in a cached set instead of re-reading the NLTK
    # list for every token, which dominated the runtime on large corpora.
    stop_words = _stop_words()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

def apply_preprocessing(df):
    """Overwrite the 'text' column with its preprocessed form and return the same DataFrame."""
    cleaned = df['text'].apply(preprocess_text)
    df['text'] = cleaned
    return df


#### Feature Engineering

In [7]:
def create_features(corpus, max_features=5000):
    """
    Convert text into a TF-IDF feature matrix.

    Parameters:
    - corpus: Iterable of text documents
    - max_features: Upper bound on the vocabulary size passed to TfidfVectorizer

    Returns:
    - (features, vectorizer): the fitted-and-transformed matrix for ``corpus``
      and the fitted vectorizer, which is needed to transform new text later.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    features = vectorizer.fit_transform(corpus)
    return features, vectorizer

#### Model Training

##### Logistic Regression

In [8]:
def train_logistic_regression(X, y):
    """Fit Logistic Regression on 80% of (X, y), print a report on the other 20%, return the model."""
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    print("\n--- Logistic Regression Evaluation ---")
    evaluate_model(model, X_test, y_test)
    return model

##### Random Forest

In [9]:
def train_random_forest(X, y, random_state=42):
    """
    Train and evaluate a Random Forest Classifier.

    Parameters:
    - X: Feature matrix
    - y: Target labels
    - random_state: Seed used for both the train/test split and the forest, so
      repeated runs give the same model and the same report

    Returns:
    - The fitted RandomForestClassifier. A classification report for the
      held-out 20% is printed as a side effect.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    # Seed the forest too; without it the bootstrap samples and feature
    # subsets differ between runs even though the split is fixed.
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)
    print("\n--- Random Forest Evaluation ---")
    evaluate_model(model, X_test, y_test)
    return model

##### Support Vector Machine

In [10]:
def train_svm(X, y):
    """Fit an SVC (with probability estimates enabled) on 80% of (X, y), report on the rest, return it."""
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    model = SVC(probability=True)
    model.fit(X_train, y_train)

    print("\n--- SVM Evaluation ---")
    evaluate_model(model, X_test, y_test)
    return model


#### Evaluate Model

In [11]:
#Evaluation Function

def evaluate_model(model, X_test, y_test):
    """Print a classification report comparing the model's predictions on ``X_test`` with ``y_test``."""
    report = classification_report(y_test, model.predict(X_test))
    print(report)

#### Predict

In [12]:
def predict_emotion(model, text, vectorizer):
    """Preprocess one text, vectorize it, and return the model's prediction array for it."""
    cleaned = preprocess_text(text)
    return model.predict(vectorizer.transform([cleaned]))


In [22]:
# Load the sampled dataset and print the cleanliness report for its text column.
df = load_data('sampled_data.csv')
explore_data(df, text_column='text')  # Adjust 'text' if your column name differs



--- Basic Information ---
Dataset Shape: (100000, 2)
Columns: ['text', 'label']
Missing Values:
text     0
label    0
dtype: int64
Duplicate Rows: 0

--- Class Distribution ---
label
0    20000
1    20000
2    20000
3    20000
4    20000
Name: count, dtype: int64

--- Text Analysis ---
Empty or Blank Texts: 0
Average Punctuation Count per Entry: 0.00
Average Emoji Count per Entry: 0.00
Average Stop Words per Entry: 9.99
Average Special Characters per Entry: 0.00

--- Recommendations ---
The dataset appears clean!


In [23]:
# Clean the 'text' column (apply_preprocessing overwrites it on the DataFrame it is given).
df = apply_preprocessing(df)

In [24]:
# Build TF-IDF features from the preprocessed text and use the labels as the target.
# NOTE(review): the vectorizer is fitted on the whole corpus here, and the train/test
# split only happens inside train_logistic_regression, so the held-out rows also
# contribute to the fitted vocabulary and weights.
X, vectorizer = create_features(df['text'])  # 'text' is the name of your text column
y = df['label']  # Assuming the label column is named 'label'

logistic_model = train_logistic_regression(X, y)


--- Logistic Regression Evaluation ---
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      4018
           1       0.94      0.92      0.93      4015
           2       0.93      0.97      0.95      4002
           3       0.94      0.94      0.94      3987
           4       0.94      0.96      0.95      3978

    accuracy                           0.94     20000
   macro avg       0.94      0.94      0.94     20000
weighted avg       0.94      0.94      0.94     20000



In [34]:
# Define the label to emotion mapping
label_to_emotion = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear'
}


# Alternative example input: "I am so happy today!"
text_to_predict = "it doesn't necessarily convey sadness or disappointment."

# Use the trained Logistic Regression model
predicted_emotion_logistic = predict_emotion(logistic_model, text_to_predict, vectorizer)

# predict_emotion returns an array holding one label; translate it to its
# emotion name (falling back to the raw label if it is not in the mapping).
predicted_label = predicted_emotion_logistic[0]
predicted_name = label_to_emotion.get(predicted_label, predicted_label)
print(f"Predicted emotion (Logistic Regression): {predicted_name} (label {predicted_label})")


Predicted emotion (Logistic Regression): [0]


In [36]:

# NOTE(review): X and y are the full dataset, including the rows logistic_model was
# fitted on inside train_logistic_regression, so this report is not a held-out score.
print("\n--- Logistic Regression Evaluation ---")
evaluate_model(logistic_model, X, y)




--- Logistic Regression Evaluation ---
              precision    recall  f1-score   support

           0       0.96      0.94      0.95     20000
           1       0.96      0.93      0.95     20000
           2       0.94      0.98      0.96     20000
           3       0.96      0.96      0.96     20000
           4       0.95      0.97      0.96     20000

    accuracy                           0.96    100000
   macro avg       0.96      0.96      0.96    100000
weighted avg       0.96      0.96      0.96    100000



In [37]:
# Train the Random Forest on the same TF-IDF features; the held-out report is printed by the call.
random_forest_model = train_random_forest(X, y)


--- Random Forest Evaluation ---
              precision    recall  f1-score   support

           0       0.96      0.91      0.94      4018
           1       0.97      0.89      0.93      4015
           2       0.91      0.99      0.95      4002
           3       0.93      0.94      0.94      3987
           4       0.93      0.97      0.95      3978

    accuracy                           0.94     20000
   macro avg       0.94      0.94      0.94     20000
weighted avg       0.94      0.94      0.94     20000



In [38]:
# Train the SVM on the same TF-IDF features; the held-out report is printed by the call.
svm_model = train_svm(X, y)


--- SVM Evaluation ---
              precision    recall  f1-score   support

           0       0.96      0.92      0.94      4018
           1       0.96      0.91      0.93      4015
           2       0.91      0.98      0.95      4002
           3       0.95      0.93      0.94      3987
           4       0.94      0.97      0.95      3978

    accuracy                           0.94     20000
   macro avg       0.94      0.94      0.94     20000
weighted avg       0.94      0.94      0.94     20000



In [None]:
#LSTM
# Prepare data for LSTM
def prepare_data_for_lstm(df, text_column='text', label_column='label',
                          num_words=20000, max_len=100):
    """
    Turn a text column into fixed-length integer sequences for an LSTM.

    Parameters:
    - df: Pandas DataFrame
    - text_column: Name of the column containing text data
    - label_column: Name of the column containing class labels
    - num_words: Vocabulary size limit given to the Keras Tokenizer
    - max_len: Length every sequence is padded or truncated to (at the end)

    Returns:
    - (padded_sequences, labels, tokenizer): the padded sequences, the label
      values as an array, and the fitted tokenizer.
    """
    tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(df[text_column])
    sequences = tokenizer.texts_to_sequences(df[text_column])
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
    return padded_sequences, df[label_column].values, tokenizer


# Tokenize and pad the preprocessed text; keep the fitted tokenizer alongside the arrays.
X_lstm, y_lstm, tokenizer_lstm = prepare_data_for_lstm(df)

In [None]:
# Train/Test split (80/20, same seed as the splits used for the classical models)
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_lstm, y_lstm, test_size=0.2, random_state=42)


In [None]:
# Build LSTM model
def build_lstm_model(vocab_size, embedding_dim, input_length, num_classes=5):
    """
    Build and compile a two-layer LSTM classifier.

    Parameters:
    - vocab_size: Number of distinct token ids the embedding must cover
    - embedding_dim: Size of each embedding vector
    - input_length: Length of the padded input sequences
    - num_classes: Number of output classes for the softmax layer

    Returns:
    - A compiled Sequential model (Adam, sparse categorical cross-entropy,
      accuracy metric), so labels are expected as integer class ids.
    """
    model = Sequential([
        # Declare the sequence length with an Input instead of the Embedding's
        # `input_length` argument, which newer Keras releases deprecate.
        tf.keras.Input(shape=(input_length,)),
        Embedding(vocab_size, embedding_dim),
        LSTM(128, return_sequences=True),  # full sequence is fed to the next LSTM
        Dropout(0.2),
        LSTM(64),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


In [None]:
# vocab_size and input_length mirror the num_words=20000 / maxlen=100 used in
# prepare_data_for_lstm. A further 20% of the training split is held out for validation.
lstm_model = build_lstm_model(vocab_size=20000, embedding_dim=100, input_length=100)
lstm_model.fit(X_train_lstm, y_train_lstm, validation_split=0.2, epochs=5, batch_size=32)


In [None]:
# Score the LSTM on the held-out split: pick the highest-scoring class per row
lstm_predictions = lstm_model.predict(X_test_lstm)
lstm_pred_classes = lstm_predictions.argmax(axis=1)
lstm_report = classification_report(y_test_lstm, lstm_pred_classes)
print("\n--- LSTM Evaluation ---")
print(lstm_report)


In [None]:
# Prepare data for BERT
def prepare_data_for_bert(df, text_column='text', label_column='label'):
    """
    Tokenize the text column with the 'bert-base-uncased' tokenizer.

    Returns a pair: the tokenizer output as PyTorch tensors (padded, truncated
    to at most 128 tokens) and the label column as a tensor.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    texts = list(df[text_column])
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=128,
    )
    labels = torch.tensor(df[label_column].values)
    return inputs, labels

In [None]:
# Tokenize the whole DataFrame in one call; every row is encoded up front.
bert_inputs, bert_labels = prepare_data_for_bert(df)


In [None]:
# Train/Test split for BERT
# Shuffle before splitting, with the same test fraction and seed as the
# train_test_split calls used for the other models. Slicing off the first 80%
# in file order would give a skewed test set if the rows are grouped by label.
bert_train_idx, bert_test_idx = train_test_split(
    np.arange(len(bert_labels)), test_size=0.2, random_state=42
)
bert_train_idx = torch.as_tensor(bert_train_idx, dtype=torch.long)
bert_test_idx = torch.as_tensor(bert_test_idx, dtype=torch.long)

train_size = len(bert_train_idx)
train_inputs = {k: v[bert_train_idx] for k, v in bert_inputs.items()}
train_labels = bert_labels[bert_train_idx]
test_inputs = {k: v[bert_test_idx] for k, v in bert_inputs.items()}
test_labels = bert_labels[bert_test_idx]


In [None]:
# Fine-tune BERT model
# Load the pretrained 'bert-base-uncased' weights with a 5-way classification head.
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)


In [None]:
# Training configuration for the Hugging Face Trainer.
# NOTE(review): newer transformers releases appear to have renamed
# `evaluation_strategy` to `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',  # where results are written
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch"  # evaluate at the end of every epoch
)

In [None]:
class _EncodedTextDataset(torch.utils.data.Dataset):
    """Serve tokenizer output and labels as one dict per example for ``Trainer``."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Keep every tensor the tokenizer produced (e.g. the attention mask),
        # not just input_ids, and expose the target under the 'labels' key.
        item = {key: tensor[idx] for key, tensor in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item


# Trainer collates dict-style examples into keyword arguments for the model.
# The previous (input_ids, label) tuples carried neither the key names nor the
# attention mask.
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=_EncodedTextDataset(train_inputs, train_labels),
    eval_dataset=_EncodedTextDataset(test_inputs, test_labels)
)

In [None]:
# Run fine-tuning with the arguments and datasets configured on `trainer`.
trainer.train()

In [None]:
# Evaluate BERT
# Predict in small batches with gradients disabled. A single forward pass over
# the whole test set would need memory for every activation at once.
bert_model.eval()  # turn off dropout for inference
bert_device = next(bert_model.parameters()).device  # inputs must live where the model does
bert_eval_batch_size = 16

bert_logit_batches = []
with torch.no_grad():
    for start in range(0, len(test_labels), bert_eval_batch_size):
        stop = start + bert_eval_batch_size
        batch = {k: v[start:stop].to(bert_device) for k, v in test_inputs.items()}
        # Move results back to the CPU so they can be converted to NumPy below.
        bert_logit_batches.append(bert_model(**batch).logits.cpu())

bert_logits = torch.cat(bert_logit_batches)
bert_pred_classes = torch.argmax(bert_logits, dim=1)
print("\n--- BERT Evaluation ---")
print(classification_report(test_labels.cpu().numpy(), bert_pred_classes.numpy()))