### Emotion Analysis

In [3]:
!pip install emoji
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')
!pip install keras
!pip install tensorflow



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/apple2015/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/apple2015/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!




#### Import Libraries

In [7]:
import re

import emoji
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.utils import resample

import tensorflow as tf
from tensorflow import keras
from keras import layers, models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# from tensorflow.keras import layers, models

ModuleNotFoundError: No module named 'tensorflow.keras'

#### Load and Explore Data

In [None]:
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters:
    - file_path: Anything accepted by ``pd.read_csv`` (path string or file-like object).

    Returns:
    - The parsed DataFrame.
    """
    frame = pd.read_csv(file_path)
    return frame

In [None]:
def explore_data(df, text_column='text'):
    """
    Print a data-quality report for a labelled text dataset.

    The report covers the frame's shape, columns, missing values and duplicate
    rows; the distribution of the 'label' column when it exists; and, for the
    text column, the number of empty/blank entries plus per-entry averages of
    punctuation, emojis, stop words and special characters. It closes with a
    list of suggested clean-up steps derived from those numbers.

    Parameters:
    - df: Pandas DataFrame
    - text_column: Name of the column containing text data

    Returns:
    - None; every finding is printed.
    """
    def count_symbols(value):
        # Characters that are neither word characters nor whitespace.
        return len(re.findall(r'[^\w\s]', str(value)))

    print("\n--- Basic Information ---")
    print(f"Dataset Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"Missing Values:\n{df.isnull().sum()}")
    duplicate_rows = df.duplicated().sum()
    print(f"Duplicate Rows: {duplicate_rows}")

    print("\n--- Class Distribution ---")
    print(df['label'].value_counts() if 'label' in df.columns else "No 'label' column found!")

    print("\n--- Text Analysis ---")
    texts = df[text_column]

    # Missing values and whitespace-only strings both count as "empty".
    missing_count = texts.isnull().sum()
    blank_count = texts.str.strip().eq('').sum()
    empty_texts = missing_count + blank_count
    print(f"Empty or Blank Texts: {empty_texts}")

    # The punctuation and special-character metrics share one pattern, so the
    # same per-entry counts feed both lines of the report.
    symbol_counts = texts.apply(count_symbols)
    print(f"Average Punctuation Count per Entry: {symbol_counts.mean():.2f}")

    emoji_counts = texts.apply(lambda value: len(emoji.emoji_list(str(value))))
    print(f"Average Emoji Count per Entry: {emoji_counts.mean():.2f}")

    english_stop_words = set(stopwords.words('english'))
    stop_word_counts = texts.apply(
        lambda value: sum(1 for token in str(value).split() if token.lower() in english_stop_words)
    )
    print(f"Average Stop Words per Entry: {stop_word_counts.mean():.2f}")

    print(f"Average Special Characters per Entry: {symbol_counts.mean():.2f}")

    print("\n--- Recommendations ---")
    # (condition, advice) pairs, listed in the order the advice is printed.
    checks = [
        (empty_texts > 0, f"Remove or handle {empty_texts} empty or blank entries."),
        (duplicate_rows > 0, "Remove duplicate rows."),
        (emoji_counts.mean() > 0, "Consider handling emojis (e.g., replace with words or remove)."),
        (symbol_counts.mean() > 0, "Remove or handle punctuation marks appropriately."),
        (symbol_counts.mean() > 0, "Clean special characters from text."),
    ]
    recommendations = [advice for triggered, advice in checks if triggered]
    print("\n".join(recommendations) if recommendations else "The dataset appears clean!")

In [None]:
# Drop empty rows from our data
def drop_empty_rows(df, text_column='text'):
    """Return a copy of ``df`` without rows whose text is missing or blank.

    A row is dropped when its ``text_column`` value is NaN/None or contains only
    whitespace, which matches what ``explore_data`` reports as "Empty or Blank
    Texts". The input frame is not modified.

    Parameters:
    - df: Pandas DataFrame
    - text_column: Name of the column containing text data (default 'text')

    Returns:
    - A new DataFrame with the offending rows removed and a fresh 0..n-1 index.
    """
    non_null = df.dropna(subset=[text_column])
    # astype(str) keeps the blank check working when the column holds non-string values.
    has_content = non_null[text_column].astype(str).str.strip().ne('')
    return non_null[has_content].reset_index(drop=True)

#### Preprocess Text

In [None]:
def remove_emojis(text):
    """Return ``text`` with every emoji replaced by the empty string."""
    without_emojis = emoji.replace_emoji(text, replace="")
    return without_emojis

def remove_punctuation_and_symbols(text):
    """Keep only ASCII letters and whitespace; delete every other character."""
    non_letter = re.compile(r'[^a-zA-Z\s]')
    return non_letter.sub('', text)

# English stop words, loaded on first use and then reused for every text.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, loading it once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Clean one text: strip emojis and non-letters, lower-case, drop stop words.

    Parameters:
    - text: The raw text. Non-string values (None, NaN from an empty CSV cell, ...)
      are treated as empty text.

    Returns:
    - The remaining tokens joined by single spaces; '' for non-string input.

    Note: punctuation is removed before tokenizing, so contractions lose their
    apostrophe ("doesn't" becomes "doesnt") before the stop-word filter runs.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN / None; there is nothing to clean.
        return ''
    text = remove_emojis(text)
    text = remove_punctuation_and_symbols(text)
    text = text.lower()
    # Look the stop words up in a cached set instead of re-reading the NLTK list
    # for every single token.
    stop_words = _english_stopwords()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

def apply_preprocessing(df, text_column='text'):
    """Run ``preprocess_text`` over every entry of the text column.

    Parameters:
    - df: Pandas DataFrame; its ``text_column`` is overwritten in place.
    - text_column: Name of the column containing text data (default 'text')

    Returns:
    - The same DataFrame object that was passed in, now holding the cleaned text.
    """
    df[text_column] = df[text_column].apply(preprocess_text)
    return df


#### Feature Engineering

In [None]:
def create_features_Tfidf(corpus, max_features=5000):
    """Convert text into a TF-IDF matrix.

    Parameters:
    - corpus: Iterable of text documents.
    - max_features: Upper bound on the vocabulary size (default 5000).

    Returns:
    - (features, vectorizer): the matrix produced by ``fit_transform`` and the
      fitted TfidfVectorizer, which is needed later to transform new text.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    features = vectorizer.fit_transform(corpus)
    return features, vectorizer

def create_features_CountVectorizer(corpus, max_features=5000):
    """Convert text into a bag-of-words count matrix.

    Parameters:
    - corpus: Iterable of text documents.
    - max_features: Upper bound on the vocabulary size (default 5000).

    Returns:
    - (features, vectorizer): the matrix produced by ``fit_transform`` and the
      fitted CountVectorizer, which is needed later to transform new text.
    """
    vectorizer = CountVectorizer(max_features=max_features)
    features = vectorizer.fit_transform(corpus)
    return features, vectorizer

def create_features_HashingVectorizer(corpus, n_features=5000):
    """Convert text into numerical representations using HashingVectorizer.

    Parameters:
    - corpus: Iterable of text documents.
    - n_features: Number of columns in the hashed feature space (default 5000).

    Returns:
    - (features, vectorizer): the matrix produced by ``fit_transform`` and the
      HashingVectorizer instance used to build it.
    """
    vectorizer = HashingVectorizer(n_features=n_features)
    features = vectorizer.fit_transform(corpus)
    return features, vectorizer

def create_features_Word2Vec(corpus, vector_size=100):
    """Convert text into numerical representations using Word2Vec.

    Each document is split on whitespace, a Word2Vec model is trained on the
    whole corpus, and a document is represented by the mean of its word vectors.
    A document with no usable words gets an all-zero vector.

    Parameters:
    - corpus: Iterable of text documents (strings).
    - vector_size: Dimensionality of the word vectors (default 100).

    Returns:
    - (features, model): an array with one row per document and the trained
      Word2Vec model.
    """
    # Imported here because the notebook's import cell does not bring Word2Vec
    # into scope. Assumes gensim's implementation (the `vector_size` / `.wv`
    # API used below) - confirm gensim is installed in the kernel environment.
    from gensim.models import Word2Vec

    tokenized_corpus = [doc.split() for doc in corpus]
    model = Word2Vec(sentences=tokenized_corpus, vector_size=vector_size, window=5, min_count=1, workers=4)

    # Sized from `vector_size` so the fallback always matches the word vectors.
    empty_doc_vector = np.zeros(vector_size)

    def document_vector(tokens):
        vectors = [model.wv[word] for word in tokens if word in model.wv]
        return np.mean(vectors, axis=0) if vectors else empty_doc_vector

    features = np.array([document_vector(doc) for doc in tokenized_corpus])
    return features, model

#### Model Training

##### Logistic Regression

In [None]:
def train_logistic_regression(X, y):
    """Fit Logistic Regression on 80% of the data and report on the other 20%.

    Parameters:
    - X: Feature matrix.
    - y: Target labels aligned with the rows of ``X``.

    Returns:
    - The fitted LogisticRegression model. A classification report for the
      held-out split is printed via ``evaluate_model``.
    """
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    features_train, features_test, labels_train, labels_test = split

    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(features_train, labels_train)

    print("\n--- Logistic Regression Evaluation ---")
    evaluate_model(classifier, features_test, labels_test)
    return classifier

##### Random Forest

In [None]:
def train_random_forest(X, y):
    """Train and evaluate a Random Forest Classifier.

    Parameters:
    - X: Feature matrix.
    - y: Target labels aligned with the rows of ``X``.

    Returns:
    - The fitted RandomForestClassifier. A classification report for the
      held-out 20% split is printed via ``evaluate_model``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Seed the forest as well as the split; otherwise every run grows different
    # trees and the printed metrics cannot be reproduced.
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    print("\n--- Random Forest Evaluation ---")
    evaluate_model(model, X_test, y_test)
    return model

##### Support Vector Machine

In [None]:
def train_svm(X, y):
    """Fit a Support Vector Machine on 80% of the data and report on the other 20%.

    Parameters:
    - X: Feature matrix.
    - y: Target labels aligned with the rows of ``X``.

    Returns:
    - The fitted SVC (created with ``probability=True``). A classification
      report for the held-out split is printed via ``evaluate_model``.
    """
    features_train, features_test, labels_train, labels_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    classifier = SVC(probability=True)
    classifier.fit(features_train, labels_train)

    print("\n--- SVM Evaluation ---")
    evaluate_model(classifier, features_test, labels_test)
    return classifier


#### Evaluate Model

In [None]:
#Evaluation Function

def evaluate_model(model, X_test, y_test):
    """Print a classification report comparing ``model``'s predictions with ``y_test``.

    Parameters:
    - model: Fitted estimator exposing ``predict``.
    - X_test: Features to predict on.
    - y_test: True labels for ``X_test``.
    """
    report = classification_report(y_test, model.predict(X_test))
    print(report)

#### Predict

In [None]:
def predict_emotion(model, text, vectorizer):
    """Predict the label for one raw text.

    The text goes through the same ``preprocess_text`` cleaning as the training
    data and is then vectorized with the already-fitted ``vectorizer``.

    Parameters:
    - model: Fitted classifier exposing ``predict``.
    - text: Raw input string.
    - vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
    - Whatever ``model.predict`` returns for the single-row feature matrix.
    """
    cleaned = preprocess_text(text)
    single_row = vectorizer.transform([cleaned])
    return model.predict(single_row)


In [None]:
# Load the dataset from a CSV in the notebook's working directory, then print the
# data-quality report (explore_data only prints; it does not change df).
df = load_data('sampled_data.csv')
explore_data(df, text_column='text')  # Change text_column if the text lives in a differently named column


In [None]:
# Remove rows without text first: explore_data recommends it, and a missing value
# (NaN) is not a string the text cleaner can process. drop_empty_rows returns a
# new frame, so the cleaning below overwrites that frame's text column.
df = apply_preprocessing(drop_empty_rows(df))

In [None]:
# Build TF-IDF features. The notebook defines no plain `create_features`; the
# TF-IDF variant is used so the fitted vectorizer can later transform new text
# inside predict_emotion.
X, vectorizer = create_features_Tfidf(df['text'])  # 'text' is the name of your text column
y = df['label']  # Assuming the label column is named 'label'

# Trains on an 80/20 split and prints the classification report for the held-out part.
logistic_model = train_logistic_regression(X, y)

In [None]:
# Define the label to emotion mapping
label_to_emotion = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear'
}


# text_to_predict = "I am so happy today!"
text_to_predict = "it doesn't necessarily convey sadness or disappointment."

# Use the trained Logistic Regression model. predict_emotion returns the model's
# raw prediction array (one entry for the single input text), so translate that
# entry into its emotion name; a label missing from the mapping is shown as-is.
predicted_label_logistic = predict_emotion(logistic_model, text_to_predict, vectorizer)[0]
predicted_emotion_logistic = label_to_emotion.get(predicted_label_logistic, predicted_label_logistic)
print(f"Predicted emotion (Logistic Regression): {predicted_emotion_logistic}")


In [None]:

# X and y are the complete dataset, i.e. they include the 80% of rows the model
# was trained on, so this report is optimistic. The held-out figures are the ones
# printed by train_logistic_regression; the heading says so to avoid confusion.
print("\n--- Logistic Regression Evaluation (full dataset, includes training rows) ---")
evaluate_model(logistic_model, X, y)



In [None]:
# Train a Random Forest on the same features and labels; train_random_forest
# prints a classification report for its held-out 20% split.
random_forest_model = train_random_forest(X, y)

In [None]:
# Train an SVM on the same features and labels; train_svm prints a
# classification report for its held-out 20% split.
svm_model = train_svm(X, y)

In [None]:
# LSTM
# Prepare data for LSTM: turn every cleaned text into a fixed-length sequence of
# word indices, the input an Embedding layer consumes.
#
# This replaces a leftover model template that could not run here: it referenced
# undefined n_steps / n_features, used a single-unit regression head with MSE for
# a multi-class label, and was fitted on the TF-IDF matrix X. The LSTM that is
# actually trained is defined and fitted in the cells below, and those cells need
# the X_lstm / y_lstm built here.

LSTM_VOCAB_SIZE = 20000      # keep equal to vocab_size passed to build_lstm_model
LSTM_SEQUENCE_LENGTH = 100   # keep equal to input_length passed to build_lstm_model

# num_words caps the indices the tokenizer emits, so they stay inside the
# embedding table; words outside the vocabulary map to the '<OOV>' token.
lstm_tokenizer = Tokenizer(num_words=LSTM_VOCAB_SIZE, oov_token='<OOV>')
lstm_tokenizer.fit_on_texts(df['text'])

# Pad / truncate every sequence to the same length.
X_lstm = pad_sequences(
    lstm_tokenizer.texts_to_sequences(df['text']),
    maxlen=LSTM_SEQUENCE_LENGTH,
)

# Assumes 'label' already holds integer class ids (0..4, as in label_to_emotion),
# which is what sparse_categorical_crossentropy expects - confirm.
y_lstm = df['label'].to_numpy()

In [None]:

# NOTE(review): this cell runs before the LSTM is trained (the fit call is further
# down) and passes text_to_predict, a raw Python string. Presumably the text has to
# be tokenized and padded the same way as the training data before predict can
# accept it - confirm, and move this cell after training.
lstm_predictions = lstm_model.predict(text_to_predict)

In [None]:
# Train/Test split
# Requires X_lstm / y_lstm to have been built by an earlier data-preparation cell.
# Same 80/20 proportion and seed (42) as the splits used for the scikit-learn models.
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_lstm, y_lstm, test_size=0.2, random_state=42)


In [None]:
# Build LSTM model
def build_lstm_model(vocab_size, embedding_dim, input_length, num_classes=5):
    """Build and compile a two-layer LSTM text classifier.

    Architecture: Embedding -> LSTM(128, returning the full sequence) -> Dropout
    -> LSTM(64) -> Dropout -> Dense(32, relu) -> Dense(num_classes, softmax).

    Parameters:
    - vocab_size: Number of rows in the embedding table.
    - embedding_dim: Size of each embedding vector.
    - input_length: Length of the input sequences.
    - num_classes: Number of output classes (default 5, the number of emotions
      in this notebook).

    Returns:
    - The compiled model. It is compiled with sparse categorical cross-entropy,
      so training labels are expected as integer class ids.
    """
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=input_length),
        # return_sequences=True so the second LSTM receives every time step.
        LSTM(128, return_sequences=True),
        Dropout(0.2),
        LSTM(64),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


In [None]:
# Build the classifier with a 20,000-entry embedding table, 100-dimensional
# embeddings and length-100 inputs.
# NOTE(review): vocab_size and input_length must agree with how X_lstm was
# tokenized and padded - confirm against the data-preparation cell.
lstm_model = build_lstm_model(vocab_size=20000, embedding_dim=100, input_length=100)
# Train for 5 epochs; validation_split=0.2 asks Keras to hold out 20% of the
# training data for validation.
lstm_model.fit(X_train_lstm, y_train_lstm, validation_split=0.2, epochs=5, batch_size=32)


In [None]:
# Evaluate LSTM
# For a model from build_lstm_model the final layer is a softmax, so predict
# yields one score per class for each test row.
lstm_predictions = lstm_model.predict(X_test_lstm)
# Take the highest-scoring class index of each row as the predicted label.
lstm_pred_classes = np.argmax(lstm_predictions, axis=1)
print("\n--- LSTM Evaluation ---")
print(classification_report(y_test_lstm, lstm_pred_classes))


In [None]:
# Prepare data for BERT
def prepare_data_for_bert(df, text_column='text', label_column='label'):
    """Tokenize a DataFrame's text for BERT and turn its labels into a tensor.

    Parameters:
    - df: Pandas DataFrame
    - text_column: Name of the column containing text data
    - label_column: Name of the column containing the labels

    Returns:
    - (inputs, labels): the 'bert-base-uncased' tokenizer output as PyTorch
      tensors (padded, truncated to at most 128 tokens) and a tensor built from
      the label column's values.
    """
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    texts = list(df[text_column])
    encoded = bert_tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=128,
    )
    label_tensor = torch.tensor(df[label_column].values)
    return encoded, label_tensor

In [None]:
# Tokenize the whole dataset for BERT.
# NOTE(review): by this point df['text'] has been lower-cased and stripped of
# punctuation and stop words by apply_preprocessing; BERT is usually given raw
# text - confirm that feeding it the cleaned text is intended.
bert_inputs, bert_labels = prepare_data_for_bert(df)


In [None]:
# Train/Test split for BERT
# Positional split: the first 80% of rows become the training set and the rest
# the test set; every tensor in bert_inputs is sliced at the same position.
# NOTE(review): unlike the train_test_split calls used for the other models, the
# rows are not shuffled here; if the file is ordered by label the two sets would
# have different class mixes - confirm the row order is random.
train_size = int(0.8 * len(bert_labels))
train_inputs = {k: v[:train_size] for k, v in bert_inputs.items()}
train_labels = bert_labels[:train_size]
test_inputs = {k: v[train_size:] for k, v in bert_inputs.items()}
test_labels = bert_labels[train_size:]


In [None]:
# Fine-tune BERT model
# Load the pretrained 'bert-base-uncased' weights with a sequence-classification
# head sized for 5 labels, matching the five entries of label_to_emotion.
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)


In [None]:
# Training configuration for the Hugging Face Trainer: checkpoints are written
# under ./results and logs under ./logs; 3 epochs with batches of 8 (training)
# and 16 (evaluation) per device.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch"
)

In [None]:
def _as_feature_dicts(encodings, labels):
    """Turn batched tokenizer tensors plus labels into one dict per example.

    The Trainer's default collator batches mapping-style examples by key, so
    each example carries every tokenizer output (input ids, attention mask, ...)
    together with its target under 'labels'. Plain (input_ids, label) tuples
    cannot be collated and would also drop the attention mask, which would let
    the model attend to padding.
    """
    return [
        {**{name: tensor[i] for name, tensor in encodings.items()}, 'labels': labels[i]}
        for i in range(len(labels))
    ]


trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=_as_feature_dicts(train_inputs, train_labels),
    eval_dataset=_as_feature_dicts(test_inputs, test_labels)
)

In [None]:
# Run the fine-tuning loop with the model, datasets and training_args given to
# the Trainer above.
trainer.train()

In [None]:
# Evaluate BERT
# Inference settings: eval() switches dropout off, no_grad() avoids building the
# autograd graph, and the test set is fed in small batches instead of one huge
# forward pass. Inputs are moved to whatever device the model currently lives on
# (training may have moved it to a GPU) and predictions are brought back to the
# CPU before converting to NumPy.
bert_model.eval()
bert_device = next(bert_model.parameters()).device
bert_eval_batch_size = 16

bert_pred_batches = []
with torch.no_grad():
    for start in range(0, len(test_labels), bert_eval_batch_size):
        batch = {
            name: tensor[start:start + bert_eval_batch_size].to(bert_device)
            for name, tensor in test_inputs.items()
        }
        batch_logits = bert_model(**batch).logits
        bert_pred_batches.append(torch.argmax(batch_logits, dim=1).cpu())

# An empty test set yields an empty prediction tensor rather than a torch.cat error.
bert_pred_classes = torch.cat(bert_pred_batches) if bert_pred_batches else torch.empty(0, dtype=torch.long)

print("\n--- BERT Evaluation ---")
print(classification_report(test_labels.numpy(), bert_pred_classes.numpy()))