## 📌 Note to Examiner: 
#### I had previously written and executed the code in this section. However, due to unforeseen circumstances, that version of the notebook was lost. I have rewritten the code for documentation purposes, but I have not re-run it here because the models have already been trained; executing it again would be redundant and time-consuming. Cells marked `In [None]` are therefore unexecuted. Please take this into account when evaluating.

# 1. LSTM Model

In [25]:
import pandas as pd
import numpy as np
import json
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder


In [6]:
# Read the cleaned dataset from disk and preview its first five rows
data = pd.read_csv(filepath_or_buffer='cleaned data.csv')
data.head(5)

Unnamed: 0,Text,Classification
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [7]:
# Restore the fitted tokenizer: read its JSON payload first, then rebuild
# the tokenizer object once the file handle has been released.
with open('tokenizer.json', 'r') as file:
    tokenizer_data = json.load(file)

tokenizer = tokenizer_from_json(tokenizer_data)

In [8]:
# Encode each text as a list of token ids, then pad/truncate to a fixed width
maxlen = 250  # or whatever was your previous max length
sequences = tokenizer.texts_to_sequences(data['Text'])
X = pad_sequences(sequences, maxlen=maxlen)


In [None]:
# Encode texts as padded token-id sequences (same steps as the preceding cell)
maxlen = 250  # or whatever was your previous max length
sequences = tokenizer.texts_to_sequences(data['Text'])
X = pad_sequences(sequences, maxlen=maxlen)

# Map class names to integer ids, then pick the matching rows of an identity
# matrix to obtain one-hot target vectors.
le = LabelEncoder()
label_ids = le.fit_transform(data['Classification'])
y = np.eye(len(le.classes_))[label_ids]

In [None]:
# Hyper-parameters for the embedding layer
embedding_dim = 128
vocab_size = len(tokenizer.word_index) + 1

# Two stacked LSTM layers on top of a trainable embedding, finished by a
# softmax layer with one unit per class.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(LSTM(64, return_sequences=True))  # emit the full sequence for the next LSTM
model.add(LSTM(32))
model.add(Dense(len(le.classes_), activation='softmax'))  # number of classes

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Hold out 20% of the samples; the split is seeded so it can be repeated
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the network; the held-out split is also passed as validation data
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=10,
)


In [None]:
# Evaluation
# These metric functions are imported here because the notebook's import
# cell only brings in classification_report from sklearn.metrics.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Reduce the softmax outputs and the one-hot targets to class indices so the
# scikit-learn metrics can compare them directly.
y_pred_classes = model.predict(X_test).argmax(axis=1)
y_true_classes = y_test.argmax(axis=1)

# Weighted averages account for class imbalance in the test split
accuracy = accuracy_score(y_true_classes, y_pred_classes)
precision = precision_score(y_true_classes, y_pred_classes, average='weighted')
recall = recall_score(y_true_classes, y_pred_classes, average='weighted')
f1 = f1_score(y_true_classes, y_pred_classes, average='weighted')

evaluation = {
    "Metric": ["Accuracy", "Precision", "Recall", "F1 Score"],
    "Value": [accuracy, precision, recall, f1]
}

# Save evaluation metrics to CSV
eval_df = pd.DataFrame(evaluation)
eval_df.to_csv('evaluation_LSTM.csv', index=False)


# 2. MNB model

In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import joblib
import os

In [15]:
# Read the cleaned dataset used by the Naive Bayes section
data = pd.read_csv(filepath_or_buffer="cleaned data.csv")

In [17]:
# Fail fast when either column this section relies on is missing
if 'Text' not in data.columns or 'Classification' not in data.columns:
    raise ValueError("Required columns ('Text' and 'Classification') not found in the dataset.")

# Load the previously saved TF-IDF vectorizer.
# The path uses a forward slash, which resolves on Windows as well as on
# POSIX systems; the earlier backslash-separated path only worked on Windows.
vectorizer = joblib.load("model/tfidf_vectorizer.joblib")

In [18]:
# Targets are the 'Classification' column as-is; features are the TF-IDF
# representation of the 'Text' column produced by the loaded vectorizer.
y = data['Classification']
X = vectorizer.transform(data['Text'])

In [None]:
# Hold out 20% of the rows for testing, using a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a Multinomial Naive Bayes classifier with default settings
mnb = MultinomialNB()
mnb.fit(X=X_train, y=y_train)

In [None]:
# Label the held-out rows with the fitted classifier
y_pred = mnb.predict(X_test)

# Show the per-class precision / recall / F1 table
report_text = classification_report(y_test, y_pred)
print(report_text)

# Persist the fitted classifier to disk (optional)
joblib.dump(mnb, "mnb_model.joblib")

In [None]:
# Evaluate the model.
# Class names are derived from the labels present in y_test / y_pred instead
# of from `le` (the LabelEncoder created in the LSTM section), so this cell
# does not raise a NameError when that section has not been executed.
class_names = sorted(str(label) for label in set(y_test) | set(y_pred))
report = classification_report(y_test, y_pred, output_dict=True)

# Create DataFrame from the report and only include needed rows and columns:
# one row per class followed by the three summary rows.
df_report = pd.DataFrame(report).transpose()
df_report = df_report[['precision', 'recall', 'f1-score', 'support']]
df_report = df_report.loc[class_names + ['accuracy', 'macro avg', 'weighted avg'], :]

# Save the DataFrame to CSV
df_report.to_csv('evaluation_MNB.csv', index_label='class')


# 3. Random Forest Model

In [19]:
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [23]:
# Read the cleaned dataset and pull both columns out as NumPy arrays
data = pd.read_csv(filepath_or_buffer='cleaned data.csv')
labels = data['Classification'].values
texts = data['Text'].values


In [20]:
# Read the serialized tokenizer, then rebuild it after the file is closed
with open('tokenizer.json', 'r') as file:
    json_string = json.load(file)

tokenizer = tokenizer_from_json(json_string)


In [24]:
# Turn each text into token ids and pad/truncate every row to `maxlen` entries
sequences = tokenizer.texts_to_sequences(texts)
maxlen = 250
padded_sequences = pad_sequences(sequences, maxlen=maxlen)

In [None]:
# Train-test split (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Train RF classifier.
# The forest is seeded with the same random_state as the split so repeated
# runs build the same trees and the saved metrics are reproducible.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on test set
y_pred = clf.predict(X_test)

In [None]:
# Build the classification report as a nested dict
report = classification_report(y_test, y_pred, output_dict=True)

# Flip the dict-of-dicts so each report entry becomes a row, then write to CSV
df_report = pd.DataFrame(report).T
df_report.to_csv('rf_evaluation.csv')


# 4. Logistic Regression Model

In [26]:
from sklearn.linear_model import LogisticRegression



In [27]:
# Stop early unless both columns needed by this section are present
required_columns = ('Text', 'Classification')
if not all(column in data.columns for column in required_columns):
    raise ValueError("The CSV must contain 'Text' and 'Classification' columns.")


In [28]:
# Read the tokenizer's JSON payload, then reconstruct the tokenizer from it
with open('tokenizer.json', 'r') as file:
    tokenizer_data = json.load(file)

tokenizer = tokenizer_from_json(tokenizer_data)

In [29]:
# Encode the texts as token-id sequences; no maxlen is given to
# pad_sequences here, unlike the other sections.
token_sequences = tokenizer.texts_to_sequences(data['Text'])
X = pad_sequences(token_sequences)

# Replace each class name with an integer code (the unique names returned
# alongside the codes are not used further in this cell).
label_codes, label_names = pd.factorize(data['Classification'])
y = label_codes




In [None]:
# Hold out 20% of the samples with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit logistic regression; max_iter is raised to 1000 for better convergence
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out samples and build the report as a nested dict
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, output_dict=True)

# One row per report entry, written out as CSV
df_report = pd.DataFrame(report).T
df_report.to_csv('evaluation_LR.csv')


# 5. CNN Text Classification

In [30]:
# Prepare data: vocabulary size for the embedding layer and padded
# token-id sequences of fixed width.
max_features = len(tokenizer.word_index) + 1
maxlen = 250
sequences = tokenizer.texts_to_sequences(data['Text'])
data_seq = pad_sequences(sequences, maxlen=maxlen)

# Convert labels to categorical (one-hot) format.
# The dtype is set explicitly because the default of get_dummies differs
# between pandas versions (bool from 2.0, uint8 before); float32 gives the
# network numeric targets either way.
labels = pd.get_dummies(data['Classification'], dtype='float32').values


In [31]:
# Hold out 20% of the padded sequences and their one-hot labels (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    data_seq,
    labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Define CNN model
# Conv1D and GlobalMaxPooling1D are imported here because the notebook's
# import cell only brings in Embedding, LSTM and Dense from keras.layers.
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

embedding_dim = 128
filters = 128
kernel_size = 6

# Embedding -> 1D convolution -> global max pooling over the sequence axis
# -> softmax with one unit per one-hot label column.
model = Sequential([
    Embedding(max_features, embedding_dim, input_length=maxlen),
    Conv1D(filters, kernel_size, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(labels.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


# Train model; the held-out split is also passed as validation data
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))


In [None]:
# Reduce one-hot targets and predicted probabilities to class indices
y_true = y_test.argmax(axis=1)
y_pred = model.predict(X_test).argmax(axis=1)

report = classification_report(y_true, y_pred, output_dict=True)

# Collect the weighted-average metrics into a single-row DataFrame
weighted = report['weighted avg']
df_report = pd.DataFrame(
    {metric: weighted[metric] for metric in ('precision', 'recall', 'f1-score', 'support')},
    index=[0],
)