In [None]:
import warnings
# Suppress every warning for the rest of the session. Note that this also
# hides deprecation notices emitted by the libraries imported below.
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from keras.callbacks import ProgbarLogger
from sklearn.model_selection import GridSearchCV
from keras.callbacks import ProgbarLogger
import tensorflow_hub as hub

In [None]:
# Read the tweet emotion CSV; the path is relative to the current working directory.
df = pd.read_csv("tweet_emotions.csv")

In [None]:
# Put the columns in a fixed order (id, text, label); any other column is
# dropped, and a name that is absent from the CSV would come back as all-NaN.
column_order = ['tweet_id', 'content', 'sentiment']
df = df.reindex(columns=column_order)

In [None]:
# Select text and label by column name rather than by position, so this cell
# does not silently depend on the column order of the frame.
# The sampler expects a 2-D feature array, hence the (n_samples, 1) reshape.
X = df['content'].to_numpy().reshape(-1, 1)
y = df['sentiment'].to_numpy()

# Randomly duplicate rows of the under-represented sentiments to balance the
# classes. A fixed random_state makes the resampled data reproducible.
# NOTE(review): resampling happens before any train/validation/test split, so
# copies of the same tweet can land on both sides of a later split.
over = RandomOverSampler(random_state=42)
X, y = over.fit_resample(X, y)

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Map each sentiment string to an integer id.
label_encoder = LabelEncoder()

# Keep the ids zero-based (0 .. n_classes - 1). to_categorical sizes its
# output as (largest id + 1) when num_classes is not given, so shifting the
# ids up by one would add an extra, never-used class column to every one-hot
# target built from this frame.
y_encoded = label_encoder.fit_transform(y)

# X is an (n_samples, 1) array at this point; flatten it back to 1-D text.
X_reshaped = X.reshape(-1)

# Rebuild the working frame from the resampled text and the integer labels.
df = pd.DataFrame({'content': X_reshaped, 'sentiment': y_encoded})


In [None]:
# Preview the first rows of the frame as a quick sanity check.
df.head()

In [None]:
# Shuffle once with a fixed seed so the 80/10/10 split is reproducible, then
# slice by position. Plain iloc slicing replaces np.split on a DataFrame,
# which recent pandas versions flag as deprecated.
SPLIT_SEED = 42
shuffled = df.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(0.8 * len(shuffled))
val_end = int(0.9 * len(shuffled))

train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=64, classes=None):
    """Turn a frame with 'content' and 'sentiment' columns into a batched tf.data.Dataset.

    Args:
        dataframe: pandas DataFrame with a 'content' column (model input) and
            a 'sentiment' column (class label). It is not modified.
        shuffle: if True, shuffle the examples with a buffer as large as the
            frame.
        batch_size: number of examples per batch.
        classes: optional sequence containing every label value that can
            occur. When given, the label-to-index mapping and the one-hot
            width are derived from it, so they are the same for every split
            even if a split happens to miss a class. When None (default),
            both are derived from the labels present in ``dataframe``.

    Returns:
        A tf.data.Dataset yielding (content, one-hot label) batches, with
        prefetching enabled.

    Raises:
        ValueError: if ``classes`` is given and ``dataframe`` holds a label
            that is not part of it (raised by LabelEncoder.transform).
    """
    features = dataframe["content"]
    labels = dataframe["sentiment"]

    # Fit on the full class list when the caller supplies one; otherwise fall
    # back to whatever labels this particular frame contains.
    label_encoder = LabelEncoder()
    label_encoder.fit(labels if classes is None else classes)
    labels_encoded = label_encoder.transform(labels)
    num_classes = len(label_encoder.classes_)

    # One-hot encode so the targets match a softmax output layer.
    labels_one_hot = tf.keras.utils.to_categorical(labels_encoded, num_classes=num_classes)

    ds = tf.data.Dataset.from_tensor_slices((features, labels_one_hot))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

# Embedding + Model

In [None]:
# Text embedding layer loaded from TensorFlow Hub. It takes raw strings as
# input; trainable=True allows its weights to be updated during fitting.
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"
hub_layer = hub.KerasLayer(embedding, dtype=tf.string, trainable=True)

# Only the training split needs shuffling. Evaluation metrics do not depend on
# example order, so validation and test skip the full-size shuffle buffer.
train_data = df_to_dataset(train)
val_data = df_to_dataset(val, shuffle=False)
test_data = df_to_dataset(test, shuffle=False)

In [None]:
# Number of distinct sentiment classes, taken from the encoder that was fitted
# on the labels. Unlike len(set(y)), this does not break when a later cell
# rebinds y to a 2-D one-hot array and this cell is re-run.
num_classes = len(label_encoder.classes_)

In [None]:
# Baseline classifier: the hub text embedding followed by a small MLP in which
# every hidden Dense layer is followed by 40% dropout.
model = tf.keras.Sequential()
model.add(hub_layer)
for units in (16, 16, 64, 64):
    model.add(tf.keras.layers.Dense(units, activation="relu"))
    model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(num_classes, activation="softmax"))

# The output layer is a softmax over one-hot targets, so use categorical
# cross-entropy (the loss the later models in this notebook also use) rather
# than a hinge loss, which is designed for unbounded margin scores.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy']
)

# Untrained baseline: the model has not been fitted yet, so this only shows
# the loss/accuracy of the randomly initialised head on the validation data.
model.evaluate(val_data)


In [None]:
# Train for five epochs on the training batches, tracking the validation split.
history = model.fit(
    train_data,
    epochs=5,
    validation_data=val_data,
)

# Score the fitted model on the held-out test batches and report both numbers.
test_loss, test_accuracy = model.evaluate(test_data)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

# Ways to improve the model's performance:
1- Increase the model's capacity

2- Increase the model's capacity

3- Use a different optimizer

4- Increase the number of training epochs

5- Perform data preprocessing

6- Perform data preprocessing

7- Perform data preprocessing


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize the text data: build the word index, then encode each tweet as a
# list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['content'])
sequences = tokenizer.texts_to_sequences(df['content'])

# Pad (or truncate) every sequence to the same fixed length.
max_length = 100  # adjust as needed
padded_sequences = pad_sequences(sequences, maxlen=max_length)

# Convert sentiment to one-hot encoded vectors
sentiment_vectors = tf.keras.utils.to_categorical(df['sentiment'])

# Create the input and output data for the model
X = padded_sequences
y = sentiment_vectors

# Split the data into train and test sets.
# A plain head/tail slice is only a fair split if the rows are in random
# order, which is not guaranteed for a frame built from oversampler output.
# Shuffle with a fixed seed and stratify on the label instead.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=df['sentiment']
)
train_size = len(train_X)

input_shape = X.shape[1]  # number of token positions per padded sequence

# The embedding table needs one row per possible id: ids run from 1 to
# len(word_index), and 0 is the padding id. Counting the distinct ids that
# happen to appear in X can under-size the table when truncation drops words.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100  # Specify the dimensionality of the embedding

# Every row of X already has the padded length, so read it from the shape.
max_length = X.shape[1]

In [None]:
# y is the one-hot target matrix, so its column count is the width of the
# output layer the models below need.
num_classes = y.shape[1]  # Extract the number of classes from the target data

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize the text data: build the word index, then encode each tweet as a
# list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['content'])
sequences = tokenizer.texts_to_sequences(df['content'])

# Pad (or truncate) every sequence to the same fixed length.
max_length = 100  # adjust as needed
padded_sequences = pad_sequences(sequences, maxlen=max_length)

# Convert sentiment to one-hot encoded vectors
sentiment_vectors = tf.keras.utils.to_categorical(df['sentiment'])

# Create the input and output data for the model
X = padded_sequences
y = sentiment_vectors

# Split the data into train and test sets.
# A plain head/tail slice is only a fair split if the rows are in random
# order, which is not guaranteed for a frame built from oversampler output.
# Shuffle with a fixed seed and stratify on the label instead.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=df['sentiment']
)
train_size = len(train_X)

# Specify the dimensionality of the embedding
embedding_dim = 100

# Every row of X already has the padded length, so read it from the shape.
max_length = X.shape[1]

# One embedding row per word id, plus one for the padding id 0.
vocab_size = len(tokenizer.word_index) + 1

# Build the model: learned word embeddings averaged over the sequence,
# followed by two small Dense layers with dropout and a softmax output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(y.shape[1], activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for five epochs. verbose=1 lets Keras attach its own per-step progress
# bar; building a ProgbarLogger by hand while passing verbose=0 is unnecessary,
# and its count_mode argument is not accepted by every Keras version.
# NOTE(review): the test split doubles as validation data here, so the
# validation curves are not independent of the final test score.
history = model.fit(
    train_X,
    train_y,
    batch_size=64,
    epochs=5,
    validation_data=(test_X, test_y),
    verbose=1,
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(test_X, test_y)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

In [None]:
# One figure per metric, each comparing the training curve with the
# validation curve across epochs: accuracy first, then loss.
for metric, title, ylabel, legend_loc in (
    ('accuracy', 'Model Accuracy', 'Accuracy', 'upper left'),
    ('loss', 'Model Loss', 'Loss', 'upper right'),
):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(ylabel)
    plt.legend(['Train', 'Validation'], loc=legend_loc)
    plt.show()

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load the dataset
# NOTE: this rebinds df to the raw CSV contents, replacing the resampled,
# integer-labelled frame that earlier cells stored under the same name.
df = pd.read_csv("tweet_emotions.csv")

In [None]:
# Fit a fresh tokenizer on the tweet text and encode every tweet as a list of
# integer word ids.
# NOTE(review): the split cell that follows reuses the existing X and y rather
# than these sequences - confirm whether a padding / one-hot step is missing.
texts = df['content']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [None]:
# Width of each learned word vector in the embedding layer.
embedding_dim = 100

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Define the model: word embeddings, two stacked bidirectional LSTMs (the
# first returns the full sequence so the second can consume it), then a Dense
# layer with dropout and a softmax output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model. Without this step the evaluation below would only report
# the metrics of randomly initialised weights. The settings mirror the fit
# calls used for the RMSprop variant of this architecture.
history = model.fit(train_X, train_y, epochs=5, batch_size=16, validation_data=(test_X, test_y))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(test_X, test_y)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)


In [None]:
from tensorflow.keras.optimizers import RMSprop

# Same stacked bidirectional-LSTM classifier, declared as a single layer list.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])

# This variant is optimised with RMSprop instead of Adam.
optimizer = RMSprop(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# The model has not been fitted at this point, so these figures describe the
# freshly initialised network on the test split.
test_loss, test_accuracy = model.evaluate(test_X, test_y)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)


In [None]:
# Train for five epochs with mini-batches of 16, validating on the test split.
# The History object returned here is not stored in a variable.
model.fit(
    train_X,
    train_y,
    epochs=5,
    batch_size=16,
    validation_data=(test_X, test_y),
)

In [None]:
# Call fit on the same model object for five epochs (batch size 16, test split
# as validation data) and keep the returned History for the summary below.
history = model.fit(
    train_X,
    train_y,
    epochs=5,
    batch_size=16,
    validation_data=(test_X, test_y),
)

In [None]:
# Collect the last recorded value of every metric in the most recent History,
# then report three of them rounded to two decimals.
final_epoch = {metric: values[-1] for metric, values in history.history.items()}

# Training accuracy after the last epoch
last_accuracy = round(final_epoch['accuracy'], 2)
print("Last Accuracy:", last_accuracy)

# Training loss after the last epoch
last_loss = round(final_epoch['loss'], 2)
print("Last Loss:", last_loss)

# Validation accuracy after the last epoch
val_accuracy = round(final_epoch['val_accuracy'], 2)
print("Validation Accuracy:", val_accuracy)
