<a href="https://colab.research.google.com/github/HassanCoulibaly/MachineLearningProject/blob/main/Machine_learning_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, LSTM, Dropout
import os

In [2]:
# Read the pre-split train and test CSV files into DataFrames. Later cells
# read the 'tweet' and 'sentiment' columns from both frames.
train_df, test_df = map(pd.read_csv, ('dataset/train_df.csv', 'dataset/test_df.csv'))


In [3]:
# Turn the 'sentiment' strings into integer class ids. The encoder is fitted
# on the fixed list of three class names rather than on the data itself, so
# both splits are transformed with the same, data-independent class set.
label_encoder = LabelEncoder().fit(['negative', 'neutral', 'positive'])
train_labels, test_labels = (
    label_encoder.transform(frame['sentiment']) for frame in (train_df, test_df)
)

In [7]:
# Text-preprocessing hyperparameters shared with the embedding and model cells.
vocab_size, max_length, embedding_dim = 10000, 40, 50

# Build the vocabulary from the training tweets only; the test tweets are
# converted with that same fitted tokenizer, using "<OOV>" as the
# out-of-vocabulary token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['tweet'])
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(frame['tweet']) for frame in (train_df, test_df)
)

# Bring every sequence to exactly max_length ids, padding and truncating at the end.
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    for sequences in (train_sequences, test_sequences)
)

# One-hot encode the integer labels into three columns.
y_train, y_test = (
    tf.keras.utils.to_categorical(labels, num_classes=3)
    for labels in (train_labels, test_labels)
)

print("Training data shape:", X_train.shape)
print("Training labels shape:", y_train.shape)

Training data shape: (53368, 40)
Training labels shape: (53368, 3)


In [8]:
glove_path = 'dataset/glove.6B.50d.txt'

# Parse the GloVe text file into a {token: float32 vector} lookup. Each line
# is split on whitespace: the first field is the token, the rest its vector.
embeddings_index = {}
with open(glove_path, encoding='utf-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Assemble the (vocab_size, embedding_dim) weight matrix. Row `index` holds
# the pre-trained vector for the tokenizer word with that index; words with
# an index outside the matrix are skipped, and words absent from the GloVe
# file keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, index in tokenizer.word_index.items():
    if index >= vocab_size:
        continue
    vector = embeddings_index.get(token)
    if vector is not None:
        embedding_matrix[index] = vector


In [9]:
# CNN + LSTM sentiment classifier on top of frozen pre-trained embeddings.
#
# The input shape is declared with an explicit Input layer. The Embedding
# `input_length` argument used before is deprecated in Keras 3: it is ignored
# with a warning, which leaves the Sequential model without a known input
# shape, so it is only built on first call and `summary()` cannot report
# output shapes or parameter counts.
cnn_lstm_model = Sequential([
    # Padded integer token-id sequences of length max_length.
    tf.keras.Input(shape=(max_length,), dtype='int32'),
    # embedding_matrix supplies the initial weights; trainable=False keeps
    # them fixed during training.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim,
              weights=[embedding_matrix], trainable=False),
    # Three stacked width-3 convolutions followed by a pooling step that
    # halves the sequence length before the recurrent layer.
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    # Three softmax outputs, matching the three one-hot label columns.
    Dense(3, activation='softmax')
])

# categorical_crossentropy pairs with the one-hot labels built by to_categorical.
cnn_lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_lstm_model.summary()



In [10]:
# Train for 5 epochs with mini-batches of 32. The History object returned by
# fit() is kept so the per-epoch loss/accuracy values can be inspected or
# plotted afterwards (previously it was discarded and only shown as a repr).
# NOTE(review): the test split is also used as validation data here, so the
# val_* metrics are not an independent hold-out if they guide model tuning.
# NOTE(review): no random seed is set in the visible cells, so the reported
# numbers will vary from run to run.
history = cnn_lstm_model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=32,
)

Epoch 1/5
[1m1668/1668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 33ms/step - accuracy: 0.4923 - loss: 0.9651 - val_accuracy: 0.5752 - val_loss: 0.8963
Epoch 2/5
[1m1668/1668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 33ms/step - accuracy: 0.5869 - loss: 0.8554 - val_accuracy: 0.5816 - val_loss: 0.9167
Epoch 3/5
[1m1668/1668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 32ms/step - accuracy: 0.6140 - loss: 0.8158 - val_accuracy: 0.6013 - val_loss: 0.8511
Epoch 4/5
[1m1668/1668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 32ms/step - accuracy: 0.6242 - loss: 0.7988 - val_accuracy: 0.6132 - val_loss: 0.8442
Epoch 5/5
[1m1668/1668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 32ms/step - accuracy: 0.6384 - loss: 0.7778 - val_accuracy: 0.6193 - val_loss: 0.8392


<keras.src.callbacks.history.History at 0x79c3d00f8f50>