In [None]:
import builtins

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from keras.callbacks import EarlyStopping
from keras.layers import Embedding, GRU, Dense
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the data set

In [None]:
# Path of the tab-separated corpus; the first row holds the column names.
file = 'urdu-sentiment-corpus-v1.tsv'
df = pd.read_csv(
    file,
    sep='\t',
    header=0,
)

# Filter out the 'O' class and label-encode the remaining classes

In [None]:
# Drop the tweets labelled 'O'. The explicit copy makes the column assignment
# below write to an independent frame instead of a slice of the loaded data,
# which is what triggers pandas' SettingWithCopyWarning.
df = df.loc[df['Class'] != 'O'].copy()

# Replace the remaining string labels with integer codes. The fitted encoder
# is kept so the codes can be mapped back to the original labels.
# NOTE(review): the sigmoid / binary_crossentropy models further down assume
# exactly two classes remain after this filter — confirm against the corpus.
label_encoder = LabelEncoder()
df['Class'] = label_encoder.fit_transform(df['Class'])

# Train-test split

In [None]:
# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split repeatable.
tweets, labels = df['Tweet'], df['Class']
X_train, X_test, y_train, y_test = train_test_split(
    tweets,
    labels,
    test_size=0.2,
    random_state=42,
)

# Tokenize tweets

In [None]:
# Build the word -> index vocabulary from the training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Encode both splits as integer sequences using that training vocabulary.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
# Tokenizer indices start at 1 and 0 is the padding value, hence the "+ 1".
# `builtins.len` is called explicitly because this notebook rebinds the global
# name `len` (see the alias at the bottom of this cell); a plain `len(...)`
# here raises TypeError as soon as the cell is executed a second time.
vocab_size = builtins.len(tokenizer.word_index) + 1

# Pad to the longest *tokenized* training sequence. Counting `s.split()`
# tokens instead can disagree with the Keras tokenizer (which also filters
# punctuation), which would truncate or over-pad the sequences.
max_len = max(builtins.len(seq) for seq in X_train_seq)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# Legacy alias: later cells pass `input_length=len`. It shadows the built-in
# `len`, so prefer `max_len` and remove this line once no cell relies on it.
len = max_len

# GRU model without Word2Vec embeddings

In [None]:
def GRU_without_Embedding(input_dim=None, input_length=None, embedding_dim=100, gru_units=100):
    """Build and compile the baseline GRU classifier.

    "Without embedding" means without *pre-trained* vectors: the model still
    starts with an Embedding layer, but its weights are learned from scratch
    during training.

    Architecture: Embedding -> GRU (returns the full sequence) -> GRU ->
    Dense(1, sigmoid), compiled with Adam and binary cross-entropy.

    Args:
        input_dim: Vocabulary size for the Embedding layer. Defaults to the
            notebook-level ``vocab_size``.
        input_length: Length of the padded input sequences. Defaults to the
            second dimension of the notebook-level ``X_train_padded``, so the
            function does not depend on the notebook's shadowed ``len`` name.
        embedding_dim: Size of each learned word vector.
        gru_units: Number of units in each of the two GRU layers.

    Returns:
        The compiled, untrained ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = vocab_size
    if input_length is None:
        input_length = X_train_padded.shape[1]

    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dim, input_length=input_length))
    # The first GRU must emit one output per timestep so the second GRU
    # receives a sequence rather than a single vector.
    model.add(GRU(gru_units, return_sequences=True))
    model.add(GRU(gru_units))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Build a fresh baseline model and train it for 10 epochs. The held-out split
# is passed as validation data, so the per-epoch validation metrics are
# computed on it.
gru_without_emb = GRU_without_Embedding()
gru_without_emb.fit(
    X_train_padded,
    y_train,
    validation_data=(X_test_padded, y_test),
    epochs=10,
    batch_size=64,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78d9bdb20df0>

# Word2Vec model

In [None]:
# Train Word2Vec on the same tokens the Keras tokenizer produced, recovered
# from the integer sequences. Training on `sentence.split()` instead yields
# tokens that can differ from the keys of `tokenizer.word_index`, and every
# such word would silently end up with an all-zero embedding row.
embedding_dim = 100
train_tokens = [
    [tokenizer.index_word[idx] for idx in seq] for seq in X_train_seq
]
word2vec_model = Word2Vec(sentences=train_tokens, vector_size=embedding_dim, window=5, min_count=1, workers=4)

# Row i of the matrix holds the vector of the word with tokenizer index i;
# row 0 (padding) stays all zeros.
word2vec_embedding_matrix = np.zeros((vocab_size, embedding_dim))
# A counter is used instead of len(): the name `len` is rebound to an integer
# earlier in this notebook.
missing_count = 0
for word, i in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        word2vec_embedding_matrix[i] = word2vec_model.wv[word]
    else:
        missing_count += 1

# Surface missing vectors instead of hiding them behind a swallowed KeyError.
if missing_count:
    print(f"Warning: {missing_count} of {vocab_size - 1} vocabulary words have no Word2Vec vector and keep a zero embedding.")

# GRU model with Word2Vec embeddings

In [None]:
# GRU classifier on top of the frozen Word2Vec vectors:
# Embedding (pre-trained, not trainable) -> GRU -> GRU -> Dense(1, sigmoid).
# The Embedding dimensions are read from the matrix it is initialised with and
# the sequence length from the padded data, so they cannot drift out of sync
# and the cell does not depend on the notebook's shadowed `len` name.
w2v_vocab_size, w2v_dim = word2vec_embedding_matrix.shape
gru_with_w2vec = Sequential()
gru_with_w2vec.add(Embedding(
    input_dim=w2v_vocab_size,
    output_dim=w2v_dim,
    weights=[word2vec_embedding_matrix],
    input_length=X_train_padded.shape[1],
    trainable=False,
))
# The first GRU returns the full sequence so the second GRU can consume it.
gru_with_w2vec.add(GRU(100, return_sequences=True))
gru_with_w2vec.add(GRU(100))
gru_with_w2vec.add(Dense(1, activation='sigmoid'))
gru_with_w2vec.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the Word2Vec-initialised model with the same data and settings as the
# baseline model so the two runs are comparable.
gru_with_w2vec.fit(
    X_train_padded,
    y_train,
    validation_data=(X_test_padded, y_test),
    epochs=10,
    batch_size=64,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78d9adcc0e50>

In [None]:
def evaluate_model(model, X_test_padded, y_test, threshold=0.5):
    """Score a binary classifier on a held-out set.

    Args:
        model: Fitted model whose ``predict`` returns one probability per
            sample (here a sigmoid output).
        X_test_padded: Padded input sequences to predict on.
        y_test: True binary labels for those sequences.
        threshold: Probabilities strictly greater than this value are counted
            as the positive class. Defaults to 0.5.

    Returns:
        Tuple ``(accuracy, precision, recall, f_score)``.
    """
    probabilities = model.predict(X_test_padded)
    # Flatten the (n_samples, 1) prediction column into the 1-D label vector
    # the metric functions expect.
    y_pred = (probabilities > threshold).astype('int32').ravel()

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 yields an explicit 0 (instead of a warning) when a
    # metric is undefined, e.g. when the model predicts no positives.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f_score = f1_score(y_test, y_pred, zero_division=0)
    return accuracy, precision, recall, f_score

In [None]:
# Score both trained models on the same held-out split.
(
    accuracy_without_embeddings,
    precision_without_embeddings,
    recall_without_embeddings,
    f_score_without_embeddings,
) = evaluate_model(gru_without_emb, X_test_padded, y_test)

(
    accuracy_word2vec,
    precision_word2vec,
    recall_word2vec,
    f_score_word2vec,
) = evaluate_model(gru_with_w2vec, X_test_padded, y_test)



In [None]:
# Report the baseline model's metrics, one per line.
print(
    "Results:",
    "GRU without embeddings:",
    f"Accuracy: {accuracy_without_embeddings}",
    f"Precision: {precision_without_embeddings}",
    f"Recall: {recall_without_embeddings}",
    f"F-score: {f_score_without_embeddings}",
    sep="\n",
)

Results:
GRU without embeddings:
Accuracy: 0.6173469387755102
Precision: 0.6351351351351351
Recall: 0.49473684210526314
F-score: 0.5562130177514794


In [None]:
# Report the Word2Vec-initialised model's metrics, one per line.
print(
    "\nGRU with Word2Vec embeddings:",
    f"Accuracy: {accuracy_word2vec}",
    f"Precision: {precision_word2vec}",
    f"Recall: {recall_word2vec}",
    f"F-score: {f_score_word2vec}",
    sep="\n",
)


GRU with Word2Vec embeddings:
Accuracy: 0.5204081632653061
Precision: 1.0
Recall: 0.010526315789473684
F-score: 0.020833333333333332
