In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, GRU, LSTM, Bidirectional, Dense, Dropout
from sklearn.metrics import precision_score, recall_score, f1_score

# Load Dataset

In [56]:
data = pd.read_csv('urdu-sentiment-corpus-v1.tsv', sep='\t', header=None, names=['Tweet', 'Class'])
data.head(10)

Unnamed: 0,Tweet,Class
0,Tweet,Class
1,میں نے ایٹم بم بنایا ھے ۔۔۔۔او بھائی ایٹم بمب ...,P
2,چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...,N
3,ٹویٹر کا خیال کیسے آیا ؟,O
4,"سرچ انجن گوگل کے نائب صدر نے فضا میں ، 130,000...",P
5,ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار :أْ,P
6,گندی زبان اور گٹر جیسے دماغ والے جاهل جیالے ه...,N
7,"قاتل بھی تم مقتول بھی تم,ظالم بھی ہم اور مظلوم...",N
8,ﻧﻮﺭ ﺑﻐﺪﺍﺩ ﮐﯽ ﮔﻠﯿﻮﮞ ﮐﺎ ﮨﮯ ﮨﺮ ﺍﯾﮏ ﮐﺮﻥ ﺍﺱ ﮐﯽ ﻣﺪﯾﻨ...,P
9,یہ لفظوں كى شرارت ھے سنبهل کر کچھ بهى لِكهنا ...,P


# Filter out entries that are not positive or negative

In [57]:
data = data[(data['Class'] == 'P') | (data['Class'] == 'N')]
data

Unnamed: 0,Tweet,Class
1,میں نے ایٹم بم بنایا ھے ۔۔۔۔او بھائی ایٹم بمب ...,P
2,چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...,N
4,"سرچ انجن گوگل کے نائب صدر نے فضا میں ، 130,000...",P
5,ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار :أْ,P
6,گندی زبان اور گٹر جیسے دماغ والے جاهل جیالے ه...,N
...,...,...
996,اُس آدمی نے اِس سالار کو کافی معقول ٹپ دی ہے ۔,P
997,چچا غالب کی روح سے معذرت کے ساتھہم نے مانا کہ ...,P
998,واہ جناب واہ! اچھی رہی۔ جناب خود کو فرشتہ سمجو...,P
999,اسلام آباد :پی اے ٹی کا دھرنا ختم، صفائی کے کا...,P


# Encode class labels to numerical values (P: 1, N: 0)

In [58]:
label_encoder = LabelEncoder()
data['Class'] = label_encoder.fit_transform(data['Class'])

# Tokenize text

In [59]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['Tweet'])
sequences = tokenizer.texts_to_sequences(data['Tweet'])

In [60]:
max_sequence_len = max([len(x) for x in sequences])
X = pad_sequences(sequences, maxlen=max_sequence_len)
y = data['Class'].values

# Split the data into training and testing sets

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [62]:
vocab_size = len(tokenizer.word_index) + 1

def build_model(model_type, input_length, vocab_size, dropout_rate):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=input_length))

    if model_type == 'RNN':
        model.add(SimpleRNN(units=100, dropout=dropout_rate, recurrent_dropout=dropout_rate))
    elif model_type == 'GRU':
        model.add(GRU(units=100, dropout=dropout_rate, recurrent_dropout=dropout_rate))
    elif model_type == 'LSTM':
        model.add(LSTM(units=100, dropout=dropout_rate, recurrent_dropout=dropout_rate))
    elif model_type == 'BiLSTM':
        model.add(Bidirectional(LSTM(units=100, dropout=dropout_rate, recurrent_dropout=dropout_rate)))

    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [63]:
rnn_model = build_model('RNN', max_sequence_len, vocab_size, dropout_rate=0.3)
gru_model = build_model('GRU', max_sequence_len, vocab_size, dropout_rate=0.3)
lstm_model = build_model('LSTM', max_sequence_len, vocab_size, dropout_rate=0.3)
bilstm_model = build_model('BiLSTM', max_sequence_len, vocab_size, dropout_rate=0.3)

 # Training and Evaluation

In [64]:
from tabulate import tabulate

print("\n ***************** Training and Evaluation *****************")

def train_and_evaluate(model, model_name):
    print(f"\nTraining {model_name} Model:")
    model.fit(X_train, y_train, batch_size=64, epochs=10, validation_split=0.2)
    predictions = model.predict(X_test) > 0.5
    accuracy = model.evaluate(X_test, y_test)[1]
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f_score = f1_score(y_test, predictions)
    return [model_name, accuracy, precision, recall, f_score]

results = []
results.append(train_and_evaluate(rnn_model, " ***************** RNN Model *****************"))
results.append(train_and_evaluate(gru_model, " ***************** ***************** GRU Model *****************"))
results.append(train_and_evaluate(lstm_model, " ***************** LSTM Model *****************"))
results.append(train_and_evaluate(bilstm_model, " ***************** BiLSTM Model *****************"))

print("\nResults:")
print(tabulate(results, headers=["Model", "Accuracy", "Precision", "Recall", "F-score"], tablefmt="grid"))


 ***************** Training and Evaluation *****************

Training  ***************** RNN Model ***************** Model:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Training  ***************** ***************** GRU Model ***************** Model:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Training  ***************** LSTM Model ***************** Model:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Training  ***************** BiLSTM Model ***************** Model:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Results:
+-----------------------------------------------------------------+------------+-------------+----------+-----------+
| Model                                                           |   Accuracy |   Preci