In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, GRU, LSTM, Bidirectional, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load the dataset
data = pd.read_csv('urdu-sentiment-corpus-v1.tsv', delimiter='\t', header=None, names=['Tweet', 'Class'])

# Display the first few rows of the dataset
print(data.head())

# Preprocess the dataset
label_encoder = LabelEncoder()
data['Class'] = label_encoder.fit_transform(data['Class'])

# Fill NaN values in the target column with the most frequent class label
data['Class'].fillna(data['Class'].mode()[0], inplace=True)

# Convert any unknown values to NaN
# data['Class'] = pd.to_numeric(data['Class'], errors='coerce')

# Drop rows with NaN values in the target column
data.dropna(subset=['Class'], inplace=True)

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['Tweet'])
X = tokenizer.texts_to_sequences(data['Tweet'])
X = pad_sequences(X, padding='post')

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, data['Class'], test_size=0.25, random_state=42)

# Mapping 'P' class to 1 (positive sentiment) and other classes to 0 (negative sentiment)
y_test_binary = (y_test == 'P').astype(int)

# Define hyperparameters
num_layers = [2, 3]
dropout_rates = [0.3, 0.7]

results = []

for model_type in ['RNN', 'GRU', 'LSTM', 'BiLSTM']:
    for num_layer in num_layers:
        for dropout_rate in dropout_rates:
            # Build the model
            model = Sequential()
            model.add(Embedding(len(tokenizer.word_index) + 1, 100, input_length=X.shape[1]))
            for _ in range(num_layer):
                if model_type == 'RNN':
                    model.add(SimpleRNN(64, return_sequences=True))
                elif model_type == 'GRU':
                    model.add(GRU(64, return_sequences=True))
                elif model_type == 'LSTM':
                    model.add(LSTM(64, return_sequences=True))
                elif model_type == 'BiLSTM':
                    model.add(Bidirectional(LSTM(64, return_sequences=True)))
                model.add(Dropout(dropout_rate))
            model.add(Dense(1, activation='sigmoid'))
            model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

            # Train the model
            model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=0)

            # Evaluate the model
            y_pred_probs = model.predict(X_test)
            y_pred = np.where(y_pred_probs > 0.5, 1, 0)  # Apply threshold to get class labels

            # Calculate evaluation metrics
            accuracy = accuracy_score(y_test_binary, y_pred)
            precision = precision_score(y_test_binary, y_pred)
            recall = recall_score(y_test_binary, y_pred)
            f1 = f1_score(y_test_binary, y_pred)

            # Append results to the list
            results.append([model_type, num_layer, dropout_rate, accuracy, precision, recall, f1])

# Create a DataFrame to display results
results_df = pd.DataFrame(results, columns=['Model', 'Num_Layers', 'Dropout_Rate', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])
print(results_df)

                                               Tweet  Class
0                                              Tweet  Class
1  میں نے ایٹم بم بنایا ھے ۔۔۔۔او بھائی ایٹم بمب ...      P
2  چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...      N
3                           ٹویٹر کا خیال کیسے آیا ؟      O
4  سرچ انجن گوگل کے نائب صدر نے فضا میں ، 130,000...      P


ValueError: Classification metrics can't handle a mix of binary and unknown targets