In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt


In [2]:
# Load your dataset
df = pd.read_csv('Roman_Urdu.csv', encoding = "latin1")


In [8]:
# Preprocessing
df['tweets'] = df['tweets'].astype(str)  # Ensure all data in the 'tweets' column are strings
df['tweets'].fillna('', inplace=True)    # Handle any NaN values by replacing them with empty strings

X = df['tweets'].values
y = df['label'].values


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['tweets'].fillna('', inplace=True)    # Handle any NaN values by replacing them with empty strings


In [9]:
# Encode the labels
le = LabelEncoder()
y = le.fit_transform(y)


In [10]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
# Tokenization and padding
max_words = 5000
max_len = 100

tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)


In [12]:
# LSTM Model
embedding_dim = 128
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])




In [13]:
# Training the model
history = model.fit(X_train_pad, y_train, epochs=10, batch_size=64, validation_split=0.1, verbose=2)


Epoch 1/10
338/338 - 52s - 155ms/step - accuracy: 0.7787 - loss: 0.4591 - val_accuracy: 0.8575 - val_loss: 0.3281
Epoch 2/10
338/338 - 43s - 126ms/step - accuracy: 0.8832 - loss: 0.2828 - val_accuracy: 0.8675 - val_loss: 0.3098
Epoch 3/10
338/338 - 40s - 118ms/step - accuracy: 0.9006 - loss: 0.2410 - val_accuracy: 0.8604 - val_loss: 0.3178
Epoch 4/10
338/338 - 39s - 116ms/step - accuracy: 0.9136 - loss: 0.2124 - val_accuracy: 0.8658 - val_loss: 0.3259
Epoch 5/10
338/338 - 38s - 113ms/step - accuracy: 0.9257 - loss: 0.1869 - val_accuracy: 0.8608 - val_loss: 0.3401
Epoch 6/10
338/338 - 40s - 118ms/step - accuracy: 0.9326 - loss: 0.1657 - val_accuracy: 0.8596 - val_loss: 0.4199
Epoch 7/10
338/338 - 41s - 120ms/step - accuracy: 0.9419 - loss: 0.1462 - val_accuracy: 0.8492 - val_loss: 0.4285
Epoch 8/10
338/338 - 44s - 130ms/step - accuracy: 0.9485 - loss: 0.1291 - val_accuracy: 0.8504 - val_loss: 0.4683
Epoch 9/10
338/338 - 39s - 115ms/step - accuracy: 0.9548 - loss: 0.1145 - val_accuracy: 

In [14]:
# Evaluating the model
y_pred_prob = model.predict(X_test_pad)
y_pred = (y_pred_prob > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(classification_report(y_test, y_pred))


[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 34ms/step
Accuracy: 0.8411666666666666
Precision: 0.8420032310177706
Recall: 0.8488599348534202
F1 Score: 0.8454176804541768
              precision    recall  f1-score   support

           0       0.84      0.83      0.84      2930
           1       0.84      0.85      0.85      3070

    accuracy                           0.84      6000
   macro avg       0.84      0.84      0.84      6000
weighted avg       0.84      0.84      0.84      6000

