# Q1) Sentiment Analysis

In [4]:
# For deep learning
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense, Bidirectional, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [29]:
# For transformers (BERT, XLM-R)
!pip install -q transformers

from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import XLMRobertaTokenizer, TFXLMRobertaForSequenceClassification

import torch
from torch.optim import AdamW

# To ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [21]:
df = pd.read_csv('/kaggle/input/urduuusentiment/urdu-sentiment-corpus-v1.tsv', sep='\t')

df.head()


Unnamed: 0,Tweet,Class
0,میں نے ایٹم بم بنایا ھے ۔۔۔۔او بھائی ایٹم بمب ...,P
1,چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...,N
2,ٹویٹر کا خیال کیسے آیا ؟,O
3,"سرچ انجن گوگل کے نائب صدر نے فضا میں ، 130,000...",P
4,ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار :أْ,P


In [22]:
# Drop "Other" (O) classes because we want only P and N
df = df[df['Class'].isin(['P', 'N'])]

# Reset index (optional)
df = df.reset_index(drop=True)

# Check
print(df['Class'].value_counts())
df.head()

Class
N    499
P    480
Name: count, dtype: int64


Unnamed: 0,Tweet,Class
0,میں نے ایٹم بم بنایا ھے ۔۔۔۔او بھائی ایٹم بمب ...,P
1,چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...,N
2,"سرچ انجن گوگل کے نائب صدر نے فضا میں ، 130,000...",P
3,ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار :أْ,P
4,گندی زبان اور گٹر جیسے دماغ والے جاهل جیالے ه...,N


In [23]:
# Encode labels: P -> 1, N -> 0
df['label'] = df['Class'].map({'P': 1, 'N': 0})

# Separate texts and labels
texts = df['Tweet'].values
labels = df['label'].values


In [24]:
# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.25, random_state=42, stratify=labels
)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")


Training samples: 734
Testing samples: 245


In [13]:
lengths = [len(text.split()) for text in X_train]
print(f"Average tweet length: {np.mean(lengths)}")
print(f"95th percentile length: {np.percentile(lengths, 95)}")


Average tweet length: 17.569482288828336
95th percentile length: 30.0


In [14]:
all_text = ' '.join(X_train)
unique_words = set(all_text.split())
print(f"Total unique words: {len(unique_words)}")


Total unique words: 4617


In [30]:
# Tokenization
vocab_size = 10000  
max_length = 50  

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')


# Custom Sequence Models

**RNN Implementation**

In [19]:
def build_rnn_model(vocab_size, max_length):
    model = Sequential()
    # Embedding layer with proper input shape
    model.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=max_length))
    model.add(SimpleRNN(64, return_sequences=False))  # Simple RNN layer
    model.add(Dropout(0.5))  # Dropout for regularization
    model.add(Dense(1, activation='sigmoid'))  # Output layer with sigmoid for binary classification
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])  # Compilation
    return model


In [25]:
# Train Simple RNN
history = rnn_model.fit(
    X_train_pad, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_pad, y_test)
)

Epoch 1/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9994 - loss: 0.0021 - val_accuracy: 0.5306 - val_loss: 1.9161
Epoch 2/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9951 - loss: 0.0060 - val_accuracy: 0.5306 - val_loss: 2.0603
Epoch 3/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9990 - loss: 0.0020 - val_accuracy: 0.5265 - val_loss: 1.9886
Epoch 4/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0031 - val_accuracy: 0.5347 - val_loss: 1.9757
Epoch 5/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9996 - loss: 0.0015 - val_accuracy: 0.5184 - val_loss: 1.9692
Epoch 6/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9993 - loss: 0.0019 - val_accuracy: 0.5265 - val_loss: 2.0104
Epoch 7/20
[1m23/23[0m [32m━━━━━━━━━

In [26]:
# Predictions
y_pred_rnn = (rnn_model.predict(X_test_pad) > 0.5).astype('int32')

# Metrics
print("Simple RNN Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_rnn))
print("Precision:", precision_score(y_test, y_pred_rnn))
print("Recall:", recall_score(y_test, y_pred_rnn))
print("F1 Score:", f1_score(y_test, y_pred_rnn))


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Simple RNN Results:
Accuracy: 0.49387755102040815
Precision: 0.4852941176470588
Recall: 0.55
F1 Score: 0.5156249999999999


**Hyperparameter Tuning for RNN**

In [27]:
!pip install -q keras-tuner


In [28]:
import keras_tuner as kt
from tensorflow.keras.layers import Embedding, SimpleRNN, Dropout, Dense, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential

# Define model-building function with tunable hyperparameters
def build_rnn_model(hp):
    model = Sequential()
    
    # Tune the embedding output dimension
    model.add(Embedding(input_dim=vocab_size, 
                        output_dim=hp.Int('embedding_dim', min_value=64, max_value=256, step=64), 
                        input_length=max_length))
    
    # Tune RNN units and add BiRNN
    model.add(Bidirectional(SimpleRNN(
        units=hp.Int('rnn_units', min_value=64, max_value=256, step=64), 
        return_sequences=False)))
    
    # Tune dropout rate
    model.add(Dropout(rate=hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)))
    
    # Output layer
    model.add(Dense(1, activation='sigmoid'))
    
    # Compile model with tunable learning rate
    model.compile(
        loss='binary_crossentropy', 
        optimizer=Adam(learning_rate=hp.Float('learning_rate', min_value=1e-5, max_value=1e-2, sampling='log')),
        metrics=['accuracy']
    )
    
    return model


In [29]:
# Define the Hyperparameter tuner using RandomSearch or Hyperband
tuner = kt.RandomSearch(
    build_rnn_model,
    objective='val_accuracy',
    max_trials=5,  # Number of models to try
    executions_per_trial=3,  # Number of models to run for each trial
    directory='rnn_tuning',
    project_name='rnn_hyperparam_tuning'
)

# Perform the search using the training data
tuner.search(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test))

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best Hyperparameters: {best_hps}")


Trial 5 Complete [00h 00m 34s]
val_accuracy: 0.5646258393923441

Best val_accuracy So Far: 0.5646258393923441
Total elapsed time: 00h 02m 51s
Best Hyperparameters: <keras_tuner.src.engine.hyperparameters.hyperparameters.HyperParameters object at 0x7dd43836a650>


In [30]:
# Build the model with the best hyperparameters
best_model = tuner.hypermodel.build(best_hps)

# Train the model with the best hyperparameters
history = best_model.fit(
    X_train_pad, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_pad, y_test)
)

# Evaluate the model
y_pred_best = (best_model.predict(X_test_pad) > 0.5).astype('int32')

# Metrics
print("Best Model Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_best))
print("Precision:", precision_score(y_test, y_pred_best))
print("Recall:", recall_score(y_test, y_pred_best))
print("F1 Score:", f1_score(y_test, y_pred_best))


Epoch 1/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 120ms/step - accuracy: 0.5302 - loss: 0.6885 - val_accuracy: 0.4857 - val_loss: 0.7026
Epoch 2/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.6014 - loss: 0.6718 - val_accuracy: 0.5061 - val_loss: 0.6976
Epoch 3/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7102 - loss: 0.6350 - val_accuracy: 0.4816 - val_loss: 0.7008
Epoch 4/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8571 - loss: 0.5486 - val_accuracy: 0.4939 - val_loss: 0.7155
Epoch 5/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9288 - loss: 0.4008 - val_accuracy: 0.5102 - val_loss: 0.7569
Epoch 6/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9673 - loss: 0.2401 - val_accuracy: 0.5429 - val_loss: 0.7720
Epoch 7/20
[1m23/23[0m [32m━━━

**GRU Implementation**

In [41]:
# Define model-building function for GRU with tunable hyperparameters
def build_gru_model(hp):
    model = Sequential()
    
    # Tune the embedding output dimension
    model.add(Embedding(input_dim=vocab_size, 
                        output_dim=hp.Int('embedding_dim', min_value=64, max_value=256, step=64), 
                        input_length=max_length))
    
    # Tune GRU units and add Bidirectional GRU
    model.add(Bidirectional(GRU(
        units=hp.Int('gru_units', min_value=64, max_value=256, step=64), 
        return_sequences=False)))
    
    # Tune dropout rate
    model.add(Dropout(rate=hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)))
    
    # Output layer
    model.add(Dense(1, activation='sigmoid'))
    
    # Compile model with tunable learning rate
    model.compile(
        loss='binary_crossentropy', 
        optimizer=Adam(learning_rate=hp.Float('learning_rate', min_value=1e-5, max_value=1e-2, sampling='log')),
        metrics=['accuracy']
    )
    
    return model


In [42]:
# Define the Hyperparameter tuner for GRU model using RandomSearch
gru_tuner = kt.RandomSearch(
    build_gru_model,
    objective='val_accuracy',
    max_trials=5,  # Number of models to try
    executions_per_trial=3,  # Number of models to run for each trial
    directory='gru_tuning',
    project_name='gru_hyperparam_tuning'
)

# Perform the search using the training data
gru_tuner.search(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test))

# Get the best hyperparameters
best_gru_hps = gru_tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best GRU Hyperparameters: {best_gru_hps}")


Trial 5 Complete [00h 00m 21s]
val_accuracy: 0.6108843485514323

Best val_accuracy So Far: 0.6108843485514323
Total elapsed time: 00h 01m 51s
Best GRU Hyperparameters: <keras_tuner.src.engine.hyperparameters.hyperparameters.HyperParameters object at 0x7dd438f62c90>


In [43]:
# Build the GRU model with the best hyperparameters
best_gru_model = gru_tuner.hypermodel.build(best_gru_hps)

# Train the GRU model with the best hyperparameters
history_gru = best_gru_model.fit(
    X_train_pad, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_pad, y_test)
)

Epoch 1/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 31ms/step - accuracy: 0.5306 - loss: 0.6938 - val_accuracy: 0.5143 - val_loss: 0.6917
Epoch 2/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6824 - loss: 0.6816 - val_accuracy: 0.5837 - val_loss: 0.6878
Epoch 3/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8708 - loss: 0.6404 - val_accuracy: 0.5551 - val_loss: 0.7045
Epoch 4/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8193 - loss: 0.4626 - val_accuracy: 0.5633 - val_loss: 0.7057
Epoch 5/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9735 - loss: 0.2479 - val_accuracy: 0.6163 - val_loss: 0.8350
Epoch 6/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9788 - loss: 0.0987 - val_accuracy: 0.5265 - val_loss: 0.8430
Epoch 7/20
[1m23/23[0m [32m━━━━━━━

In [44]:
# Evaluate the GRU model
y_pred_gru = (best_gru_model.predict(X_test_pad) > 0.5).astype('int32')

# Metrics
print("GRU Model Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_gru))
print("Precision:", precision_score(y_test, y_pred_gru))
print("Recall:", recall_score(y_test, y_pred_gru))
print("F1 Score:", f1_score(y_test, y_pred_gru))


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step
GRU Model Results:
Accuracy: 0.5551020408163265
Precision: 0.5407407407407407
Recall: 0.6083333333333333
F1 Score: 0.5725490196078431


**LSTM Implementation**

In [51]:
# Define model-building function for LSTM with tunable hyperparameters
def build_lstm_model(hp):
    model = Sequential()
    
    # Tune the embedding output dimension
    model.add(Embedding(input_dim=vocab_size, 
                        output_dim=hp.Int('embedding_dim', min_value=64, max_value=256, step=64), 
                        input_length=max_length))
    
    # Tune LSTM units and add Bidirectional LSTM
    model.add(Bidirectional(LSTM(
        units=hp.Int('lstm_units', min_value=64, max_value=256, step=64), 
        return_sequences=False)))
    
    # Tune dropout rate
    model.add(Dropout(rate=hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)))
    
    # Output layer
    model.add(Dense(1, activation='sigmoid'))
    
    # Compile model with tunable learning rate
    model.compile(
        loss='binary_crossentropy', 
        optimizer=Adam(learning_rate=hp.Float('learning_rate', min_value=1e-5, max_value=1e-2, sampling='log')),
        metrics=['accuracy']
    )
    
    return model


In [52]:
# Define the Hyperparameter tuner for LSTM
tuner_lstm = kt.RandomSearch(
    build_lstm_model,
    objective='val_accuracy',
    max_trials=5,  # Number of models to try
    executions_per_trial=3,  # Number of models to run for each trial
    directory='lstm_tuning',
    project_name='lstm_hyperparam_tuning'
)


In [53]:
# Perform the search using the training data
tuner_lstm.search(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test))


Trial 5 Complete [00h 00m 23s]
val_accuracy: 0.6122449040412903

Best val_accuracy So Far: 0.6122449040412903
Total elapsed time: 00h 01m 54s


In [54]:
# Get the best hyperparameters
best_hps_lstm = tuner_lstm.get_best_hyperparameters(num_trials=1)[0]
print(f"Best Hyperparameters for LSTM: {best_hps_lstm.values}")


Best Hyperparameters for LSTM: {'embedding_dim': 128, 'lstm_units': 192, 'dropout_rate': 0.4, 'learning_rate': 0.0008145957107201968}


In [55]:
# Build the model with the best hyperparameters
best_lstm_model = tuner_lstm.hypermodel.build(best_hps_lstm)

# Train the best LSTM model
history_lstm = best_lstm_model.fit(
    X_train_pad, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_pad, y_test)
)


Epoch 1/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.4918 - loss: 0.6944 - val_accuracy: 0.4898 - val_loss: 0.6926
Epoch 2/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5666 - loss: 0.6801 - val_accuracy: 0.5755 - val_loss: 0.6769
Epoch 3/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7900 - loss: 0.6202 - val_accuracy: 0.5878 - val_loss: 0.6922
Epoch 4/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9113 - loss: 0.4196 - val_accuracy: 0.5755 - val_loss: 0.7303
Epoch 5/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9720 - loss: 0.1393 - val_accuracy: 0.5592 - val_loss: 0.8320
Epoch 6/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9792 - loss: 0.0914 - val_accuracy: 0.5878 - val_loss: 1.0357
Epoch 7/20
[1m23/23[0m [32m━━━━

In [56]:
# Make predictions
y_pred_best_lstm = (best_lstm_model.predict(X_test_pad) > 0.5).astype('int32')

# Metrics
print("Best LSTM Model Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_best_lstm))
print("Precision:", precision_score(y_test, y_pred_best_lstm))
print("Recall:", recall_score(y_test, y_pred_best_lstm))
print("F1 Score:", f1_score(y_test, y_pred_best_lstm))

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step
Best LSTM Model Results:
Accuracy: 0.5673469387755102
Precision: 0.5460526315789473
Recall: 0.6916666666666667
F1 Score: 0.6102941176470589


**BiLSTM Implementation**

In [57]:
# Define model-building function for BiLSTM with tunable hyperparameters
def build_bilstm_model(hp):
    model = Sequential()
    
    # Tune the embedding output dimension
    model.add(Embedding(input_dim=vocab_size, 
                        output_dim=hp.Int('embedding_dim', min_value=64, max_value=256, step=64), 
                        input_length=max_length))
    
    # Tune LSTM units and add Bidirectional LSTM
    model.add(Bidirectional(LSTM(
        units=hp.Int('lstm_units', min_value=64, max_value=256, step=64), 
        return_sequences=False)))
    
    # Tune dropout rate
    model.add(Dropout(rate=hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)))
    
    # Output layer
    model.add(Dense(1, activation='sigmoid'))
    
    # Compile model with tunable learning rate
    model.compile(
        loss='binary_crossentropy', 
        optimizer=Adam(learning_rate=hp.Float('learning_rate', min_value=1e-5, max_value=1e-2, sampling='log')),
        metrics=['accuracy']
    )
    
    return model


In [58]:
# Set up the Hyperparameter tuner for BiLSTM
tuner_bilstm = kt.RandomSearch(
    build_bilstm_model,  # Our model-building function
    objective='val_accuracy',  # Goal: maximize validation accuracy
    max_trials=5,  # Try 5 different sets of hyperparameters
    executions_per_trial=3,  # Run each set 3 times and average
    directory='bilstm_tuning',  # Folder to save results
    project_name='bilstm_hyperparam_tuning'
)

# Start the hyperparameter search
tuner_bilstm.search(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test))


Trial 5 Complete [00h 00m 23s]
val_accuracy: 0.6108843485514323

Best val_accuracy So Far: 0.6299319863319397
Total elapsed time: 00h 01m 53s


In [59]:
# Get the best hyperparameters from the tuner
best_bilstm_hps = tuner_bilstm.get_best_hyperparameters(num_trials=1)[0]
print(f"Best Hyperparameters: {best_bilstm_hps}")


Best Hyperparameters: <keras_tuner.src.engine.hyperparameters.hyperparameters.HyperParameters object at 0x7dd438be0bd0>


In [60]:
# Build the best BiLSTM model using the found hyperparameters
best_bilstm_model = tuner_bilstm.hypermodel.build(best_bilstm_hps)

# Train the best model
history_bilstm = best_bilstm_model.fit(
    X_train_pad, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_pad, y_test)
)

Epoch 1/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 33ms/step - accuracy: 0.5223 - loss: 0.6934 - val_accuracy: 0.5306 - val_loss: 0.6893
Epoch 2/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6830 - loss: 0.6478 - val_accuracy: 0.5796 - val_loss: 0.6702
Epoch 3/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8763 - loss: 0.3935 - val_accuracy: 0.5714 - val_loss: 0.7145
Epoch 4/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9506 - loss: 0.2202 - val_accuracy: 0.5388 - val_loss: 0.9044
Epoch 5/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9876 - loss: 0.0822 - val_accuracy: 0.5796 - val_loss: 0.9942
Epoch 6/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9928 - loss: 0.0373 - val_accuracy: 0.5633 - val_loss: 1.2122
Epoch 7/20
[1m23/23[0m [32m━━━━

In [62]:
# Evaluate the trained model
y_pred_bilstm = (best_bilstm_model.predict(X_test_pad) > 0.5).astype('int32')

# Metrics
print("Best BiLSTM Model Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_bilstm))
print("Precision:", precision_score(y_test, y_pred_bilstm))
print("Recall:", recall_score(y_test, y_pred_bilstm))
print("F1 Score:", f1_score(y_test, y_pred_bilstm))

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
Best BiLSTM Model Results:
Accuracy: 0.5959183673469388
Precision: 0.5695364238410596
Recall: 0.7166666666666667
F1 Score: 0.6346863468634687


# Transformer Models

**mBERT Implementation**

In [4]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # helps catch CUDA errors early

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split


In [3]:
# Load data
df = pd.read_csv('/kaggle/input/urduuusentiment/urdu-sentiment-corpus-v1.tsv', sep='\t')

# Remove rows with labels other than Positive (P) and Negative (N)
df = df[df['Class'].isin(['P', 'N'])].reset_index(drop=True)

# Encode labels: P -> 1 (positive), N -> 0 (negative)
df['label'] = df['Class'].map({'P': 1, 'N': 0})

print(df['label'].value_counts())  # Check distribution


label
0    499
1    480
Name: count, dtype: int64


In [6]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Tweet'].tolist(),
    df['label'].tolist(),
    test_size=0.25,
    random_state=42,
    stratify=df['label']  # Maintain class balance
)


In [7]:
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenizing inside Dataset class


In [8]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)


In [9]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


In [14]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,      # Hyperparameter you can tune
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,      # You can increase for better results
    weight_decay=0.01,
    logging_dir="./logs",
    metric_for_best_model="f1",
    save_total_limit=1,
    report_to="none"
)


In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()


Step,Training Loss
500,0.1431


TrainOutput(global_step=920, training_loss=0.08613184897795967, metrics={'train_runtime': 248.0593, 'train_samples_per_second': 59.179, 'train_steps_per_second': 3.709, 'total_flos': 965617573171200.0, 'train_loss': 0.08613184897795967, 'epoch': 20.0})

In [16]:
trainer.evaluate()


{'eval_loss': 2.1721458435058594,
 'eval_accuracy': 0.6653061224489796,
 'eval_precision': 0.6610169491525424,
 'eval_recall': 0.65,
 'eval_f1': 0.6554621848739496,
 'eval_runtime': 1.1278,
 'eval_samples_per_second': 217.229,
 'eval_steps_per_second': 14.186,
 'epoch': 20.0}

**XLM-RoBERTa Implementation**

In [18]:
from transformers import XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import XLMRobertaTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset
import pandas as pd
import torch

In [5]:
# Load your dataset
df = pd.read_csv('/kaggle/input/urduuusentiment/urdu-sentiment-corpus-v1.tsv', sep='\t')
df = df[['Tweet', 'Class']].dropna()
df = df[df['Class'].isin(['P', 'N'])]  # Only keep Positive/Negative

# Encode labels
label_mapping = {'P': 1, 'N': 0}
df['label'] = df['Class'].map(label_mapping)

In [6]:
# Split into train and validation
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Tweet'].tolist(),
    df['label'].tolist(),
    test_size=0.25,
    random_state=42,
    stratify=df['label']
)


In [7]:
# Load tokenizer
model_name = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [8]:
# Tokenization
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)


In [9]:
# Prepare dataset
class UrduSentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx])
        }

    def __len__(self):
        return len(self.labels)

train_dataset = UrduSentimentDataset(train_encodings, train_labels)
val_dataset = UrduSentimentDataset(val_encodings, val_labels)


In [13]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)

    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [14]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,         
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,     
    logging_strategy="steps",
    logging_steps=10,                   # <-- Log after every 10 steps
    save_total_limit=1,
    metric_for_best_model="accuracy",
    report_to="none",
)


In [15]:
# Load model
model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()



Step,Training Loss
10,0.6972
20,0.6947
30,0.6917
40,0.6741
50,0.6662
60,0.5963
70,0.5659
80,0.5004
90,0.5375
100,0.4783


TrainOutput(global_step=460, training_loss=0.2627279550484989, metrics={'train_runtime': 217.4972, 'train_samples_per_second': 67.495, 'train_steps_per_second': 2.115, 'total_flos': 460177124714400.0, 'train_loss': 0.2627279550484989, 'epoch': 20.0})

In [19]:
metrics = trainer.evaluate()
print(metrics)




{'eval_loss': 1.6473727226257324, 'eval_accuracy': 0.710204081632653, 'eval_precision': 0.7024793388429752, 'eval_recall': 0.7083333333333334, 'eval_f1': 0.7053941908713693, 'eval_runtime': 1.0752, 'eval_samples_per_second': 227.864, 'eval_steps_per_second': 7.44, 'epoch': 20.0}


# Q2) Sentiment Analysis using word embedding

In [21]:
# Preprocessing Urdu Tweets
import pandas as pd
import re

# Load your dataset
df = pd.read_csv('/kaggle/input/urduuusentiment/urdu-sentiment-corpus-v1.tsv', sep='\t')

# Basic text cleaning function
def clean_text(text):
    text = str(text)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)       # Remove numbers
    text = text.strip()                   # Remove leading/trailing whitespace
    return text

# Apply cleaning
df['Tweet_clean'] = df['Tweet'].apply(clean_text)

# View cleaned tweets
print(df[['Tweet', 'Tweet_clean']].head())


                                               Tweet  \
0  میں نے ایٹم بم بنایا ھے ۔۔۔۔او بھائی ایٹم بمب ...   
1  چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...   
2                           ٹویٹر کا خیال کیسے آیا ؟   
3  سرچ انجن گوگل کے نائب صدر نے فضا میں ، 130,000...   
4    ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار :أْ   

                                         Tweet_clean  
0  میں نے ایٹم بم بنایا ھے او بھائی ایٹم بمب کوٹ ...  
1  چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...  
2                             ٹویٹر کا خیال کیسے آیا  
3  سرچ انجن گوگل کے نائب صدر نے فضا میں   فٹ کی ب...  
4      ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار أ  


In [22]:
# Tokenization of Tweets

# Simple Urdu word tokenizer (split by space for now)
df['tokens'] = df['Tweet_clean'].apply(lambda x: x.split())

# Final tokenized sentences
sentences = df['tokens'].tolist()

# Example
print(sentences[:5])


[['میں', 'نے', 'ایٹم', 'بم', 'بنایا', 'ھے', 'او', 'بھائی', 'ایٹم', 'بمب', 'کوٹ', 'لکھپت', 'والی', 'اتفاق', 'فیکٹری', 'میں', 'نہیں', 'بنتاایٹم', 'بم', 'کہوٹہ', 'کی', 'ایٹمی'], ['چندے', 'سے', 'انقلاب', 'اور', 'عمران', 'خان', 'وزیر', 'اعظم', 'نہیں', 'بن', 'سکتے'], ['ٹویٹر', 'کا', 'خیال', 'کیسے', 'آیا'], ['سرچ', 'انجن', 'گوگل', 'کے', 'نائب', 'صدر', 'نے', 'فضا', 'میں', 'فٹ', 'کی', 'بلندی', 'پر', 'چھلانگ', 'لگا', 'کر', 'عالمی', 'ریکارڈ', 'قائم', 'کرلیا', 'چھلانگ', 'کی'], ['ابھی', 'تک', 'اسکی', 'لہریں', 'کبھی', 'کبھی', 'آ', 'جاتی', 'ہیں', 'یار', 'أ']]


In [23]:
from gensim.models import Word2Vec

# Train Word2Vec
w2v_model = Word2Vec(sentences, vector_size=300, window=5, min_count=1, workers=4, epochs=30)

# Save the model
w2v_model.save("urdu_word2vec.model")

# Example: Get vector for a word
print(w2v_model.wv['خان'])  # example Urdu word


[-1.51141837e-01  5.41772008e-01  2.15594098e-01  2.17439607e-01
  2.77114974e-04 -5.33531427e-01  3.57072443e-01  1.21154511e+00
  2.52480991e-02 -2.48894647e-01  2.55344599e-01 -3.82782698e-01
 -3.76747400e-02  1.21510193e-01 -4.42663819e-01 -3.47939938e-01
  3.30831140e-01 -2.58046445e-02  2.54350305e-01 -2.24813491e-01
 -3.45094889e-01 -1.91343725e-01  2.69015878e-01  9.55679864e-02
  4.15248722e-01 -3.27576734e-02 -6.13531768e-01  3.21235619e-02
 -1.84451342e-01 -5.04071534e-01  1.60372481e-01 -3.59600812e-01
  1.76599786e-01 -4.31130454e-02 -8.64950418e-02  1.81792781e-01
  2.13999197e-01 -6.41518533e-01 -8.83256942e-02 -1.70747653e-01
 -1.06607832e-01 -2.13540252e-02  6.95098937e-02 -2.98653066e-01
  3.22362304e-01  4.45875883e-01  3.84498052e-02  2.55491197e-01
 -1.28287047e-01  2.93594718e-01  1.32580414e-01 -6.25952659e-03
 -4.24361706e-01  1.82349056e-01 -6.96496218e-02  4.49375123e-01
  1.54175714e-01 -5.50237522e-02  2.41108850e-01 -4.93541807e-02
 -1.31593496e-01 -5.62646

In [24]:
from gensim.models import FastText

# Train FastText
ft_model = FastText(sentences, vector_size=300, window=5, min_count=1, workers=4, epochs=30)

# Save the model
ft_model.save("urdu_fasttext.model")

# Example
print(ft_model.wv['خان'])


[ 0.7710025  -0.00761759 -0.7132654  -0.10891396 -0.82844436 -0.0724377
  0.33335805  1.0826792  -0.04404404  0.21507658 -0.40715948 -0.08789742
  0.10447229  0.13141426  0.09518748 -0.53553563 -0.40565902 -0.36894134
 -0.27015433 -0.24066754  0.12243818  0.6302248  -0.0422199   0.6447443
  0.51001173  0.27972725  0.05001359  0.3714595  -0.14511628 -0.09012616
  0.2187682  -0.38321766 -0.04787967  0.35637277  0.17888358 -0.00199761
  0.25056022 -0.5560907   0.5798305  -0.36930317  0.42911673  0.05930861
  0.11312318 -0.24293756  0.16484189  0.14753099  0.2129542  -0.6124335
  0.10325637  0.73409545  0.11598943  0.08297802 -0.12618081  0.7337373
  0.75490665  0.6171598   0.34810203  0.02325491  0.5853562   0.44657072
 -0.42837605 -0.42833647 -0.23218744  0.3663659   0.7757973   0.04900873
 -0.38509688  0.40972418 -0.4584365   0.69155353 -0.38657925  0.10676061
  0.11707739 -0.1094055  -0.2292334   0.3644038  -0.16262412  0.08662166
 -0.10579     0.39030442  0.20970008 -0.5205685  -0.128

In [60]:
from gensim.models import KeyedVectors

# Load pre-trained GloVe vectors (make sure to use the correct path)
glove_file = '/kaggle/input/glove-6b-100-d/glove.6B.100d.txt'  # Adjust the path accordingly
glove_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

# Example: Retrieve vector for the word 'خان' (a common Urdu word)
word_vector = glove_model['خان']
print(word_vector)


[-0.59159    1.1582     0.5159    -0.089872  -0.30491   -0.32017
 -0.24737   -0.23799    0.30642    0.27633    0.72432   -0.3281
 -0.59546    0.6052    -0.11869    0.83671    0.17238   -0.13105
  0.44077    0.70813   -0.22314   -0.55746   -0.0037007  0.017325
 -0.072511   1.3471    -0.052981   0.0084129  0.56635    0.070343
 -0.46641    0.1831    -0.31445   -0.73833    0.39099   -0.92103
  0.43325   -0.69386    0.14133    0.79592    0.11744    0.24983
  0.11663   -0.25647    0.422     -0.90908   -0.49173    0.23067
  0.44652    0.12588    0.28967    0.26124    0.26236   -0.13052
 -0.30259    0.25015   -0.64525   -0.092605  -0.89275   -0.58147
 -0.020689   0.019705  -0.28225   -0.88845   -0.48147    0.30782
 -0.14298   -1.0383    -0.18129    0.61952   -0.48386    1.1028
  0.25364    0.30245    0.11214   -0.7666     0.018146  -0.068516
  0.14089   -0.42505   -0.62971   -0.84632    0.21139   -0.35575
  0.39461    0.14247   -0.064626  -0.56695   -0.16673    0.018844
 -0.065712   0.6176    

In [90]:
# Target labels
y = df['label']  # Make sure your label column is named 'label'


In [91]:
import numpy as np

def get_average_embedding(tokens, model, vector_size):
    vectors = []
    for token in tokens:
        if token in model.wv:
            vectors.append(model.wv[token])
    if len(vectors) == 0:
        return np.zeros(vector_size)
    else:
        return np.mean(vectors, axis=0)


In [92]:
# For Word2Vec (300 dimensions)
X_w2v = np.array([get_average_embedding(tokens, w2v_model, 300) for tokens in df['tokens']])
print(X_w2v.shape)  # (number_of_tweets, 300)


(979, 300)


In [93]:
# For FastText (300 dimensions)
X_fasttext = np.array([get_average_embedding(tokens, ft_model, 300) for tokens in df['tokens']])
print(X_fasttext.shape)  # (number_of_tweets, 300)


(979, 300)


In [94]:
def get_average_embedding_glove(tokens, model, vector_size):
    vectors = []
    for token in tokens:
        if token in model:
            vectors.append(model[token])
    if len(vectors) == 0:
        return np.zeros(vector_size)
    else:
        return np.mean(vectors, axis=0)


In [95]:
# For GloVe (100 dimensions)
X_glove = np.array([get_average_embedding_glove(tokens, glove_model, 100) for tokens in df['tokens']])
print(X_glove.shape)  # (number_of_tweets, 100)


(979, 100)


In [96]:
from sklearn.model_selection import train_test_split

# Word2Vec
X_w2v_train, X_w2v_test, y_w2v_train, y_w2v_test = train_test_split(X_w2v, y, test_size=0.2, random_state=42)

# FastText
X_fasttext_train, X_fasttext_test, y_fasttext_train, y_fasttext_test = train_test_split(X_fasttext, y, test_size=0.2, random_state=42)

# GloVe
X_glove_train, X_glove_test, y_glove_train, y_glove_test = train_test_split(X_glove, y, test_size=0.2, random_state=42)


In [97]:
from sklearn.linear_model import LogisticRegression

# Word2Vec model
lr_w2v = LogisticRegression(max_iter=1000)
lr_w2v.fit(X_w2v_train, y_w2v_train)

# FastText model
lr_fasttext = LogisticRegression(max_iter=1000)
lr_fasttext.fit(X_fasttext_train, y_fasttext_train)

# GloVe model
lr_glove = LogisticRegression(max_iter=1000)
lr_glove.fit(X_glove_train, y_glove_train)


In [98]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Word2Vec evaluation
y_w2v_pred = lr_w2v.predict(X_w2v_test)
print("Word2Vec Accuracy:", accuracy_score(y_w2v_test, y_w2v_pred))
print("Word2Vec F1 Score:", f1_score(y_w2v_test, y_w2v_pred, average='weighted'))
print(classification_report(y_w2v_test, y_w2v_pred))

# FastText evaluation
y_fasttext_pred = lr_fasttext.predict(X_fasttext_test)
print("\nFastText Accuracy:", accuracy_score(y_fasttext_test, y_fasttext_pred))
print("FastText F1 Score:", f1_score(y_fasttext_test, y_fasttext_pred, average='weighted'))
print(classification_report(y_fasttext_test, y_fasttext_pred))

# GloVe evaluation
y_glove_pred = lr_glove.predict(X_glove_test)
print("\nGloVe Accuracy:", accuracy_score(y_glove_test, y_glove_pred))
print("GloVe F1 Score:", f1_score(y_glove_test, y_glove_pred, average='weighted'))
print(classification_report(y_glove_test, y_glove_pred))


Word2Vec Accuracy: 0.5663265306122449
Word2Vec F1 Score: 0.5515374908680348
              precision    recall  f1-score   support

         0.0       0.55      0.37      0.44        91
         1.0       0.57      0.73      0.64       105

    accuracy                           0.57       196
   macro avg       0.56      0.55      0.54       196
weighted avg       0.56      0.57      0.55       196


FastText Accuracy: 0.5612244897959183
FastText F1 Score: 0.5369463645673322
              precision    recall  f1-score   support

         0.0       0.55      0.32      0.40        91
         1.0       0.57      0.77      0.65       105

    accuracy                           0.56       196
   macro avg       0.56      0.55      0.53       196
weighted avg       0.56      0.56      0.54       196


GloVe Accuracy: 0.5510204081632653
GloVe F1 Score: 0.4497200912295253
              precision    recall  f1-score   support

         0.0       0.60      0.10      0.17        91
         1.0 

In [81]:
!pip install transformers sentence-transformers


Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.11.0->sentence-transformers)
  Downloading nvid

In [82]:
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import torch
from tqdm import tqdm


In [87]:
from sklearn.linear_model import LogisticRegression


In [83]:
# Load XLM-Roberta tokenizer and model
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
xlmr_model = XLMRobertaModel.from_pretrained('xlm-roberta-base')


In [84]:
# Device: GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
xlmr_model = xlmr_model.to(device)

def get_xlmr_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = xlmr_model(**inputs)
    # Take mean of the token embeddings (mean pooling)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.cpu().numpy().flatten()

# Generate embeddings for all tweets
xlmr_embeddings = []
for text in tqdm(df['Tweet_clean']):
    emb = get_xlmr_embedding(text)
    xlmr_embeddings.append(emb)

# Convert to numpy array
import numpy as np
X_xlmr = np.array(xlmr_embeddings)
print(X_xlmr.shape)  # (num_samples, 768)


100%|██████████| 979/979 [00:08<00:00, 119.37it/s]

(979, 768)





In [85]:
X_xlmr_train, X_xlmr_test, y_xlmr_train, y_xlmr_test = train_test_split(X_xlmr, y, test_size=0.2, random_state=42)


In [88]:
lr_xlmr = LogisticRegression(max_iter=1000)
lr_xlmr.fit(X_xlmr_train, y_xlmr_train)


In [89]:
# Evaluation
y_xlmr_pred = lr_xlmr.predict(X_xlmr_test)
print("XLM-R Accuracy:", accuracy_score(y_xlmr_test, y_xlmr_pred))
print("XLM-R F1 Score:", f1_score(y_xlmr_test, y_xlmr_pred, average='weighted'))
print(classification_report(y_xlmr_test, y_xlmr_pred))


XLM-R Accuracy: 0.6989795918367347
XLM-R F1 Score: 0.6978143515470705
              precision    recall  f1-score   support

         0.0       0.69      0.64      0.66        91
         1.0       0.71      0.75      0.73       105

    accuracy                           0.70       196
   macro avg       0.70      0.69      0.70       196
weighted avg       0.70      0.70      0.70       196

