### Approach 3: LSTM

In [24]:
import pandas as pd

In [25]:
# Read the CSV file
df = pd.read_csv('/kaggle/input/sbd-data/dataset.csv')

df.head()

Unnamed: 0,id,text,lemma,upos,xpos,head,deprel,start_char,end_char
0,1,اس,یہ,DET,DEM,2,det,0,2
1,2,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8
2,3,کی,کا,ADP,PSP,2,case,9,11
3,4,دیگر,دیگر,ADJ,JJ,5,amod,12,16
4,5,اقساط,اقساط,NOUN,NN,7,nsubj,17,22


In [26]:
data = df.drop(columns=['id'])

In [27]:
import string
import re

In [28]:
# Initialize a new column 'y' with the default value 'S_M'
data['y'] = 'S_M'

# Iterate through the rows to assign 'S_B'
for i in range(len(data) - 1):
    # Check if the current word ends with a full stop
    if data.loc[i, 'text'].endswith('۔'):
        # Assign 'S_B' to the next word
        if i + 1 < len(data):
            data.loc[i + 1, 'y'] = 'S_B'  # Sentence Beginning

# Convert 'y' column to categorical type (optional, for ML efficiency)
data['y'] = data['y'].astype('category')

# Map categorical labels to numeric values
label_mapping = {'S_B': 1, 'S_M': 0}
data['y'] = data['y'].map(label_mapping)

# Verify the result
data.head(10)

Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,0
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,0
2,کی,کا,ADP,PSP,2,case,9,11,0
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,0
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,0
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,0
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,0
7,۔,۔,PUNCT,SYM,7,punct,33,34,0
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,1
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,0


In [29]:
# Drop rows where the 'text' column contains only punctuation
data = data[~data['text'].str.contains(r'^[^\w\s]+$', na=False)]

# Verify the result
data.head(10)


Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,0
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,0
2,کی,کا,ADP,PSP,2,case,9,11,0
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,0
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,0
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,0
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,0
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,1
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,0
10,ممکن,ممکن,ADJ,JJ,0,root,44,48,0


In [30]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [31]:
# One-hot encode 'upos', 'xpos', and 'deprel'
encoder = OneHotEncoder(sparse_output=False)
encoded_cats = encoder.fit_transform(data[['upos', 'xpos', 'deprel']])

# Convert to DataFrame for easier merging
encoded_cats_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out())

# Concatenate encoded features back to the dataset
data = pd.concat([data.reset_index(drop=True), encoded_cats_df], axis=1)

# Drop the original categorical columns (optional)
data = data.drop(columns=['upos', 'xpos', 'deprel'])

In [32]:
# Select the numerical features to normalize
numerical_features = ['start_char', 'end_char', 'head']

from sklearn.preprocessing import MinMaxScaler

# Option 2: Min-Max Scaling (scales features to a range, typically 0 to 1)
min_max_scaler = MinMaxScaler()
data[numerical_features] = min_max_scaler.fit_transform(data[numerical_features])

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine text and lemma columns into a single string representation (if needed)
data['text_lemma'] = data['text'] + " " + data['lemma']

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=500)  # Adjust max_features as needed

# Fit and transform the combined text and lemma
tfidf_features = tfidf_vectorizer.fit_transform(data['text_lemma'])

# Convert the sparse matrix to a DataFrame for better integration
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Add the TF-IDF features back to the original DataFrame
data = pd.concat([data.reset_index(drop=True), tfidf_df], axis=1)

# Drop the original text and lemma columns (optional)
data = data.drop(columns=['text', 'lemma', 'text_lemma'])

In [34]:
data.head(10)

Unnamed: 0,head,start_char,end_char,y,upos_ADJ,upos_ADP,upos_ADV,upos_AUX,upos_CCONJ,upos_DET,...,ہوں,ہی,ہیں,ہے,یا,یعنی,یقینا,یہ,یہاں,یہی
0,0.014493,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.646062,0.0,0.0
1,0.036232,3e-06,5e-06,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.014493,8e-06,8e-06,0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.036232,1.1e-05,1.2e-05,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.050725,1.5e-05,1.8e-05,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.050725,2e-05,2.2e-05,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,0.0,2.5e-05,2.7e-05,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.021739,3.2e-05,3.2e-05,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,0.021739,3.5e-05,3.6e-05,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,3.9e-05,4.1e-05,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Define the feature matrix (drop 'y') and target
X = data.drop(columns=['y'])
y = data['y']


In [36]:
# Split into training (64%), validation (16%), and test (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.36, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.56, random_state=42, stratify=y_temp)

In [37]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [38]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
# Compute class weights based on the class distribution in the target variable y
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))


In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

In [40]:
# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [41]:
# Reshape the resampled data for LSTM input (3D tensor: [samples, timesteps, features])
X_train_resampled_lstm = X_train_resampled.values.reshape(X_train_resampled.shape[0], 1, X_train_resampled.shape[1])
X_val_lstm = X_val.values.reshape(X_val.shape[0], 1, X_val.shape[1])
X_test_lstm = X_test.values.reshape(X_test.shape[0], 1, X_test.shape[1])

In [42]:
# Convert target variables to supported dtype
y_train_resampled = y_train_resampled.to_numpy(dtype='int32')
y_val = y_val.to_numpy(dtype='int32')
y_test = y_test.to_numpy(dtype='int32')

In [43]:
# Ensure feature data is in float32
def ensure_float32(data):
    return data.astype('float32')

X_train_resampled_lstm = ensure_float32(X_train_resampled_lstm)
X_val_lstm = ensure_float32(X_val_lstm)
X_test_lstm = ensure_float32(X_test_lstm)


In [45]:
# Update the output layer of the LSTM model
model = Sequential([
    LSTM(128, input_shape=(X_train_resampled_lstm.shape[1], X_train_resampled_lstm.shape[2]), return_sequences=False),
    BatchNormalization(),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(2, activation='softmax')  # Updated to 2 output classes
])

# Compile the model with the appropriate loss function
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Set up early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model with class weights and resampled data
history = model.fit(
    X_train_resampled_lstm, y_train_resampled,
    validation_data=(X_val_lstm, y_val),
    epochs=10,
    batch_size=64,
    class_weight=class_weight_dict,  # Add class weights here
    callbacks=[early_stopping],
    verbose=2
)


  super().__init__(**kwargs)


Epoch 1/10
4636/4636 - 26s - 6ms/step - accuracy: 0.8370 - loss: 0.3781 - val_accuracy: 0.7656 - val_loss: 0.6989
Epoch 2/10
4636/4636 - 22s - 5ms/step - accuracy: 0.8742 - loss: 0.2841 - val_accuracy: 0.7996 - val_loss: 0.6834
Epoch 3/10
4636/4636 - 22s - 5ms/step - accuracy: 0.8850 - loss: 0.2589 - val_accuracy: 0.7972 - val_loss: 0.6636
Epoch 4/10
4636/4636 - 22s - 5ms/step - accuracy: 0.8901 - loss: 0.2470 - val_accuracy: 0.7915 - val_loss: 0.6616
Epoch 5/10
4636/4636 - 22s - 5ms/step - accuracy: 0.8935 - loss: 0.2373 - val_accuracy: 0.7997 - val_loss: 0.6035
Epoch 6/10
4636/4636 - 22s - 5ms/step - accuracy: 0.8951 - loss: 0.2339 - val_accuracy: 0.8213 - val_loss: 0.5932
Epoch 7/10
4636/4636 - 21s - 5ms/step - accuracy: 0.8986 - loss: 0.2275 - val_accuracy: 0.8089 - val_loss: 0.5783
Epoch 8/10
4636/4636 - 22s - 5ms/step - accuracy: 0.8994 - loss: 0.2236 - val_accuracy: 0.8097 - val_loss: 0.5859
Epoch 9/10
4636/4636 - 22s - 5ms/step - accuracy: 0.9011 - loss: 0.2198 - val_accuracy: 

In [46]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test_lstm, y_test, verbose=2)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

# Make predictions on the test set
y_pred = model.predict(X_test_lstm)
y_pred_classes = np.argmax(y_pred, axis=1)  # Get the predicted class labels

# Evaluate performance with precision, recall, and F1-score
print(classification_report(y_test, y_pred_classes))
print(confusion_matrix(y_test, y_pred_classes))


1517/1517 - 3s - 2ms/step - accuracy: 0.8112 - loss: 0.5706
Test Loss: 0.5705779790878296
Test Accuracy: 0.8111816048622131
[1m1517/1517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
              precision    recall  f1-score   support

           0       1.00      0.80      0.89     46727
           1       0.16      0.98      0.28      1817

    accuracy                           0.81     48544
   macro avg       0.58      0.89      0.59     48544
weighted avg       0.97      0.81      0.87     48544

[[37605  9122]
 [   44  1773]]
