### Approach 2 (with punctuation): CNN 

In [1]:
import pandas as pd

In [2]:
# Read the CSV file
df = pd.read_csv('/kaggle/input/sbd-data/dataset.csv')

df.head()

Unnamed: 0,id,text,lemma,upos,xpos,head,deprel,start_char,end_char
0,1,اس,یہ,DET,DEM,2,det,0,2
1,2,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8
2,3,کی,کا,ADP,PSP,2,case,9,11
3,4,دیگر,دیگر,ADJ,JJ,5,amod,12,16
4,5,اقساط,اقساط,NOUN,NN,7,nsubj,17,22


In [3]:
data = df.drop(columns=['id'])

In [4]:
import numpy as np

# Create a new column 'y' with default value 'S_M'
data['y'] = 'S_M'

# Iterate through the rows to assign 'S_E' and 'S_B'
for i in range(len(data) - 1):
    # Check if the current word ends with a full stop
    if data.loc[i, 'text'].endswith('۔'):
        data.loc[i, 'y'] = 'S_E'  # Sentence End
        # Assign 'S_B' to the next word
        if i + 1 < len(data):
            data.loc[i + 1, 'y'] = 'S_B'  # Sentence Beginning

# Convert 'y' column to categorical type (optional, for ML efficiency)
data['y'] = data['y'].astype('category')

# Display the first few rows to verify
data.head(10)


Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,S_M
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,S_M
2,کی,کا,ADP,PSP,2,case,9,11,S_M
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,S_M
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,S_M
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,S_M
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,S_M
7,۔,۔,PUNCT,SYM,7,punct,33,34,S_E
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,S_B
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,S_M


In [5]:
import numpy as np

# Create a new column 'y' with default value 'S_M'
data['y'] = 'S_M'

# Iterate through the rows to assign 'S_E' and 'S_B'
for i in range(len(data) - 1):
    # Check if the current word ends with a full stop
    if data.loc[i, 'text'].endswith('۔'):
        data.loc[i, 'y'] = 'S_E'  # Sentence End
        # Assign 'S_B' to the next word
        if i + 1 < len(data):
            data.loc[i + 1, 'y'] = 'S_B'  # Sentence Beginning

# Map categorical labels to numeric values
label_mapping = {'S_E': 0, 'S_B': 1, 'S_M': 2}
data['y'] = data['y'].map(label_mapping)

# Verify the result
data.head(10)

Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,2
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,2
2,کی,کا,ADP,PSP,2,case,9,11,2
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,2
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,2
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,2
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,2
7,۔,۔,PUNCT,SYM,7,punct,33,34,0
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,1
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,2


In [6]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [7]:
# One-hot encode 'upos', 'xpos', and 'deprel'

encoder = OneHotEncoder(sparse_output=False)
encoded_cats = encoder.fit_transform(data[['upos', 'xpos', 'deprel']])

# Convert to DataFrame for easier merging
encoded_cats_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out())

# Concatenate encoded features back to the dataset
data = pd.concat([data.reset_index(drop=True), encoded_cats_df], axis=1)

# Drop the original categorical columns (optional)
data = data.drop(columns=['upos', 'xpos', 'deprel'])

In [8]:
# Select the numerical features to normalize
numerical_features = ['start_char', 'end_char', 'head']

# Option 2: Standard Scaling (zero mean and unit variance)
standard_scaler = StandardScaler()
data[numerical_features] = standard_scaler.fit_transform(data[numerical_features])

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine text and lemma columns into a single string representation (if needed)
data['text_lemma'] = data['text'] + " " + data['lemma']

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=500)  # Adjust max_features as needed

# Fit and transform the combined text and lemma
tfidf_features = tfidf_vectorizer.fit_transform(data['text_lemma'])

# Convert the sparse matrix to a DataFrame for better integration
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Add the TF-IDF features back to the original DataFrame
data = pd.concat([data.reset_index(drop=True), tfidf_df], axis=1)

# Drop the original text and lemma columns (optional)
data = data.drop(columns=['text', 'lemma', 'text_lemma'])

In [10]:
data

Unnamed: 0,head,start_char,end_char,y,upos_ADJ,upos_ADP,upos_ADV,upos_AUX,upos_CCONJ,upos_DET,...,ہوں,ہی,ہیں,ہے,یا,یعنی,یقینا,یہ,یہاں,یہی
0,-1.051740,-1.732661,-1.732666,2,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.646811,0.0,0.0
1,-0.857181,-1.732652,-1.732647,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,-1.051740,-1.732634,-1.732638,2,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,-0.857181,-1.732625,-1.732623,2,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,-0.727474,-1.732609,-1.732604,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254923,2.580032,1.737197,1.737196,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
254924,1.672089,1.737209,1.737214,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
254925,2.580032,1.737228,1.737236,2,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
254926,2.580032,1.737249,1.737245,2,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [11]:
# Assuming 'y' is your target column
class_counts = data['y'].value_counts()
print(class_counts)


y
2    236746
0      9091
1      9091
Name: count, dtype: int64


In [12]:
# Define the feature matrix (drop 'y') and target
X = data.drop(columns=['y'])
y = data['y']

In [13]:
# Train-validation-test split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.36, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.56, random_state=42, stratify=y_temp)


In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, f1_score
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras


In [17]:
# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)



In [19]:
# Compute class weights for weighted loss
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

In [20]:
# Define the CNN model
def create_cnn_model_multiclass(input_dim, num_classes):
    model = keras.Sequential([
        layers.InputLayer(input_shape=(input_dim,)),
        layers.Reshape((input_dim, 1)),
        layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Conv1D(filters=32, kernel_size=3, activation='relu'),
        layers.GlobalMaxPooling1D(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model


In [21]:
# Create and train the model
num_classes = len(np.unique(y))
cnn_model_multi = create_cnn_model_multiclass(X_train_resampled.shape[1], num_classes)
cnn_model_multi.fit(
    X_train_resampled, y_train_resampled,
    epochs=10,
    batch_size=32,
    validation_data=(X_val, y_val),
    class_weight=class_weights_dict
)



Epoch 1/10


I0000 00:00:1734368000.414284     105 service.cc:145] XLA service 0x7c36a0006360 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1734368000.414360     105 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1734368000.414367     105 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5


[1m   44/14205[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m52s[0m 4ms/step - accuracy: 0.3363 - loss: 5.9109  

I0000 00:00:1734368004.352604     105 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m14205/14205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 4ms/step - accuracy: 0.6391 - loss: 2.2145 - val_accuracy: 0.3550 - val_loss: 2.6814
Epoch 2/10
[1m14205/14205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 4ms/step - accuracy: 0.7220 - loss: 1.6440 - val_accuracy: 0.3962 - val_loss: 2.5138
Epoch 3/10
[1m14205/14205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 4ms/step - accuracy: 0.7344 - loss: 1.5771 - val_accuracy: 0.3929 - val_loss: 2.4694
Epoch 4/10
[1m14205/14205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 4ms/step - accuracy: 0.7392 - loss: 1.5701 - val_accuracy: 0.3680 - val_loss: 2.4683
Epoch 5/10
[1m14205/14205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 4ms/step - accuracy: 0.7438 - loss: 1.5434 - val_accuracy: 0.4425 - val_loss: 2.2179
Epoch 6/10
[1m14205/14205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 4ms/step - accuracy: 0.7476 - loss: 1.5403 - val_accuracy: 0.4238 - val_loss: 2.2319
Epoch 7/1

<keras.src.callbacks.history.History at 0x7c3736cc4880>

In [22]:
# Evaluate on validation data
val_loss, val_accuracy = cnn_model_multi.evaluate(X_val, y_val, verbose=0)
print(f'Validation Accuracy: {val_accuracy:.4f}')


Validation Accuracy: 0.4262


In [23]:
# Predict and evaluate on validation set
y_val_pred = cnn_model_multi.predict(X_val).argmax(axis=1)
print("Classification Report (Validation):\n", classification_report(y_val, y_val_pred))

[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
Classification Report (Validation):
               precision    recall  f1-score   support

           0       0.10      0.95      0.18      1440
           1       0.10      0.87      0.18      1440
           2       1.00      0.39      0.56     37500

    accuracy                           0.43     40380
   macro avg       0.40      0.74      0.31     40380
weighted avg       0.94      0.43      0.53     40380



In [24]:
# Evaluate on test data
test_loss, test_accuracy = cnn_model_multi.evaluate(X_test, y_test, verbose=0)
print(f'Test Accuracy: {test_accuracy:.4f}')

Test Accuracy: 0.4239


In [25]:
# Predict and evaluate on test set
y_test_pred = cnn_model_multi.predict(X_test).argmax(axis=1)
print("Classification Report (Test):\n", classification_report(y_test, y_test_pred))

[1m1607/1607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Classification Report (Test):
               precision    recall  f1-score   support

           0       0.10      0.96      0.18      1833
           1       0.10      0.87      0.18      1833
           2       1.00      0.39      0.56     47729

    accuracy                           0.42     51395
   macro avg       0.40      0.74      0.31     51395
weighted avg       0.94      0.42      0.53     51395

