### Approach 2 (with punctuation)

#### XGB and DNN

In [15]:
import pandas as pd

In [16]:
# Read the CSV file
df = pd.read_csv('dataset.csv')

df.head()

Unnamed: 0,id,text,lemma,upos,xpos,head,deprel,start_char,end_char
0,1,اس,یہ,DET,DEM,2,det,0,2
1,2,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8
2,3,کی,کا,ADP,PSP,2,case,9,11
3,4,دیگر,دیگر,ADJ,JJ,5,amod,12,16
4,5,اقساط,اقساط,NOUN,NN,7,nsubj,17,22


In [17]:
data = df.drop(columns=['id'])

In [18]:
import numpy as np

# Create a new column 'y' with default value 'S_M'
data['y'] = 'S_M'

# Iterate through the rows to assign 'S_E' and 'S_B'
for i in range(len(data) - 1):
    # Check if the current word ends with a full stop
    if data.loc[i, 'text'].endswith('۔'):
        data.loc[i, 'y'] = 'S_E'  # Sentence End
        # Assign 'S_B' to the next word
        if i + 1 < len(data):
            data.loc[i + 1, 'y'] = 'S_B'  # Sentence Beginning

# Convert 'y' column to categorical type (optional, for ML efficiency)
data['y'] = data['y'].astype('category')

# Display the first few rows to verify
data.head(10)


Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,S_M
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,S_M
2,کی,کا,ADP,PSP,2,case,9,11,S_M
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,S_M
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,S_M
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,S_M
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,S_M
7,۔,۔,PUNCT,SYM,7,punct,33,34,S_E
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,S_B
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,S_M


In [19]:
import numpy as np

# Create a new column 'y' with default value 'S_M'
data['y'] = 'S_M'

# Iterate through the rows to assign 'S_E' and 'S_B'
for i in range(len(data) - 1):
    # Check if the current word ends with a full stop
    if data.loc[i, 'text'].endswith('۔'):
        data.loc[i, 'y'] = 'S_E'  # Sentence End
        # Assign 'S_B' to the next word
        if i + 1 < len(data):
            data.loc[i + 1, 'y'] = 'S_B'  # Sentence Beginning

# Map categorical labels to numeric values
label_mapping = {'S_E': 0, 'S_B': 1, 'S_M': 2}
data['y'] = data['y'].map(label_mapping)

# Verify the result
data.head(10)

Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,2
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,2
2,کی,کا,ADP,PSP,2,case,9,11,2
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,2
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,2
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,2
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,2
7,۔,۔,PUNCT,SYM,7,punct,33,34,0
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,1
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,2


In [20]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [21]:
# One-hot encode 'upos', 'xpos', and 'deprel'

encoder = OneHotEncoder(sparse_output=False)
encoded_cats = encoder.fit_transform(data[['upos', 'xpos', 'deprel']])

# Convert to DataFrame for easier merging
encoded_cats_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out())

# Concatenate encoded features back to the dataset
data = pd.concat([data.reset_index(drop=True), encoded_cats_df], axis=1)

# Drop the original categorical columns (optional)
data = data.drop(columns=['upos', 'xpos', 'deprel'])

In [22]:
# Select the numerical features to normalize
numerical_features = ['start_char', 'end_char', 'head']

# Option 2: Standard Scaling (zero mean and unit variance)
standard_scaler = StandardScaler()
data[numerical_features] = standard_scaler.fit_transform(data[numerical_features])

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine text and lemma columns into a single string representation (if needed)
data['text_lemma'] = data['text'] + " " + data['lemma']

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=500)  # Adjust max_features as needed

# Fit and transform the combined text and lemma
tfidf_features = tfidf_vectorizer.fit_transform(data['text_lemma'])

# Convert the sparse matrix to a DataFrame for better integration
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Add the TF-IDF features back to the original DataFrame
data = pd.concat([data.reset_index(drop=True), tfidf_df], axis=1)

# Drop the original text and lemma columns (optional)
data = data.drop(columns=['text', 'lemma', 'text_lemma'])

In [24]:
data

Unnamed: 0,head,start_char,end_char,y,upos_ADJ,upos_ADP,upos_ADV,upos_AUX,upos_CCONJ,upos_DET,...,ہوں,ہی,ہیں,ہے,یا,یعنی,یقینا,یہ,یہاں,یہی
0,-1.051740,-1.732661,-1.732666,2,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.646811,0.0,0.0
1,-0.857181,-1.732652,-1.732647,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,-1.051740,-1.732634,-1.732638,2,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,-0.857181,-1.732625,-1.732623,2,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,-0.727474,-1.732609,-1.732604,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254923,2.580032,1.737197,1.737196,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
254924,1.672089,1.737209,1.737214,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
254925,2.580032,1.737228,1.737236,2,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
254926,2.580032,1.737249,1.737245,2,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [25]:
# Define the feature matrix (drop 'y') and target
X = data.drop(columns=['y'])
y = data['y']

In [26]:
# Split into training (64%), validation (16%), and test (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.36, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.56, random_state=42, stratify=y_temp)


### XGBoost  With MinMaxScalar

In [14]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Define the XGBoost model
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',  # Since we have multiple classes
    num_class=3,  # Number of classes: S_E, S_B, and S_M
    random_state=42
)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = xgb_model.predict(X_val)

# Evaluate the model
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.4f}')
print("Classification Report:\n", classification_report(y_val, y_val_pred))

# Make predictions on the test set (optional)
y_test_pred = xgb_model.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.4f}')
print("Test Classification Report:\n", classification_report(y_test, y_test_pred))

Validation Accuracy: 0.9624
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.92      0.79      1440
           1       0.82      0.55      0.66      1440
           2       0.98      0.98      0.98     37500

    accuracy                           0.96     40380
   macro avg       0.83      0.82      0.81     40380
weighted avg       0.96      0.96      0.96     40380

Test Accuracy: 0.9617
Test Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.92      0.79      1833
           1       0.79      0.55      0.65      1833
           2       0.98      0.98      0.98     47729

    accuracy                           0.96     51395
   macro avg       0.82      0.82      0.81     51395
weighted avg       0.96      0.96      0.96     51395



### XGBoost  With standard scalar 

In [27]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Define the XGBoost model
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',  # Since we have multiple classes
    num_class=3,  # Number of classes: S_E, S_B, and S_M
    random_state=42
)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = xgb_model.predict(X_val)

# Evaluate the model
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.4f}')
print("Classification Report:\n", classification_report(y_val, y_val_pred))

# Make predictions on the test set (optional)
y_test_pred = xgb_model.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.4f}')
print("Test Classification Report:\n", classification_report(y_test, y_test_pred))


Validation Accuracy: 0.9624
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.92      0.79      1440
           1       0.82      0.55      0.66      1440
           2       0.98      0.98      0.98     37500

    accuracy                           0.96     40380
   macro avg       0.83      0.82      0.81     40380
weighted avg       0.96      0.96      0.96     40380

Test Accuracy: 0.9617
Test Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.92      0.79      1833
           1       0.79      0.55      0.65      1833
           2       0.98      0.98      0.98     47729

    accuracy                           0.96     51395
   macro avg       0.82      0.82      0.81     51395
weighted avg       0.96      0.96      0.96     51395



### DNN

In [54]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score, classification_report

# Define a simple DNN model
def create_dnn_model(input_dim, num_classes):
    model = keras.Sequential([
        layers.InputLayer(input_shape=(input_dim,)),  # Corrected input layer definition
        layers.Dense(128, activation='relu'),  # Hidden layer with ReLU activation
        layers.Dropout(0.2),  # Dropout for regularization
        layers.Dense(64, activation='relu'),  # Another hidden layer
        layers.Dropout(0.2),  # Dropout for regularization
        layers.Dense(num_classes, activation='softmax')  # Output layer with softmax for multi-class classification
    ])
    
    model.compile(optimizer='adam', 
                  loss='sparse_categorical_crossentropy',  # Use sparse categorical crossentropy for integer labels
                  metrics=['accuracy'])
    
    return model

# Create the DNN model
input_dim = X_train.shape[1]  # Number of features
num_classes = len(y.unique())  # Number of output classes (S_E, S_B, S_M)

dnn_model = create_dnn_model(input_dim, num_classes)

# Train the DNN model
dnn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model on the validation set
val_loss, val_accuracy = dnn_model.evaluate(X_val, y_val, verbose=0)
print(f'Validation Accuracy: {val_accuracy:.4f}')

# Make predictions on the validation set
y_val_pred = dnn_model.predict(X_val)
y_val_pred = y_val_pred.argmax(axis=1)  # Get the predicted class labels

# Evaluate the model
print("Classification Report (Validation):\n", classification_report(y_val, y_val_pred))

# Make predictions on the test set (optional)
y_test_pred = dnn_model.predict(X_test)
y_test_pred = y_test_pred.argmax(axis=1)

# Evaluate the model on the test set
test_loss, test_accuracy = dnn_model.evaluate(X_test, y_test, verbose=0)
print(f'Test Accuracy: {test_accuracy:.4f}')
print("Classification Report (Test):\n", classification_report(y_test, y_test_pred))





Epoch 1/10


I0000 00:00:1733944337.506510     137 service.cc:145] XLA service 0x7bfa84008360 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733944337.506557     137 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1733944337.506561     137 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5


[1m 114/5099[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6s[0m 1ms/step - accuracy: 0.9263 - loss: 0.4810

I0000 00:00:1733944341.016759     137 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m5099/5099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - accuracy: 0.9447 - loss: 0.1532 - val_accuracy: 0.9519 - val_loss: 0.1114
Epoch 2/10
[1m5099/5099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9540 - loss: 0.1064 - val_accuracy: 0.9570 - val_loss: 0.1035
Epoch 3/10
[1m5099/5099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9555 - loss: 0.1031 - val_accuracy: 0.9576 - val_loss: 0.1018
Epoch 4/10
[1m5099/5099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9569 - loss: 0.1006 - val_accuracy: 0.9588 - val_loss: 0.1008
Epoch 5/10
[1m5099/5099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9577 - loss: 0.0985 - val_accuracy: 0.9591 - val_loss: 0.0996
Epoch 6/10
[1m5099/5099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9589 - loss: 0.0949 - val_accuracy: 0.9582 - val_loss: 0.1004
Epoch 7/10
[1m5099/5099[