### Other Methods -1 (with 2000 max features)


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score

In [4]:
# Read the CSV file
data = pd.read_csv('/kaggle/input/sbd-data/dataset.csv')

data.head()

Unnamed: 0,id,text,lemma,upos,xpos,head,deprel,start_char,end_char
0,1,اس,یہ,DET,DEM,2,det,0,2
1,2,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8
2,3,کی,کا,ADP,PSP,2,case,9,11
3,4,دیگر,دیگر,ADJ,JJ,5,amod,12,16
4,5,اقساط,اقساط,NOUN,NN,7,nsubj,17,22


In [5]:
data = data.drop(columns=['id'])

In [6]:
import string
import re

In [7]:
# Initialize a new column 'y' with the default value 'S_M'
data['y'] = 'S_M'

# Iterate through the rows to assign 'S_B'
for i in range(len(data) - 1):
    # Check if the current word ends with a full stop
    if data.loc[i, 'text'].endswith('۔'):
        # Assign 'S_B' to the next word
        if i + 1 < len(data):
            data.loc[i + 1, 'y'] = 'S_B'  # Sentence Beginning

# Convert 'y' column to categorical type (optional, for ML efficiency)
data['y'] = data['y'].astype('category')

# Map categorical labels to numeric values
label_mapping = {'S_B': 1, 'S_M': 0}
data['y'] = data['y'].map(label_mapping)

# Verify the result
data.head(10)

Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,0
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,0
2,کی,کا,ADP,PSP,2,case,9,11,0
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,0
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,0
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,0
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,0
7,۔,۔,PUNCT,SYM,7,punct,33,34,0
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,1
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,0


In [8]:
# Drop rows where the 'text' column contains only punctuation
data = data[~data['text'].str.contains(r'^[^\w\s]+$', na=False)]

# Verify the result
data.head(10)


Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,0
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,0
2,کی,کا,ADP,PSP,2,case,9,11,0
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,0
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,0
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,0
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,0
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,1
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,0
10,ممکن,ممکن,ADJ,JJ,0,root,44,48,0


In [9]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [10]:
# One-hot encode 'upos', 'xpos', and 'deprel'
encoder = OneHotEncoder(sparse_output=False)
encoded_cats = encoder.fit_transform(data[['upos', 'xpos', 'deprel']])

# Convert to DataFrame for easier merging
encoded_cats_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out())

# Concatenate encoded features back to the dataset
data = pd.concat([data.reset_index(drop=True), encoded_cats_df], axis=1)

# Drop the original categorical columns (optional)
data = data.drop(columns=['upos', 'xpos', 'deprel'])

In [11]:
# Select the numerical features to normalize
numerical_features = ['start_char', 'end_char', 'head']

from sklearn.preprocessing import MinMaxScaler

# Option 2: Min-Max Scaling (scales features to a range, typically 0 to 1)
min_max_scaler = MinMaxScaler()
data[numerical_features] = min_max_scaler.fit_transform(data[numerical_features])

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=2000)  # Adjust max_features as needed

# Fit and transform only the text column
tfidf_features = tfidf_vectorizer.fit_transform(data['text'])

# Convert the sparse matrix to a DataFrame for better integration
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Add the TF-IDF features back to the original DataFrame
data = pd.concat([data.reset_index(drop=True), tfidf_df], axis=1)

# Drop the original text and lemma columns 
data = data.drop(columns=['text', 'lemma'])

In [13]:
data.head()

Unnamed: 0,head,start_char,end_char,y,upos_ADJ,upos_ADP,upos_ADV,upos_AUX,upos_CCONJ,upos_DET,...,یقینی,یونانی,یونیورسٹی,یوں,یک,یکسانیت,یہ,یہاں,یہی,یہیں
0,0.014493,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.036232,3e-06,5e-06,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.014493,8e-06,8e-06,0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.036232,1.1e-05,1.2e-05,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.050725,1.5e-05,1.8e-05,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Define the feature matrix (drop 'y') and target
X = data.drop(columns=['y'])
y = data['y']


In [15]:
# Split into training (64%), validation (16%), and test (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.36, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.56, random_state=42, stratify=y_temp)

### XGB

In [14]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

In [15]:
# Initialize XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Fit the model
xgb_model.fit(X_train, y_train)

# Predict on validation and test sets
y_train_pred_xgb = xgb_model.predict(X_train)
y_val_pred_xgb = xgb_model.predict(X_val)
y_test_pred_xgb = xgb_model.predict(X_test)

# Evaluate XGBoost
print("XGBoost - Train Set:")
print(classification_report(y_train, y_train_pred_xgb))
print("XGBoost - Validation Set:")
print(classification_report(y_val, y_val_pred_xgb))
print("XGBoost - Test Set:")
print(classification_report(y_test, y_test_pred_xgb))

XGBoost - Train Set:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    148335
           1       0.82      0.57      0.68      5768

    accuracy                           0.98    154103
   macro avg       0.90      0.78      0.83    154103
weighted avg       0.98      0.98      0.98    154103

XGBoost - Validation Set:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     36713
           1       0.79      0.56      0.65      1427

    accuracy                           0.98     38140
   macro avg       0.89      0.78      0.82     38140
weighted avg       0.98      0.98      0.98     38140

XGBoost - Test Set:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     46727
           1       0.79      0.55      0.65      1817

    accuracy                           0.98     48544
   macro avg       0.89      0.77      0.82     48544
weighte

### Decision Tree

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
from sklearn import tree

# Initialize the Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42, max_depth=10)  
# Fit the model on the training set
dt_model.fit(X_train, y_train)

# Predict on the validation and test sets
#also predict on the training set to see how well the model is doing
y_train_pred = dt_model.predict(X_train)
y_val_pred = dt_model.predict(X_val)
y_test_pred = dt_model.predict(X_test)

# Evaluate the model
print("Decision Tree - Train Set:")
print(classification_report(y_train, y_train_pred))
print("Decision Tree - Validation Set:")
print(classification_report(y_val, y_val_pred))

print("Decision Tree - Test Set:")
print(classification_report(y_test, y_test_pred))

Decision Tree - Train Set:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99    148335
           1       0.80      0.57      0.67      5768

    accuracy                           0.98    154103
   macro avg       0.89      0.78      0.83    154103
weighted avg       0.98      0.98      0.98    154103

Decision Tree - Validation Set:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     36713
           1       0.76      0.53      0.62      1427

    accuracy                           0.98     38140
   macro avg       0.87      0.76      0.80     38140
weighted avg       0.97      0.98      0.97     38140

Decision Tree - Test Set:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     46727
           1       0.76      0.55      0.63      1817

    accuracy                           0.98     48544
   macro avg       0.87      0.77      0.81

### RandomForest

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
# Initialize Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model
rf_model.fit(X_train, y_train)

# Predict on validation and test sets
# also predict on the training set to see how well the model is doing
y_train_pred_rf = rf_model.predict(X_train)
y_val_pred_rf = rf_model.predict(X_val)
y_test_pred_rf = rf_model.predict(X_test)

# Evaluate Random Forest
print("Random Forest - Train Set:")
print(classification_report(y_train, y_train_pred_rf))
print("Random Forest - Validation Set:")
print(classification_report(y_val, y_val_pred_rf))
print("Random Forest - Test Set:")
print(classification_report(y_test, y_test_pred_rf))

Random Forest - Train Set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    148335
           1       1.00      1.00      1.00      5768

    accuracy                           1.00    154103
   macro avg       1.00      1.00      1.00    154103
weighted avg       1.00      1.00      1.00    154103

Random Forest - Validation Set:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     36713
           1       0.73      0.59      0.65      1427

    accuracy                           0.98     38140
   macro avg       0.86      0.79      0.82     38140
weighted avg       0.97      0.98      0.98     38140

Random Forest - Test Set:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     46727
           1       0.71      0.59      0.64      1817

    accuracy                           0.98     48544
   macro avg       0.85      0.79      0.82

### Feedforward Network

In [20]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score, classification_report

# Define a simple DNN model
def create_dnn_model(input_dim, num_classes):
    model = keras.Sequential([
        layers.InputLayer(input_shape=(input_dim,)),  # Corrected input layer definition
        layers.Dense(128, activation='relu'),  # Hidden layer with ReLU activation
        layers.Dropout(0.2),  # Dropout for regularization
        layers.Dense(64, activation='relu'),  # Another hidden layer
        layers.Dropout(0.2),  # Dropout for regularization
        layers.Dense(num_classes, activation='softmax')  # Output layer with softmax for multi-class classification
    ])
    
    model.compile(optimizer='adam', 
                  loss='sparse_categorical_crossentropy',  # Use sparse categorical crossentropy for integer labels
                  metrics=['accuracy'])
    
    return model

# Ensure labels are integer encoded if they are categorical
y_train = y_train.astype('int') if y_train.dtype.name == 'category' else y_train
y_val = y_val.astype('int') if y_val.dtype.name == 'category' else y_val
y_test = y_test.astype('int') if y_test.dtype.name == 'category' else y_test

# Create the DNN model
input_dim = X_train.shape[1]  # Number of features
num_classes = len(y_train.unique())  # Number of output classes ( S_B, S_M)

dnn_model = create_dnn_model(input_dim, num_classes)

# Train the DNN model
dnn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model on the validation set
val_loss, val_accuracy = dnn_model.evaluate(X_val, y_val, verbose=0)
print(f'Validation Accuracy: {val_accuracy:.4f}')

# Make predictions on the validation set
y_val_pred = dnn_model.predict(X_val)
y_val_pred = y_val_pred.argmax(axis=1)  # Get the predicted class labels

# Evaluate the model on the validation set
print("Classification Report (Validation):\n", classification_report(y_val, y_val_pred))

# Make predictions on the test set (optional)
y_test_pred = dnn_model.predict(X_test)
y_test_pred = y_test_pred.argmax(axis=1)

# Evaluate the model on the test set
test_loss, test_accuracy = dnn_model.evaluate(X_test, y_test, verbose=0)
print(f'Test Accuracy: {test_accuracy:.4f}')
print("Classification Report (Test):\n", classification_report(y_test, y_test_pred))





Epoch 1/10


I0000 00:00:1735764615.750428     103 service.cc:145] XLA service 0x7985e00043e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1735764615.750472     103 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1735764615.750477     103 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5


[1m 111/4816[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6s[0m 1ms/step - accuracy: 0.9014 - loss: 0.3636

I0000 00:00:1735764619.116392     103 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m4816/4816[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - accuracy: 0.9610 - loss: 0.1203 - val_accuracy: 0.9688 - val_loss: 0.0821
Epoch 2/10
[1m4816/4816[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9679 - loss: 0.0816 - val_accuracy: 0.9683 - val_loss: 0.0784
Epoch 3/10
[1m4816/4816[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9722 - loss: 0.0718 - val_accuracy: 0.9728 - val_loss: 0.0718
Epoch 4/10
[1m4816/4816[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9739 - loss: 0.0667 - val_accuracy: 0.9741 - val_loss: 0.0684
Epoch 5/10
[1m4816/4816[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - accuracy: 0.9759 - loss: 0.0619 - val_accuracy: 0.9744 - val_loss: 0.0661
Epoch 6/10
[1m4816/4816[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9765 - loss: 0.0591 - val_accuracy: 0.9757 - val_loss: 0.0635
Epoch 7/10
[1m4816/4816[

### Logistic Regression

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

In [2]:
# Apply SMOTE to the training data
#smote = SMOTE(random_state=42)
#X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

NameError: name 'X_train' is not defined

In [18]:
# Train the logistic regression model
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_train_pred = logistic_model.predict(X_train)
y_val_pred = logistic_model.predict(X_val)
y_test_pred = logistic_model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
print("Logistic Regression - Training Set:")
print(classification_report(y_train, y_train_pred))
print("Logistic Regression - Validation Set:")
print(classification_report(y_val, y_val_pred))
print("Logistic Regression - test Set:")
print(classification_report(y_test, y_test_pred))

Logistic Regression - Training Set:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98    148335
           1       0.76      0.29      0.42      5768

    accuracy                           0.97    154103
   macro avg       0.86      0.64      0.70    154103
weighted avg       0.96      0.97      0.96    154103

Logistic Regression - Validation Set:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     36713
           1       0.72      0.27      0.39      1427

    accuracy                           0.97     38140
   macro avg       0.84      0.63      0.69     38140
weighted avg       0.96      0.97      0.96     38140

Logistic Regression - test Set:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     46727
           1       0.70      0.27      0.39      1817

    accuracy                           0.97     48544
   macro avg       0.8