### Approach 5: (without puntuation, with added features)

Decision Tree, XGB , Random forest, logistic regression

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Read the CSV file
data = pd.read_csv('/kaggle/input/sbd-data/dataset.csv')

data.head()

Unnamed: 0,id,text,lemma,upos,xpos,head,deprel,start_char,end_char
0,1,اس,یہ,DET,DEM,2,det,0,2
1,2,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8
2,3,کی,کا,ADP,PSP,2,case,9,11
3,4,دیگر,دیگر,ADJ,JJ,5,amod,12,16
4,5,اقساط,اقساط,NOUN,NN,7,nsubj,17,22


In [3]:
data = data.drop(columns=['id'])

In [4]:
import string
import re

In [5]:
# Initialize a new column 'y' with the default value 'S_M'
data['y'] = 'S_M'

# Iterate through the rows to assign 'S_B'
for i in range(len(data) - 1):
    # Check if the current word ends with a full stop
    if data.loc[i, 'text'].endswith('۔'):
        # Assign 'S_B' to the next word
        if i + 1 < len(data):
            data.loc[i + 1, 'y'] = 'S_B'  # Sentence Beginning

# Convert 'y' column to categorical type (optional, for ML efficiency)
data['y'] = data['y'].astype('category')

# Map categorical labels to numeric values
label_mapping = {'S_B': 1, 'S_M': 0}
data['y'] = data['y'].map(label_mapping)

# Verify the result
data.head(10)

Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,0
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,0
2,کی,کا,ADP,PSP,2,case,9,11,0
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,0
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,0
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,0
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,0
7,۔,۔,PUNCT,SYM,7,punct,33,34,0
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,1
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,0


In [6]:
# Drop rows where the 'text' column contains only punctuation
data = data[~data['text'].str.contains(r'^[^\w\s]+$', na=False)]

# Verify the result
data.head(10)


Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,0
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,0
2,کی,کا,ADP,PSP,2,case,9,11,0
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,0
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,0
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,0
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,0
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,1
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,0
10,ممکن,ممکن,ADJ,JJ,0,root,44,48,0


In [7]:
from sklearn.model_selection import train_test_split

# Define the feature matrix (X) and target variable (y)
X = data.drop(columns=['y'])
y = data['y']

# Split the data into training (64%), validation (16%), and test (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.36, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.56, random_state=42, stratify=y_temp)

# Verify split
print("Training Set Size:", len(X_train))
print("Validation Set Size:", len(X_val))
print("Test Set Size:", len(X_test))

Training Set Size: 154103
Validation Set Size: 38140
Test Set Size: 48544


In [8]:
# Define a function to add previous and next word features
def add_context_features(data):
    # Previous word features
    data['prev_text'] = data['text'].shift(1)
    data['prev_lemma'] = data['lemma'].shift(1)
    data['prev_upos'] = data['upos'].shift(1)
    data['prev_xpos'] = data['xpos'].shift(1)
    data['prev_head'] = data['head'].shift(1)
    data['prev_deprel'] = data['deprel'].shift(1)
    data['prev_start_char'] = data['start_char'].shift(1)
    data['prev_end_char'] = data['end_char'].shift(1)
    
    # Next word features
    data['next_text'] = data['text'].shift(-1)
    data['next_lemma'] = data['lemma'].shift(-1)
    data['next_upos'] = data['upos'].shift(-1)
    data['next_xpos'] = data['xpos'].shift(-1)
    data['next_head'] = data['head'].shift(-1)
    data['next_deprel'] = data['deprel'].shift(-1)
    data['next_start_char'] = data['start_char'].shift(-1)
    data['next_end_char'] = data['end_char'].shift(-1)
    
    # Fill NaN values for edge cases
    data.fillna(method='bfill', axis=0, inplace=True)  # Backfill for next words
    data.fillna(method='ffill', axis=0, inplace=True)  # Forward fill for previous words
    
    return data

# Apply the function separately to each dataset
X_train = add_context_features(X_train)
X_val = add_context_features(X_val)
X_test = add_context_features(X_test)

# Verify by checking the first few rows of one dataset
X_train.head()

  data.fillna(method='bfill', axis=0, inplace=True)  # Backfill for next words
  data.fillna(method='ffill', axis=0, inplace=True)  # Forward fill for previous words
  data.fillna(method='bfill', axis=0, inplace=True)  # Backfill for next words
  data.fillna(method='ffill', axis=0, inplace=True)  # Forward fill for previous words
  data.fillna(method='bfill', axis=0, inplace=True)  # Backfill for next words
  data.fillna(method='ffill', axis=0, inplace=True)  # Forward fill for previous words


Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,prev_text,prev_lemma,...,prev_start_char,prev_end_char,next_text,next_lemma,next_upos,next_xpos,next_head,next_deprel,next_start_char,next_end_char
15824,آپ,آپ,PRON,PRP,40,nsubj,71264,71266,آپ,آپ,...,71264.0,71266.0,کی,کا,ADP,PSP,1.0,case,683744.0,683746.0
154359,کی,کا,ADP,PSP,1,case,683744,683746,آپ,آپ,...,71264.0,71266.0,ٹوئسٹ,ٹوئسٹ,NOUN,NN,33.0,obl,685463.0,685468.0
154738,ٹوئسٹ,ٹوئسٹ,NOUN,NN,33,obl,685463,685468,کی,کا,...,683744.0,683746.0,جدید,جدید,ADJ,JJ,16.0,amod,680277.0,680281.0
153576,جدید,جدید,ADJ,JJ,16,amod,680277,680281,ٹوئسٹ,ٹوئسٹ,...,685463.0,685468.0,تھی,تھا,AUX,VAUX,18.0,aux,27346.0,27349.0
6068,تھی,تھا,AUX,VAUX,18,aux,27346,27349,جدید,جدید,...,680277.0,680281.0,گھر,گھر,NOUN,NN,28.0,nmod,1115470.0,1115473.0


In [9]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize OneHotEncoder with 'handle_unknown="ignore"'
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
min_max_scaler = MinMaxScaler()

# 1. One-Hot Encoding for 'upos', 'xpos', 'deprel' (including for next and previous words)
encoded_cats_train = encoder.fit_transform(X_train[['prev_upos', 'prev_xpos', 'prev_deprel', 
                                                   'upos', 'xpos', 'deprel', 
                                                   'next_upos', 'next_xpos', 'next_deprel']])

encoded_cats_val = encoder.transform(X_val[['prev_upos', 'prev_xpos', 'prev_deprel', 
                                            'upos', 'xpos', 'deprel', 
                                            'next_upos', 'next_xpos', 'next_deprel']])

encoded_cats_test = encoder.transform(X_test[['prev_upos', 'prev_xpos', 'prev_deprel', 
                                              'upos', 'xpos', 'deprel', 
                                              'next_upos', 'next_xpos', 'next_deprel']])

# Convert to DataFrame
encoded_cats_train_df = pd.DataFrame(encoded_cats_train, columns=encoder.get_feature_names_out())
encoded_cats_val_df = pd.DataFrame(encoded_cats_val, columns=encoder.get_feature_names_out())
encoded_cats_test_df = pd.DataFrame(encoded_cats_test, columns=encoder.get_feature_names_out())

# Concatenate the encoded features back to the original datasets
X_train = pd.concat([X_train.reset_index(drop=True), encoded_cats_train_df], axis=1)
X_val = pd.concat([X_val.reset_index(drop=True), encoded_cats_val_df], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), encoded_cats_test_df], axis=1)

# Drop original categorical columns
X_train = X_train.drop(columns=['prev_upos', 'prev_xpos', 'prev_deprel', 'upos', 'xpos', 'deprel', 
                                'next_upos', 'next_xpos', 'next_deprel'])
X_val = X_val.drop(columns=['prev_upos', 'prev_xpos', 'prev_deprel', 'upos', 'xpos', 'deprel', 
                            'next_upos', 'next_xpos', 'next_deprel'])
X_test = X_test.drop(columns=['prev_upos', 'prev_xpos', 'prev_deprel', 'upos', 'xpos', 'deprel', 
                              'next_upos', 'next_xpos', 'next_deprel'])



In [10]:
# 2. Min-Max Scaling for numerical features
numerical_features = ['start_char', 'end_char', 'head', 
                      'prev_start_char', 'prev_end_char', 
                      'next_start_char', 'next_end_char']

X_train[numerical_features] = min_max_scaler.fit_transform(X_train[numerical_features])
X_val[numerical_features] = min_max_scaler.transform(X_val[numerical_features])
X_test[numerical_features] = min_max_scaler.transform(X_test[numerical_features])

# 3. TF-IDF Vectorization for text and lemma features
X_train['text_lemma_prev_next'] = (X_train['prev_text'] + " " + X_train['prev_lemma'] + " " +
                                   X_train['text'] + " " + X_train['lemma'] + " " +
                                   X_train['next_text'] + " " + X_train['next_lemma'])

X_val['text_lemma_prev_next'] = (X_val['prev_text'] + " " + X_val['prev_lemma'] + " " +
                                 X_val['text'] + " " + X_val['lemma'] + " " +
                                 X_val['next_text'] + " " + X_val['next_lemma'])

X_test['text_lemma_prev_next'] = (X_test['prev_text'] + " " + X_test['prev_lemma'] + " " +
                                  X_test['text'] + " " + X_test['lemma'] + " " +
                                  X_test['next_text'] + " " + X_test['next_lemma'])

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=500)

# Fit and transform on training set
tfidf_train = tfidf_vectorizer.fit_transform(X_train['text_lemma_prev_next'])
tfidf_val = tfidf_vectorizer.transform(X_val['text_lemma_prev_next'])
tfidf_test = tfidf_vectorizer.transform(X_test['text_lemma_prev_next'])

# Convert sparse matrices to DataFrames
tfidf_train_df = pd.DataFrame(tfidf_train.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_val_df = pd.DataFrame(tfidf_val.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_test_df = pd.DataFrame(tfidf_test.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate TF-IDF features back to the datasets
X_train = pd.concat([X_train.reset_index(drop=True), tfidf_train_df], axis=1)
X_val = pd.concat([X_val.reset_index(drop=True), tfidf_val_df], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), tfidf_test_df], axis=1)

# Drop original text and lemma columns (optional)
X_train = X_train.drop(columns=['text', 'lemma', 'text_lemma_prev_next', 'prev_text', 'prev_lemma', 'next_text', 'next_lemma'])
X_val = X_val.drop(columns=['text', 'lemma', 'text_lemma_prev_next', 'prev_text', 'prev_lemma', 'next_text', 'next_lemma'])
X_test = X_test.drop(columns=['text', 'lemma', 'text_lemma_prev_next', 'prev_text', 'prev_lemma', 'next_text', 'next_lemma'])


In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize the Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42, max_depth=10)

# Train the model on the training set
dt_model.fit(X_train, y_train)

# Make predictions on validation and test sets
y_val_pred = dt_model.predict(X_val)
y_test_pred = dt_model.predict(X_test)

# Evaluate the model
print("Decision Tree - Validation Set:")
print(classification_report(y_val, y_val_pred))

print("Decision Tree - Test Set:")
print(classification_report(y_test, y_test_pred))


Decision Tree - Validation Set:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     36713
           1       0.75      0.53      0.62      1427

    accuracy                           0.98     38140
   macro avg       0.87      0.76      0.80     38140
weighted avg       0.97      0.98      0.97     38140

Decision Tree - Test Set:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     46727
           1       0.74      0.53      0.62      1817

    accuracy                           0.98     48544
   macro avg       0.86      0.76      0.80     48544
weighted avg       0.97      0.98      0.97     48544



In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model
rf_model.fit(X_train, y_train)

# Predict on validation and test sets
y_val_pred_rf = rf_model.predict(X_val)
y_test_pred_rf = rf_model.predict(X_test)

# Evaluate Random Forest
print("Random Forest - Validation Set:")
print(classification_report(y_val, y_val_pred_rf))
print("Random Forest - Test Set:")
print(classification_report(y_test, y_test_pred_rf))

Random Forest - Validation Set:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     36713
           1       0.83      0.38      0.52      1427

    accuracy                           0.97     38140
   macro avg       0.90      0.69      0.75     38140
weighted avg       0.97      0.97      0.97     38140

Random Forest - Test Set:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     46727
           1       0.83      0.39      0.53      1817

    accuracy                           0.97     48544
   macro avg       0.90      0.69      0.76     48544
weighted avg       0.97      0.97      0.97     48544



In [13]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
# Initialize XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Fit the model
xgb_model.fit(X_train, y_train)

# Predict on validation and test sets
y_val_pred_xgb = xgb_model.predict(X_val)
y_test_pred_xgb = xgb_model.predict(X_test)

# Evaluate XGBoost
print("XGBoost - Validation Set:")
print(classification_report(y_val, y_val_pred_xgb))
print("XGBoost - Test Set:")
print(classification_report(y_test, y_test_pred_xgb))

XGBoost - Validation Set:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     36713
           1       0.78      0.51      0.62      1427

    accuracy                           0.98     38140
   macro avg       0.88      0.75      0.80     38140
weighted avg       0.97      0.98      0.97     38140

XGBoost - Test Set:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     46727
           1       0.77      0.52      0.62      1817

    accuracy                           0.98     48544
   macro avg       0.88      0.76      0.80     48544
weighted avg       0.97      0.98      0.97     48544



In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [17]:
# Train the logistic regression model
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train_resampled, y_train_resampled)

# Make predictions and evaluate the model
y_train_pred = logistic_model.predict(X_train_resampled)
y_val_pred = logistic_model.predict(X_val)
y_test_pred = logistic_model.predict(X_test)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# Evaluate the Logistic Regression model on all datasets
print("Logistic Regression - Training Set:")
print(classification_report(y_train_resampled, y_train_pred))

print("Logistic Regression - Validation Set:")
print(classification_report(y_val, y_val_pred))

print("Logistic Regression - Test Set:")
print(classification_report(y_test, y_test_pred))


Logistic Regression - Training Set:
              precision    recall  f1-score   support

           0       0.91      0.81      0.86    148335
           1       0.83      0.92      0.88    148335

    accuracy                           0.87    296670
   macro avg       0.87      0.87      0.87    296670
weighted avg       0.87      0.87      0.87    296670

Logistic Regression - Validation Set:
              precision    recall  f1-score   support

           0       1.00      0.81      0.90     36713
           1       0.16      0.91      0.27      1427

    accuracy                           0.82     38140
   macro avg       0.58      0.86      0.58     38140
weighted avg       0.96      0.82      0.87     38140

Logistic Regression - Test Set:
              precision    recall  f1-score   support

           0       0.99      0.81      0.90     46727
           1       0.16      0.88      0.26      1817

    accuracy                           0.82     48544
   macro avg       0.5