In [30]:
import pandas as pd
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report
import joblib

In [31]:
# Read the CSV file
df = pd.read_csv('dataset.csv')

df.head()

Unnamed: 0,id,text,lemma,upos,xpos,head,deprel,start_char,end_char
0,1,اس,یہ,DET,DEM,2,det,0,2
1,2,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8
2,3,کی,کا,ADP,PSP,2,case,9,11
3,4,دیگر,دیگر,ADJ,JJ,5,amod,12,16
4,5,اقساط,اقساط,NOUN,NN,7,nsubj,17,22


In [32]:
df.drop(columns=['id'])

Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char
0,اس,یہ,DET,DEM,2,det,0,2
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8
2,کی,کا,ADP,PSP,2,case,9,11
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22
...,...,...,...,...,...,...,...,...
254923,طرح,طرح,NOUN,NN,58,obl,1130329,1130332
254924,نکالا,نکال,VERB,VM,44,obj,1130333,1130338
254925,جاسکتا,جاسک,AUX,VAUX,58,aux,1130339,1130345
254926,ہے,ہے,AUX,VAUX,58,aux,1130346,1130348


In [33]:
import numpy as np

# Create a new column 'y' with default value 'S_M'
df['y'] = 'S_M'

# Iterate through the rows to assign 'S_E' and 'S_B'
for i in range(len(df) - 1):
    # Check if the current word ends with a full stop
    if df.loc[i, 'text'].endswith('۔'):
        df.loc[i, 'y'] = 'S_E'  # Sentence End
        # Assign 'S_B' to the next word
        if i + 1 < len(df):
            df.loc[i + 1, 'y'] = 'S_B'  # Sentence Beginning

# Convert 'y' column to categorical type (optional, for ML efficiency)
df['y'] = df['y'].astype('category')

# Display the first few rows to verify
df.head(10)

Unnamed: 0,id,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,1,اس,یہ,DET,DEM,2,det,0,2,S_M
1,2,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,S_M
2,3,کی,کا,ADP,PSP,2,case,9,11,S_M
3,4,دیگر,دیگر,ADJ,JJ,5,amod,12,16,S_M
4,5,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,S_M
5,6,یہاں,یہاں,PRON,PRP,7,obl,23,27,S_M
6,7,پڑھیے,پڑھ,VERB,VM,0,root,28,33,S_M
7,8,۔,۔,PUNCT,SYM,7,punct,33,34,S_E
8,1,یہ,یہ,PRON,PRP,3,nsubj,36,38,S_B
9,2,کیسے,کیسا,PRON,WQ,3,advmod,39,43,S_M


In [34]:
# Just to be sure, rename label column
df['label'] = df['y'] 

In [37]:
# Drop rows where the 'text' column contains only punctuation
df= df[~df['text'].str.contains(r'^[^\w\s]+$', na=False)]



In [38]:
df.head(10)

Unnamed: 0,id,text,lemma,upos,xpos,head,deprel,start_char,end_char,y,label
0,1,اس,یہ,DET,DEM,2,det,0,2,S_M,S_M
1,2,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,S_M,S_M
2,3,کی,کا,ADP,PSP,2,case,9,11,S_M,S_M
3,4,دیگر,دیگر,ADJ,JJ,5,amod,12,16,S_M,S_M
4,5,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,S_M,S_M
5,6,یہاں,یہاں,PRON,PRP,7,obl,23,27,S_M,S_M
6,7,پڑھیے,پڑھ,VERB,VM,0,root,28,33,S_M,S_M
8,1,یہ,یہ,PRON,PRP,3,nsubj,36,38,S_B,S_B
9,2,کیسے,کیسا,PRON,WQ,3,advmod,39,43,S_M,S_M
10,3,ممکن,ممکن,ADJ,JJ,0,root,44,48,S_M,S_M


In [40]:
from sklearn.model_selection import train_test_split

# Define the feature matrix (X) and target variable (y)
X = df.drop(columns=['y'])
y = df['y']

# Split the data into training (64%), validation (16%), and test (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.36, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.56, random_state=42, stratify=y_temp)

# Verify split
print("Training Set Size:", len(X_train))
print("Validation Set Size:", len(X_val))
print("Test Set Size:", len(X_test))

Training Set Size: 154103
Validation Set Size: 38140
Test Set Size: 48544


In [41]:
# STEP 3: Feature Extraction

def word2features(row, i, df):
    features = {
        'word': row['text'],
        'lemma': row['lemma'],
        'upos': row['upos'],
        'xpos': row['xpos'],
        'deprel': row['deprel'],
        'word.lower()': row['text'].lower(),
        'is_digit': row['text'].isdigit(),
        'char_len': len(row['text']),
    }

    if i > 0:
        prev_row = df.iloc[i - 1]
        features.update({
            '-1:word': prev_row['text'],
            '-1:upos': prev_row['upos'],
            '-1:xpos': prev_row['xpos'],
        })
    else:
        features['BOS'] = True

    if i < len(df) - 1:
        next_row = df.iloc[i + 1]
        features.update({
            '+1:word': next_row['text'],
            '+1:upos': next_row['upos'],
            '+1:xpos': next_row['xpos'],
        })
    else:
        features['EOS'] = True

    return features


In [42]:
# convert the DataFrame to Sentence-Like Format for CRF
def dataframe_to_sequences(X_df, y_df):
    sequences_X, sequences_y = [], []
    current_X, current_y = [], []

    for i, (idx, row) in enumerate(X_df.iterrows()):
        features = word2features(row, i, X_df)
        current_X.append(features)
        current_y.append(y_df.iloc[i])

        if y_df.iloc[i] == 'S_B' and len(current_X) > 1:
            sequences_X.append(current_X)
            sequences_y.append(current_y)
            current_X, current_y = [], []

    if current_X:
        sequences_X.append(current_X)
        sequences_y.append(current_y)

    return sequences_X, sequences_y

# Prepare sequences for train/val/test
X_train_seq, y_train_seq = dataframe_to_sequences(X_train, y_train)
X_val_seq, y_val_seq = dataframe_to_sequences(X_val, y_val)
X_test_seq, y_test_seq = dataframe_to_sequences(X_test, y_test)


In [43]:
# Train the CRF
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report

# Initialize and train CRF
crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

crf.fit(X_train_seq, y_train_seq)


In [44]:
#Evaluate on Validation and Test Sets
print("Validation Results:")
y_val_pred = crf.predict(X_val_seq)
print(flat_classification_report(y_val_seq, y_val_pred, labels=['S_B', 'S_M']))

print("\nTest Results:")
y_test_pred = crf.predict(X_test_seq)
print(flat_classification_report(y_test_seq, y_test_pred, labels=['S_B', 'S_M']))



Validation Results:
              precision    recall  f1-score   support

         S_B       1.00      0.96      0.98      1427
         S_M       1.00      1.00      1.00     36713

    accuracy                           1.00     38140
   macro avg       1.00      0.98      0.99     38140
weighted avg       1.00      1.00      1.00     38140


Test Results:
              precision    recall  f1-score   support

         S_B       1.00      0.97      0.98      1817
         S_M       1.00      1.00      1.00     46727

    accuracy                           1.00     48544
   macro avg       1.00      0.98      0.99     48544
weighted avg       1.00      1.00      1.00     48544

