### Approach: CRF - 2
#### Since the CRF approach gave a very high score, I am now checking for potential data leaks.
#### In this method, no label is used for sentence splitting, each sentence is kept intact, and the train/val/test sets do not share sentence boundaries.





In [32]:
import pandas as pd
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report
import joblib

In [33]:
import pandas as pd
import re

# Load the data
df = pd.read_csv('dataset.csv')

# Drop unwanted bookkeeping columns (like 'id' or a saved index column) if they exist.
# Column names are lowered before the comparison, so the names listed here must be
# lowercase too; a mixed-case entry such as 'Unnamed: 0' could never match.
unwanted_columns = {'id', 'unnamed: 0'}
df = df.drop(columns=[col for col in df.columns if str(col).lower() in unwanted_columns], errors='ignore')



In [34]:
# Show the first 10 rows of the loaded frame as this cell's output
df.head(10)

Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char
0,اس,یہ,DET,DEM,2,det,0,2
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8
2,کی,کا,ADP,PSP,2,case,9,11
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22
5,یہاں,یہاں,PRON,PRP,7,obl,23,27
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33
7,۔,۔,PUNCT,SYM,7,punct,33,34
8,یہ,یہ,PRON,PRP,3,nsubj,36,38
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43


In [35]:
# step 1: Preprocess the data
# Split into Sentences Using Punctuation (۔)
# A sentence ends on (and includes) every token whose stripped text is '۔'.
# Boundaries are computed once and each sentence is taken as a positional slice of
# `df`, which keeps the original row index and column dtypes and avoids rebuilding
# frames from per-row Series.
is_terminator = df['text'].map(lambda token: str(token).strip() == '۔').tolist()

# Exclusive end position of each sentence (one past its terminator token)
sentence_ends = [pos + 1 for pos, ends_here in enumerate(is_terminator) if ends_here]

# Tokens after the last terminator form a final sentence without punctuation
if len(df) > 0 and (not sentence_ends or sentence_ends[-1] != len(df)):
    sentence_ends.append(len(df))

sentence_starts = [0] + sentence_ends[:-1]

# .copy() keeps every sentence an independent frame rather than a view of `df`
sentences = [
    df.iloc[start:stop].copy()
    for start, stop in zip(sentence_starts, sentence_ends)
]


In [36]:
# Preview the first 10 sentences (up to 10 tokens each), each followed by a divider
for preview_frame in sentences[:10]:
    print(preview_frame.head(10), "-----", sep="\n")

    text  lemma   upos xpos  head deprel  start_char  end_char
0     اس     یہ    DET  DEM     2    det           0         2
1  سلسلے  سلسلہ   NOUN   NN     5   nmod           3         8
2     کی     کا    ADP  PSP     2   case           9        11
3   دیگر   دیگر    ADJ   JJ     5   amod          12        16
4  اقساط  اقساط   NOUN   NN     7  nsubj          17        22
5   یہاں   یہاں   PRON  PRP     7    obl          23        27
6  پڑھیے    پڑھ   VERB   VM     0   root          28        33
7      ۔      ۔  PUNCT  SYM     7  punct          33        34
-----
       text    lemma   upos xpos  head  deprel  start_char  end_char
8        یہ       یہ   PRON  PRP     3   nsubj          36        38
9      کیسے     کیسا   PRON   WQ     3  advmod          39        43
10     ممکن     ممکن    ADJ   JJ     0    root          44        48
11       ہے       ہے    AUX   VM     3     cop          49        51
12       کہ       کہ  SCONJ   CC    12    mark          52        54
13      کسی  

In [37]:
# Label the Sentences with S_B and S_M
def label_sentence(df_sentence):
    """Return a copy of one sentence frame with a token-level 'label' column.

    The first row is labelled 'S_B' and every following row 'S_M'. The input
    frame is not modified.

    Parameters
    ----------
    df_sentence : pandas.DataFrame
        Rows of a single sentence, in order. May be empty.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_sentence`` with the added 'label' column. An empty input
        yields an empty frame with an empty 'label' column.
    """
    labeled = df_sentence.copy()
    n_tokens = len(labeled)
    # Emit 'S_B' only when there is a first token, so an empty frame gets zero
    # labels instead of a length mismatch on assignment.
    labeled['label'] = ['S_B'] * min(n_tokens, 1) + ['S_M'] * max(n_tokens - 1, 0)
    return labeled

# Apply the S_B / S_M labelling to every sentence frame, preserving order
labeled_sentences = list(map(label_sentence, sentences))


In [38]:
# Split Sentences into Train / Val / Test
from sklearn.model_selection import train_test_split

# Whole sentences are the unit being split, so no sentence is divided across sets.
HOLDOUT_FRACTION = 0.36        # share of sentences kept out of training
TEST_SHARE_OF_HOLDOUT = 0.56   # share of the held-out sentences that go to test
SPLIT_SEED = 42                # fixed seed so both splits are reproducible

# Stage 1: training sentences vs. held-out sentences
train_sents, temp_sents = train_test_split(
    labeled_sentences,
    test_size=HOLDOUT_FRACTION,
    random_state=SPLIT_SEED,
)

# Stage 2: held-out sentences into validation and test
val_sents, test_sents = train_test_split(
    temp_sents,
    test_size=TEST_SHARE_OF_HOLDOUT,
    random_state=SPLIT_SEED,
)


In [39]:
# Combine Sentence Lists into DataFrames
# Each split is a list of per-sentence frames; stack each list into one token-level
# table and replace the original row labels with a fresh 0..n-1 index.
train_df, val_df, test_df = (
    pd.concat(split_sents).reset_index(drop=True)
    for split_sents in (train_sents, val_sents, test_sents)
)


In [40]:
# Copy 'label' into 'y' for consistency (the split frames keep both columns)
train_df['y'] = train_df['label']
val_df['y'] = val_df['label']
test_df['y'] = test_df['label']

# Feature frames must not carry the target. 'y' is only a copy of 'label', so both
# columns are dropped from X; dropping 'y' alone would leave the labels inside X.
target_columns = ['y', 'label']
X_train, y_train = train_df.drop(columns=target_columns), train_df['y']
X_val, y_val = val_df.drop(columns=target_columns), val_df['y']
X_test, y_test = test_df.drop(columns=target_columns), test_df['y']


In [41]:
# feature extraction
def _lowered_text(token_row):
    """Return the row's 'text' value as a lower-cased string."""
    # str() guards against non-string cells (e.g. NaN), which have no .lower()
    return str(token_row['text']).lower()


def word2features(row, idx, df):
    """Build the CRF feature dict for one token.

    Parameters
    ----------
    row : pandas.Series
        The token row; must provide 'text', 'lemma', 'upos', 'xpos' and 'deprel'.
    idx : int
        Positional index of ``row`` inside ``df``.
    df : pandas.DataFrame
        Frame the neighbouring tokens are read from with ``df.iloc[idx - 1]`` and
        ``df.iloc[idx + 1]``. Pass the frame of a single sequence together with
        the token's position in that sequence; otherwise the neighbour features
        come from adjacent sequences and 'BOS'/'EOS' are only set at the ends of
        the whole frame.

    Returns
    -------
    dict
        Features of the token itself, of the previous token (or 'BOS' when
        ``idx`` is 0) and of the next token (or 'EOS' when ``idx`` is the last
        position of ``df``).
    """
    features = {
        'bias': 1.0,
        'word.lower()': _lowered_text(row),
        'word.isdigit()': str(row['text']).isdigit(),
        'lemma': row['lemma'],
        'upos': row['upos'],
        'xpos': row['xpos'],
        'deprel': row['deprel'],
    }

    # Previous token features
    if idx > 0:
        prev_row = df.iloc[idx - 1]
        features.update({
            '-1:word.lower()': _lowered_text(prev_row),
            '-1:lemma': prev_row['lemma'],
            '-1:upos': prev_row['upos'],
        })
    else:
        features['BOS'] = True  # First position of df

    # Next token features
    if idx < len(df) - 1:
        next_row = df.iloc[idx + 1]
        features.update({
            '+1:word.lower()': _lowered_text(next_row),
            '+1:lemma': next_row['lemma'],
            '+1:upos': next_row['upos'],
        })
    else:
        features['EOS'] = True  # Last position of df

    return features



In [42]:
def dataframe_to_sequences(X_df, y_df, bos_label='S_B'):
    """Convert a token-level frame into per-sentence CRF sequences.

    A new sequence starts at row 0 and at every row whose label equals
    ``bos_label``, so each sequence is exactly one labelled sentence. Features
    are computed against the sentence's own rows, so the previous/next-token
    features never read from a neighbouring sentence and 'BOS'/'EOS' are set at
    both ends of every sentence.

    The earlier boundary test closed a sequence whenever the *next* token was
    '۔', which cut every sentence one token early and made the terminator the
    first element of the following sequence.

    NOTE(review): with one sequence per sentence, the 'S_B' token is always at
    position 0, the same position that gets the 'BOS' feature. Scores measured
    on these sequences therefore say little about boundary detection on
    unsegmented text - confirm this is the intended evaluation setup.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Token rows in order; must contain the columns read by ``word2features``.
    y_df : pandas.Series
        One label per row of ``X_df``, aligned by position.
    bos_label : str, default 'S_B'
        Label value that marks the first token of a sentence.

    Returns
    -------
    tuple[list[list[dict]], list[list]]
        ``(sequences_X, sequences_y)``: feature dicts and labels grouped by
        sentence. Both lists are empty for an empty input.

    Raises
    ------
    ValueError
        If ``X_df`` and ``y_df`` have different lengths.
    """
    n_rows = len(X_df)
    if len(y_df) != n_rows:
        raise ValueError(
            f"X_df has {n_rows} rows but y_df has {len(y_df)} labels"
        )

    labels = list(y_df)
    sequences_X, sequences_y = [], []
    if n_rows == 0:
        return sequences_X, sequences_y

    # Positional start of every sentence; row 0 always opens a sequence even if
    # its label is not bos_label (e.g. a truncated frame).
    starts = [0] + [pos for pos in range(1, n_rows) if labels[pos] == bos_label]
    stops = starts[1:] + [n_rows]

    for start, stop in zip(starts, stops):
        sentence_df = X_df.iloc[start:stop]
        # Use sentence-local positions so context features stay inside the sentence
        sequences_X.append([
            word2features(sentence_df.iloc[offset], offset, sentence_df)
            for offset in range(stop - start)
        ])
        sequences_y.append(labels[start:stop])

    return sequences_X, sequences_y


# Prepare sequences for train/val/test
# Every split is passed as (frame without the 'y' column, its 'y' labels).
X_train_seq, y_train_seq = dataframe_to_sequences(
    train_df.drop(columns=['y']), train_df['y']
)
X_val_seq, y_val_seq = dataframe_to_sequences(
    val_df.drop(columns=['y']), val_df['y']
)
X_test_seq, y_test_seq = dataframe_to_sequences(
    test_df.drop(columns=['y']), test_df['y']
)

In [43]:
# Fit the CRF on the training sequences: 'lbfgs' algorithm, at most 100 iterations
crf_settings = {'algorithm': 'lbfgs', 'max_iterations': 100}
crf = CRF(**crf_settings)
crf.fit(X_train_seq, y_train_seq)


In [44]:
# Evaluation
# Predict each held-out split, then print its token-level classification report.
y_pred_val = crf.predict(X_val_seq)
val_report = flat_classification_report(y_val_seq, y_pred_val)
print("Validation Results:\n", val_report)

y_pred_test = crf.predict(X_test_seq)
test_report = flat_classification_report(y_test_seq, y_pred_test)
print("Test Results:\n", test_report)

Validation Results:
               precision    recall  f1-score   support

         S_B       1.00      1.00      1.00      1440
         S_M       1.00      1.00      1.00     39517

    accuracy                           1.00     40957
   macro avg       1.00      1.00      1.00     40957
weighted avg       1.00      1.00      1.00     40957

Test Results:
               precision    recall  f1-score   support

         S_B       1.00      1.00      1.00      1834
         S_M       1.00      1.00      1.00     49007

    accuracy                           1.00     50841
   macro avg       1.00      1.00      1.00     50841
weighted avg       1.00      1.00      1.00     50841

