In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report

In [2]:
# Read the CSV file
data = pd.read_csv('dataset.csv')

data.head()

Unnamed: 0,id,text,lemma,upos,xpos,head,deprel,start_char,end_char
0,1,اس,یہ,DET,DEM,2,det,0,2
1,2,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8
2,3,کی,کا,ADP,PSP,2,case,9,11
3,4,دیگر,دیگر,ADJ,JJ,5,amod,12,16
4,5,اقساط,اقساط,NOUN,NN,7,nsubj,17,22


In [3]:
data = data.drop(columns=['id'])

In [4]:
import string
import re

In [5]:
# Initialize a new column 'y' with the default value 'S_M'
data['y'] = 'S_M'

# Iterate through the rows to assign 'S_B'
for i in range(len(data) - 1):
    # Check if the current word ends with a full stop
    if data.loc[i, 'text'].endswith('۔'):
        # Assign 'S_B' to the next word
        if i + 1 < len(data):
            data.loc[i + 1, 'y'] = 'S_B'  # Sentence Beginning

# Convert 'y' column to categorical type (optional, for ML efficiency)
data['y'] = data['y'].astype('category')

# Map categorical labels to numeric values
label_mapping = {'S_B': 1, 'S_M': 0}
data['y'] = data['y'].map(label_mapping)

# Verify the result
data.head(10)

Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,0
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,0
2,کی,کا,ADP,PSP,2,case,9,11,0
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,0
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,0
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,0
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,0
7,۔,۔,PUNCT,SYM,7,punct,33,34,0
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,1
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,0


In [6]:
# Drop rows where the 'text' column contains only punctuation
data = data[~data['text'].str.contains(r'^[^\w\s]+$', na=False)]

# Verify the result
data.head(10)

Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,0
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,0
2,کی,کا,ADP,PSP,2,case,9,11,0
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,0
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,0
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,0
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,0
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,1
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,0
10,ممکن,ممکن,ADJ,JJ,0,root,44,48,0


In [7]:
data = data.reset_index(drop=True)

In [8]:
# Split the data before feature extraction
X_train, X_test, y_train, y_test = train_test_split(data, data['y'], test_size=0.2, random_state=42)


In [9]:
# Function to extract features for CRF
def extract_features(row):
    return {
        "word.lower()": row["text"].lower(),
        "word.isupper()": row["text"].isupper(),
        "word.istitle()": row["text"].istitle(),
        "upos": row["upos"],
        "xpos": row["xpos"],
    }


In [10]:
# Prepare data for CRF
sentences = []
labels = []
current_sentence = []
current_labels = []
for _, row in data.iterrows():
    current_sentence.append(extract_features(row))
    current_labels.append(row["y"])
    # Assume end of sentence based on punctuation
    if row["y"] == 1:
        sentences.append(current_sentence)
        labels.append(current_labels)
        current_sentence = []
        current_labels = []

# Add any remaining tokens
if current_sentence:
    sentences.append(current_sentence)
    labels.append(current_labels)

In [11]:
# Extract features for training and test sets
X_train_features, y_train_features = prepare_data(X_train)
X_test_features, y_test_features = prepare_data(X_test)


NameError: name 'prepare_data' is not defined

In [17]:
# Convert labels to strings
y_train_features = [[str(label) for label in seq] for seq in y_train_features]
y_test_features = [[str(label) for label in seq] for seq in y_test_features]

In [18]:
# Train CRF model
crf = CRF(algorithm="lbfgs", c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True)
crf.fit(X_train_features, y_train_features)

In [21]:
# Predict on the test set
y_pred = crf.predict(X_test_features)

# Evaluate the model
print(flat_classification_report(y_test_features, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     47258
           1       1.00      1.00      1.00      1803

    accuracy                           1.00     49061
   macro avg       1.00      1.00      1.00     49061
weighted avg       1.00      1.00      1.00     49061



In [22]:
from collections import Counter

# Get feature weights
weights = Counter(crf.transition_features_)
print("Top positive transitions:")
print(weights.most_common(10))

print("\nTop negative transitions:")
print(weights.most_common()[-10:])


Top positive transitions:
[(('0', '1'), 10.18674), (('0', '0'), 5.411563), (('1', '1'), -2.983821), (('1', '0'), -12.578055)]

Top negative transitions:
[(('0', '1'), 10.18674), (('0', '0'), 5.411563), (('1', '1'), -2.983821), (('1', '0'), -12.578055)]
