In [36]:
import pandas as pd

In [37]:
# Read the CSV file
df = pd.read_csv('dataset.csv')

df.head()

Unnamed: 0,id,text,lemma,upos,xpos,head,deprel,start_char,end_char
0,1,اس,یہ,DET,DEM,2,det,0,2
1,2,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8
2,3,کی,کا,ADP,PSP,2,case,9,11
3,4,دیگر,دیگر,ADJ,JJ,5,amod,12,16
4,5,اقساط,اقساط,NOUN,NN,7,nsubj,17,22


In [38]:
data = df.drop(columns=['id'])

In [39]:
# Initialize a new column 'y' with the default value 'S_M'
data['y'] = 'S_M'

# Iterate through the rows to assign 'S_B'
for i in range(len(data) - 1):
    # Check if the current word ends with a full stop
    if data.loc[i, 'text'].endswith('۔'):
        # Assign 'S_B' to the next word
        if i + 1 < len(data):
            data.loc[i + 1, 'y'] = 'S_B'  # Sentence Beginning

# Convert 'y' column to categorical type (optional, for ML efficiency)
data['y'] = data['y'].astype('category')

# Map categorical labels to numeric values
label_mapping = {'S_B': 1, 'S_M': 0}
data['y'] = data['y'].map(label_mapping)

# Verify the result
data.head(10)

Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,0
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,0
2,کی,کا,ADP,PSP,2,case,9,11,0
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,0
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,0
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,0
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,0
7,۔,۔,PUNCT,SYM,7,punct,33,34,0
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,1
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,0


In [40]:
# Drop rows where the 'text' column contains only punctuation
data = data[~data['text'].str.contains(r'^[^\w\s]+$', na=False)]

# Verify the result
data.head(10)

Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,0
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,0
2,کی,کا,ADP,PSP,2,case,9,11,0
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,0
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,0
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,0
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,0
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,1
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,0
10,ممکن,ممکن,ADJ,JJ,0,root,44,48,0


In [41]:
# Previous word features
data['prev_text'] = data['text'].shift(1)
data['prev_lemma'] = data['lemma'].shift(1)
data['prev_upos'] = data['upos'].shift(1)
data['prev_xpos'] = data['xpos'].shift(1)
data['prev_head'] = data['head'].shift(1)
data['prev_deprel'] = data['deprel'].shift(1)
data['prev_start_char'] = data['start_char'].shift(1)
data['prev_end_char'] = data['end_char'].shift(1)

# Next word features
data['next_text'] = data['text'].shift(-1)
data['next_lemma'] = data['lemma'].shift(-1)
data['next_upos'] = data['upos'].shift(-1)
data['next_xpos'] = data['xpos'].shift(-1)
data['next_head'] = data['head'].shift(-1)
data['next_deprel'] = data['deprel'].shift(-1)
data['next_start_char'] = data['start_char'].shift(-1)
data['next_end_char'] = data['end_char'].shift(-1)

# Fill NaN for first and last row (because there's no previous or next word)
data.fillna(method='bfill', axis=0, inplace=True)  # Backfill for next words
data.fillna(method='ffill', axis=0, inplace=True)  # Forward fill for previous words

# Display the updated dataframe
data.head()


Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y,prev_text,...,prev_start_char,prev_end_char,next_text,next_lemma,next_upos,next_xpos,next_head,next_deprel,next_start_char,next_end_char
0,اس,یہ,DET,DEM,2,det,0,2,0,اس,...,0.0,2.0,سلسلے,سلسلہ,NOUN,NN,5.0,nmod,3.0,8.0
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,0,اس,...,0.0,2.0,کی,کا,ADP,PSP,2.0,case,9.0,11.0
2,کی,کا,ADP,PSP,2,case,9,11,0,سلسلے,...,3.0,8.0,دیگر,دیگر,ADJ,JJ,5.0,amod,12.0,16.0
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,0,کی,...,9.0,11.0,اقساط,اقساط,NOUN,NN,7.0,nsubj,17.0,22.0
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,0,دیگر,...,12.0,16.0,یہاں,یہاں,PRON,PRP,7.0,obl,23.0,27.0


In [42]:
# Get the column names of the DataFrame
column_names = data.columns.tolist()

# Display the column names
print(column_names)

['text', 'lemma', 'upos', 'xpos', 'head', 'deprel', 'start_char', 'end_char', 'y', 'prev_text', 'prev_lemma', 'prev_upos', 'prev_xpos', 'prev_head', 'prev_deprel', 'prev_start_char', 'prev_end_char', 'next_text', 'next_lemma', 'next_upos', 'next_xpos', 'next_head', 'next_deprel', 'next_start_char', 'next_end_char']


In [43]:
data.isnull().sum()

text               0
lemma              0
upos               0
xpos               0
head               0
deprel             0
start_char         0
end_char           0
y                  0
prev_text          0
prev_lemma         0
prev_upos          0
prev_xpos          0
prev_head          0
prev_deprel        0
prev_start_char    0
prev_end_char      0
next_text          0
next_lemma         0
next_upos          0
next_xpos          0
next_head          0
next_deprel        0
next_start_char    0
next_end_char      0
dtype: int64

In [44]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [45]:
# One-hot encode 'upos', 'xpos', and 'deprel' (including for next and previous words)
encoder = OneHotEncoder(sparse_output=False)


# Including next and previous word features in the encoding
encoded_cats = encoder.fit_transform(data[['prev_upos', 'prev_xpos', 'prev_deprel', 
                                          'upos', 'xpos', 'deprel', 
                                          'next_upos', 'next_xpos', 'next_deprel']])

# Convert to DataFrame for easier merging
encoded_cats_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out())

# Concatenate encoded features back to the dataset
data = pd.concat([data.reset_index(drop=True), encoded_cats_df], axis=1)

# Drop the original categorical columns (optional)
data = data.drop(columns=['prev_upos', 'prev_xpos', 'prev_deprel', 
                          'upos', 'xpos', 'deprel', 
                          'next_upos', 'next_xpos', 'next_deprel'])

In [46]:
from sklearn.preprocessing import MinMaxScaler
# Select the numerical features to normalize
numerical_features = ['start_char', 'end_char', 'head', 
                      'prev_start_char', 'prev_end_char', 
                      'next_start_char', 'next_end_char']

# Min-Max Scaling (scales features to a range, typically 0 to 1)
min_max_scaler = MinMaxScaler()
data[numerical_features] = min_max_scaler.fit_transform(data[numerical_features])

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Combine text, lemma, and new word-related features (previous and next) into a single string representation
data['text_lemma_prev_next'] = (data['prev_text'] + " " + data['prev_lemma'] + " " +
                                data['text'] + " " + data['lemma'] + " " +
                                data['next_text'] + " " + data['next_lemma'])

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=500)  # Adjust max_features as needed

# Fit and transform the combined text and lemma along with previous and next word information
tfidf_features = tfidf_vectorizer.fit_transform(data['text_lemma_prev_next'])

# Convert the sparse matrix to a DataFrame for better integration
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Add the TF-IDF features back to the original DataFrame
data = pd.concat([data.reset_index(drop=True), tfidf_df], axis=1)

# Drop the original text, lemma, and combined text_lemma_prev_next columns (optional)
data = data.drop(columns=['text', 'lemma', 'text_lemma_prev_next','prev_text','prev_lemma','next_text','next_lemma'])


In [48]:
# Get the column names of the DataFrame
column_names = data.columns.tolist()

# Display the column names
print(column_names)

['head', 'start_char', 'end_char', 'y', 'prev_head', 'prev_start_char', 'prev_end_char', 'next_head', 'next_start_char', 'next_end_char', 'prev_upos_ADJ', 'prev_upos_ADP', 'prev_upos_ADV', 'prev_upos_AUX', 'prev_upos_CCONJ', 'prev_upos_DET', 'prev_upos_INTJ', 'prev_upos_NOUN', 'prev_upos_NUM', 'prev_upos_PART', 'prev_upos_PRON', 'prev_upos_PROPN', 'prev_upos_PUNCT', 'prev_upos_SCONJ', 'prev_upos_VERB', 'prev_upos_X', 'prev_xpos_CC', 'prev_xpos_CCC', 'prev_xpos_DEM', 'prev_xpos_ECH', 'prev_xpos_INJ', 'prev_xpos_INTF', 'prev_xpos_JJ', 'prev_xpos_JJC', 'prev_xpos_JJZ', 'prev_xpos_NEG', 'prev_xpos_NN', 'prev_xpos_NNC', 'prev_xpos_NNP', 'prev_xpos_NNPC', 'prev_xpos_NNZ', 'prev_xpos_NST', 'prev_xpos_PRP', 'prev_xpos_PRPC', 'prev_xpos_PSP', 'prev_xpos_QC', 'prev_xpos_QCC', 'prev_xpos_QF', 'prev_xpos_QO', 'prev_xpos_RB', 'prev_xpos_RBC', 'prev_xpos_RDP', 'prev_xpos_RP', 'prev_xpos_SYM', 'prev_xpos_UNK', 'prev_xpos_VAUX', 'prev_xpos_VM', 'prev_xpos_WQ', 'prev_deprel_acl', 'prev_deprel_acl:relcl

In [49]:
# Define the feature matrix (drop 'y') and target
X = data.drop(columns=['y'])
y = data['y']

In [50]:
# Split into training (64%), validation (16%), and test (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.36, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.56, random_state=42, stratify=y_temp)


In [51]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
from sklearn import tree

# Initialize the Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42, max_depth=10)

# Fit the model on the training set
dt_model.fit(X_train, y_train)

# Predict on the validation and test sets
y_val_pred = dt_model.predict(X_val)
y_test_pred = dt_model.predict(X_test)

# Evaluate the model
print("Decision Tree - Validation Set:")
print(classification_report(y_val, y_val_pred))

print("Decision Tree - Test Set:")
print(classification_report(y_test, y_test_pred))

Decision Tree - Validation Set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     36713
           1       0.94      0.93      0.93      1427

    accuracy                           1.00     38140
   macro avg       0.97      0.96      0.97     38140
weighted avg       1.00      1.00      1.00     38140

Decision Tree - Test Set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     46727
           1       0.92      0.94      0.93      1817

    accuracy                           0.99     48544
   macro avg       0.96      0.97      0.96     48544
weighted avg       0.99      0.99      0.99     48544

