### Approach 3 (without Punctuation) 

In [1]:
import pandas as pd

In [2]:
# Location of the token-level annotation table (one row per token).
DATASET_PATH = 'dataset.csv'

# Load the table into a DataFrame.
df = pd.read_csv(DATASET_PATH)

# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,id,text,lemma,upos,xpos,head,deprel,start_char,end_char
0,1,اس,یہ,DET,DEM,2,det,0,2
1,2,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8
2,3,کی,کا,ADP,PSP,2,case,9,11
3,4,دیگر,دیگر,ADJ,JJ,5,amod,12,16
4,5,اقساط,اقساط,NOUN,NN,7,nsubj,17,22


In [3]:
# Build the working table without the 'id' column; `df` itself is left unchanged
# because DataFrame.drop returns a new frame.
data = df.drop('id', axis=1)

In [4]:
import string
import re


In [5]:
# Look at the first 30 tokens of the working table.
data.head(n=30)

Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char
0,اس,یہ,DET,DEM,2,det,0,2
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8
2,کی,کا,ADP,PSP,2,case,9,11
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22
5,یہاں,یہاں,PRON,PRP,7,obl,23,27
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33
7,۔,۔,PUNCT,SYM,7,punct,33,34
8,یہ,یہ,PRON,PRP,3,nsubj,36,38
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43


In [7]:
# Build the target column 'y': 1 ('S_B', sentence beginning) for a token that
# directly follows a token ending in the Urdu full stop '۔', and 0 ('S_M')
# for every other token.
label_mapping = {'S_B': 1, 'S_M': 0}

# True where the token text ends with a full stop. `na=False` makes a missing
# token count as "does not end a sentence" instead of raising, which the
# previous per-row `.endswith` call did on NaN values.
ends_with_full_stop = data['text'].str.endswith('۔', na=False)

# Shift the flags down by one row so that the token AFTER a full stop is the
# one that gets marked. The shift is positional, so it does not depend on the
# index labels being 0..n-1. The first row has no predecessor and is filled
# with False.
# NOTE(review): as before, the very first token of the table is therefore not
# labelled 'S_B', and only '۔' is treated as a sentence terminator - confirm
# that both are intended.
starts_sentence = ends_with_full_stop.shift(1, fill_value=False)

# Go through the string labels and a categorical dtype before mapping to
# integers so the resulting column has the same dtype and values as before.
data['y'] = (
    starts_sentence
    .map({True: 'S_B', False: 'S_M'})
    .astype('category')
    .map(label_mapping)
)

# Verify the result
data.head(10)

Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,0
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,0
2,کی,کا,ADP,PSP,2,case,9,11,0
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,0
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,0
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,0
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,0
7,۔,۔,PUNCT,SYM,7,punct,33,34,0
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,1
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,0


In [None]:
# Flag tokens whose text consists solely of characters that are neither word
# characters nor whitespace, i.e. punctuation-only tokens. Missing text is
# treated as "not punctuation" (na=False), so such rows are kept.
is_punctuation_only = data['text'].str.contains(r'^[^\w\s]+$', na=False)

# Keep every other row. The index labels are not renumbered here, so gaps
# remain where rows were removed.
data = data.loc[~is_punctuation_only]

# Check the filtered table.
data.head(10)

Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,0
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,0
2,کی,کا,ADP,PSP,2,case,9,11,0
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,0
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,0
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,0
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,0
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,1
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,0
10,ممکن,ممکن,ADJ,JJ,0,root,44,48,0


In [13]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [14]:
# Categorical tag columns that are expanded into indicator (0/1) columns.
categorical_columns = ['upos', 'xpos', 'deprel']

# Fit the encoder on the tag columns; sparse_output=False asks for a dense
# array rather than a sparse matrix.
encoder = OneHotEncoder(sparse_output=False)
encoded_cats = encoder.fit_transform(data[categorical_columns])

# Wrap the encoded array in a DataFrame whose column names come from the
# encoder (e.g. 'upos_ADJ').
encoded_cats_df = pd.DataFrame(
    encoded_cats,
    columns=encoder.get_feature_names_out(),
)

# Renumber the rows of `data` from 0 so they line up with the freshly built
# indicator frame, attach the indicators side by side, then remove the
# original tag columns.
data = pd.concat(
    [data.reset_index(drop=True), encoded_cats_df],
    axis=1,
).drop(columns=categorical_columns)

In [15]:
# Numeric columns to rescale.
numerical_features = ['start_char', 'end_char', 'head']

# Standardise each listed column to zero mean and unit variance. The scaler is
# fitted on the whole table and then applied to the same columns in place.
standard_scaler = StandardScaler().fit(data[numerical_features])
data[numerical_features] = standard_scaler.transform(data[numerical_features])

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build one short "document" per row: the token text followed by its lemma,
# separated by a single space.
data['text_lemma'] = data['text'] + " " + data['lemma']

# TF-IDF vectoriser limited to at most 500 vocabulary terms (tune as needed).
tfidf_vectorizer = TfidfVectorizer(max_features=500)

# Learn the vocabulary and weights from the combined column and vectorise it.
tfidf_features = tfidf_vectorizer.fit_transform(data['text_lemma'])

# Densify the sparse result into a DataFrame with one column per vocabulary term.
tfidf_df = pd.DataFrame(
    tfidf_features.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Attach the TF-IDF columns next to the existing features (rows renumbered
# from 0 so both frames align), then remove the raw string columns.
data = pd.concat(
    [data.reset_index(drop=True), tfidf_df],
    axis=1,
).drop(columns=['text', 'lemma', 'text_lemma'])

In [17]:
# Inspect the first rows of the assembled feature table.
data.head(n=10)

Unnamed: 0,head,start_char,end_char,y,upos_ADJ,upos_ADP,upos_ADV,upos_AUX,upos_CCONJ,upos_DET,...,ہوں,ہی,ہیں,ہے,یا,یعنی,یقینا,یہ,یہاں,یہی
0,-1.038908,-1.730812,-1.730817,0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.646062,0.0,0.0
1,-0.843632,-1.730803,-1.730799,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.038908,-1.730785,-1.730789,0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.843632,-1.730775,-1.730774,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.713447,-1.73076,-1.730756,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,-0.713447,-1.730742,-1.73074,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,-1.169092,-1.730726,-1.730722,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-0.973816,-1.730702,-1.730707,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,-0.973816,-1.730692,-1.730691,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,-1.169092,-1.730677,-1.730676,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
